diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..afc0d1e2 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-07-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2404.01399v3","updated":"2024-07-01T17:40:13Z","published":"2024-04-01T18:10:05Z","title":"Safe and Responsible Large Language Model : Can We Balance Bias\n Reduction and Language Understanding in Large Language Models?","summary":" Large Language Models (LLMs) have significantly advanced various NLP tasks.\nHowever, these models often risk generating unsafe text that perpetuates\nbiases. Current approaches to produce unbiased outputs from LLMs can reduce\nbiases but at the expense of knowledge retention. In this research, we address\nthe question of whether producing safe (unbiased) outputs through LLMs can\nretain knowledge and language understanding. In response, we developed the\nSafety and Responsible Large Language Model (\\textbf{SR}$_{\\text{LLM}}$), an\nLLM that has been instruction fine-tuned on top of already safe LLMs (e.g.,\nLlama2 or related) to diminish biases in generated text. To achieve our goals,\nwe compiled a specialized dataset designed to train our model in identifying\nand correcting biased text. We conduct experiments, both on this custom data\nand out-of-distribution test sets, to show the bias reduction and knowledge\nretention. The results confirm that \\textbf{SR}$_{\\text{LLM}}$ outperforms\ntraditional fine-tuning and prompting methods in both reducing biases and\npreserving the integrity of language knowledge. The significance of our\nfindings lies in demonstrating that instruction fine-tuning can provide a more\nrobust solution for bias reduction in LLMs. We have made our code and data\navailable at\n\\href{https://github.com/shainarazavi/Safe-Responsible-LLM}{Safe-LLM}.\n","authors":["Shaina Raza","Oluwanifemi Bamgbose","Shardul Ghuge","Fatemeh Tavakol","Deepak John Reji","Syed Raza Bashir"],"pdf_url":"https://arxiv.org/pdf/2404.01399v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17055v2","updated":"2024-07-01T17:29:54Z","published":"2024-06-24T18:15:27Z","title":"Large Language Models Assume People are More Rational than We Really are","summary":" In order for AI systems to communicate effectively with people, they must\nunderstand how we make decisions. However, people's decisions are not always\nrational, so the implicit internal models of human decision-making in Large\nLanguage Models (LLMs) must account for this. Previous empirical evidence seems\nto suggest that these implicit models are accurate -- LLMs offer believable\nproxies of human behavior, acting how we expect humans would in everyday\ninteractions. However, by comparing LLM behavior and predictions to a large\ndataset of human decisions, we find that this is actually not the case: when\nboth simulating and predicting people's choices, a suite of cutting-edge LLMs\n(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more\nrational than we really are. Specifically, these models deviate from human\nbehavior and align more closely with a classic model of rational choice --\nexpected value theory. Interestingly, people also tend to assume that other\npeople are rational when interpreting their behavior. 
As a consequence, when we\ncompare the inferences that LLMs and people draw from the decisions of others\nusing another psychological dataset, we find that these inferences are highly\ncorrelated. Thus, the implicit decision-making models of LLMs appear to be\naligned with the human expectation that other people will act rationally,\nrather than with how people actually act.\n","authors":["Ryan Liu","Jiayi Geng","Joshua C. Peterson","Ilia Sucholutsky","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.17055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08390v2","updated":"2024-07-01T16:57:56Z","published":"2023-11-14T18:51:38Z","title":"Predicting Text Preference Via Structured Comparative Reasoning","summary":" Comparative reasoning plays a crucial role in text preference prediction;\nhowever, large language models (LLMs) often demonstrate inconsistencies in\ntheir reasoning. While approaches like Chain-of-Thought improve accuracy in\nmany other settings, they struggle to consistently distinguish the similarities\nand differences of complex texts. We introduce SC, a prompting approach that\npredicts text preferences by generating structured intermediate comparisons. SC\nbegins by proposing aspects of comparison, followed by generating textual\ncomparisons under each aspect. We select consistent comparisons with a pairwise\nconsistency comparator that ensures each aspect's comparisons clearly\ndistinguish differences between texts, significantly reducing hallucination and\nimproving consistency. Our comprehensive evaluations across various NLP tasks,\nincluding summarization, retrieval, and automatic rating, demonstrate that SC\nequips LLMs to achieve state-of-the-art performance in text preference\nprediction.\n","authors":["Jing Nathan Yan","Tianqi Liu","Justin T Chiu","Jiaming Shen","Zhen Qin","Yue Yu","Yao Zhao","Charu Lakshmanan","Yair Kurzion","Alexander M. Rush","Jialu Liu","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2311.08390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05196v3","updated":"2024-07-01T16:36:30Z","published":"2023-09-11T02:16:47Z","title":"Does Writing with Language Models Reduce Content Diversity?","summary":" Large language models (LLMs) have led to a surge in collaborative writing\nwith model assistance. As different users incorporate suggestions from the same\nmodel, there is a risk of decreased diversity in the produced content,\npotentially limiting diverse perspectives in public discourse. In this work, we\nmeasure the impact of co-writing on diversity via a controlled experiment,\nwhere users write argumentative essays in three setups -- using a base LLM\n(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We\ndevelop a set of diversity metrics and find that writing with InstructGPT (but\nnot the GPT3) results in a statistically significant reduction in diversity.\nSpecifically, it increases the similarity between the writings of different\nauthors and reduces the overall lexical and content diversity. We additionally\nfind that this effect is mainly attributable to InstructGPT contributing less\ndiverse text to co-written essays. In contrast, the user-contributed text\nremains unaffected by model collaboration. 
This suggests that the recent\nimprovement in generation quality from adapting models to human feedback might\ncome at the cost of more homogeneous and less diverse content.\n","authors":["Vishakh Padmakumar","He He"],"pdf_url":"https://arxiv.org/pdf/2309.05196v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2406.05615v2","updated":"2024-07-01T16:05:01Z","published":"2024-06-09T02:36:28Z","title":"Video-Language Understanding: A Survey from Model Architecture, Model\n Training, and Data Perspectives","summary":" Humans use multiple senses to comprehend the environment. Vision and language\nare two of the most vital senses since they allow us to easily communicate our\nthoughts and perceive the world around us. There has been a lot of interest in\ncreating video-language understanding systems with human-like senses since a\nvideo-language pair can mimic both our linguistic medium and visual environment\nwith temporal dynamics. In this survey, we review the key tasks of these\nsystems and highlight the associated challenges. Based on the challenges, we\nsummarize their methods from model architecture, model training, and data\nperspectives. We also conduct performance comparison among the methods, and\ndiscuss promising directions for future research.\n","authors":["Thong Nguyen","Yi Bin","Junbin Xiao","Leigang Qu","Yicong Li","Jay Zhangjie Wu","Cong-Duy Nguyen","See-Kiong Ng","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2406.05615v2.pdf","comment":"Accepted at ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2404.18510v2","updated":"2024-07-01T15:58:11Z","published":"2024-04-29T08:52:52Z","title":"Explainability of machine learning approaches in forensic linguistics: a\n case study in geolinguistic authorship profiling","summary":" Forensic authorship profiling uses linguistic markers to infer\ncharacteristics about an author of a text. This task is paralleled in dialect\nclassification, where a prediction is made about the linguistic variety of a\ntext based on the text itself. While there have been significant advances in\nrecent years in variety classification, forensic linguistics rarely relies on\nthese approaches due to their lack of transparency, among other reasons. In\nthis paper we therefore explore the explainability of machine learning\napproaches considering the forensic context. We focus on variety classification\nas a means of geolinguistic profiling of unknown texts based on social media\ndata from the German-speaking area. For this, we identify the lexical items\nthat are the most impactful for the variety classification. We find that the\nextracted lexical features are indeed representative of their respective\nvarieties and note that the trained models also rely on place names for\nclassifications.\n","authors":["Dana Roemling","Yves Scherrer","Aleksandra Miletic"],"pdf_url":"https://arxiv.org/pdf/2404.18510v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12713v2","updated":"2024-07-01T15:42:15Z","published":"2024-02-20T04:26:08Z","title":"Are LLMs Rational Investors? A Study on Detecting and Reducing the\n Financial Bias in LLMs","summary":" Large Language Models (LLMs) are increasingly adopted in financial analysis\nfor interpreting complex market data and trends. However, their use is\nchallenged by intrinsic biases (e.g., risk-preference bias) and a superficial\nunderstanding of market intricacies, necessitating a thorough assessment of\ntheir financial insight. 
To address these issues, we introduce Financial Bias\nIndicators (FBI), a framework with components like Bias Unveiler, Bias\nDetective, Bias Tracker, and Bias Antidote to identify, detect, analyze, and\neliminate irrational biases in LLMs. By combining behavioral finance principles\nwith bias examination, we evaluate 23 leading LLMs and propose a de-biasing\nmethod based on financial causal knowledge. Results show varying degrees of\nfinancial irrationality among models, influenced by their design and training.\nModels trained specifically on financial datasets may exhibit more\nirrationality, and even larger financial language models (FinLLMs) can show\nmore bias than smaller, general models. We utilize four prompt-based methods\nincorporating causal debiasing, effectively reducing financial biases in these\nmodels. This work enhances the understanding of LLMs' bias in financial\napplications, laying the foundation for developing more reliable and rational\nfinancial analysis tools.\n","authors":["Yuhang Zhou","Yuchen Ni","Yunhui Gan","Zhangyue Yin","Xiang Liu","Jian Zhang","Sen Liu","Xipeng Qiu","Guangnan Ye","Hongfeng Chai"],"pdf_url":"https://arxiv.org/pdf/2402.12713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15399v2","updated":"2024-07-01T15:33:51Z","published":"2023-08-29T15:57:32Z","title":"Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through\n the Lens of Moral Theories?","summary":" Making moral judgments is an essential step toward developing ethical AI\nsystems. Prevalent approaches are mostly implemented in a bottom-up manner,\nwhich uses a large set of annotated data to train models based on crowd-sourced\nopinions about morality. These approaches have been criticized for\novergeneralizing the moral stances of a limited group of annotators and lacking\nexplainability. This work proposes a flexible top-down framework to steer\n(Large) Language Models (LMs) to perform moral reasoning with well-established\nmoral theories from interdisciplinary research. The theory-guided top-down\nframework can incorporate various moral theories. Our experiments demonstrate\nthe effectiveness of the proposed framework on datasets derived from moral\ntheories. Furthermore, we show the alignment between different moral theories\nand existing morality datasets. Our analysis exhibits the potential and flaws\nin existing resources (models and datasets) in developing explainable moral\njudgment-making systems.\n","authors":["Jingyan Zhou","Minda Hu","Junan Li","Xiaoying Zhang","Xixin Wu","Irwin King","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.15399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09100v2","updated":"2024-07-01T15:29:45Z","published":"2023-03-16T06:09:15Z","title":"Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models","summary":" For downstream applications of vision-language pre-trained models, there has\nbeen significant interest in constructing effective prompts. Existing works on\nprompt engineering, which either require laborious manual designs or optimize\nthe prompt tuning as a point estimation problem, may fail to describe diverse\ncharacteristics of categories and limit their applications. We introduce a\nBayesian probabilistic resolution to prompt tuning, where the label-specific\nstochastic prompts are generated hierarchically by first sampling a latent\nvector from an underlying distribution and then employing a lightweight\ngenerative model. 
Importantly, we semantically regularize the tuning process by\nminimizing the statistical distance between the visual patches and linguistic\nprompts, which pushes the stochastic label representations to faithfully\ncapture diverse visual concepts, instead of overfitting the training\ncategories. We evaluate the effectiveness of our approach on four tasks:\nfew-shot image recognition, base-to-new generalization, dataset transfer\nlearning, and domain shifts. Extensive results over 15 datasets show promising\ntransferability and generalization performance of our proposed model, both\nquantitatively and qualitatively.\n","authors":["Xinyang Liu","Dongsheng Wang","Bowei Fang","Miaoge Li","Zhibin Duan","Yishi Xu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.09100v2.pdf","comment":"Accepted by UAI 2024"},{"id":"http://arxiv.org/abs/2406.17716v2","updated":"2024-07-01T15:19:51Z","published":"2024-06-25T16:58:19Z","title":"ViANLI: Adversarial Natural Language Inference for Vietnamese","summary":" The development of Natural Language Processing (NLI) datasets and models has\nbeen inspired by innovations in annotation design. With the rapid development\nof machine learning models today, the performance of existing machine learning\nmodels has quickly reached state-of-the-art results on a variety of tasks\nrelated to natural language processing, including natural language inference\ntasks. By using a pre-trained model during the annotation process, it is\npossible to challenge current NLI models by having humans produce\npremise-hypothesis combinations that the machine model cannot correctly\npredict. To remain attractive and challenging in the research of natural\nlanguage inference for Vietnamese, in this paper, we introduce the adversarial\nNLI dataset to the NLP research community with the name ViANLI. This data set\ncontains more than 10K premise-hypothesis pairs and is built by a continuously\nadjusting process to obtain the most out of the patterns generated by the\nannotators. ViANLI dataset has brought many difficulties to many current SOTA\nmodels when the accuracy of the most powerful model on the test set only\nreached 48.4%. Additionally, the experimental results show that the models\ntrained on our dataset have significantly improved the results on other\nVietnamese NLI datasets.\n","authors":["Tin Van Huynh","Kiet Van Nguyen","Ngan Luu-Thuy Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.17716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13261v2","updated":"2024-07-01T15:18:07Z","published":"2024-06-19T06:46:59Z","title":"BeHonest: Benchmarking Honesty of Large Language Models","summary":" Previous works on Large Language Models (LLMs) have mainly focused on\nevaluating their helpfulness or harmlessness. However, honesty, another crucial\nalignment criterion, has received relatively less attention. Dishonest\nbehaviors in LLMs, such as spreading misinformation and defrauding users,\neroding user trust, and causing real-world harm, present severe risks that\nintensify as these models approach superintelligence levels. Enhancing honesty\nin LLMs addresses critical deficiencies and helps uncover latent capabilities\nthat are not readily expressed. This underscores the urgent need for reliable\nmethods and benchmarks to effectively ensure and evaluate the honesty of LLMs.\n In this paper, we introduce BeHonest, a pioneering benchmark specifically\ndesigned to assess honesty in LLMs comprehensively. 
BeHonest evaluates three\nessential aspects of honesty: awareness of knowledge boundaries, avoidance of\ndeceit, and consistency in responses. Building on this foundation, we designed\n10 scenarios to evaluate and analyze 9 popular LLMs on the market, including\nboth closed-source and open-source models from different model families with\nvaried model sizes. Our findings indicate that there is still significant room\nfor improvement in the honesty of LLMs. We also encourage the AI community to\nprioritize honesty alignment in LLMs. Our benchmark and code can be found at:\n\\url{https://github.com/GAIR-NLP/BeHonest}.\n","authors":["Steffi Chern","Zhulin Hu","Yuqing Yang","Ethan Chern","Yuan Guo","Jiahe Jin","Binjie Wang","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.13261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12070v2","updated":"2024-07-01T15:17:10Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v2.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2311.01862v2","updated":"2024-07-01T14:59:58Z","published":"2023-11-03T12:11:12Z","title":"$R^3$-NL2GQL: A Model Coordination and Knowledge Graph Alignment\n Approach for NL2GQL","summary":" While current tasks of converting natural language to SQL (NL2SQL) using\nFoundation Models have shown impressive achievements, adapting these approaches\nfor converting natural language to Graph Query Language (NL2GQL) encounters\nhurdles due to the distinct nature of GQL compared to SQL, alongside the\ndiverse forms of GQL. Moving away from traditional rule-based and slot-filling\nmethodologies, we introduce a novel approach, $R^3$-NL2GQL, integrating both\nsmall and large Foundation Models for ranking, rewriting, and refining tasks.\nThis method leverages the interpretative strengths of smaller models for\ninitial ranking and rewriting stages, while capitalizing on the superior\ngeneralization and query generation prowess of larger models for the final\ntransformation of natural language queries into GQL formats. Addressing the\nscarcity of datasets in this emerging field, we have developed a bilingual\ndataset, sourced from graph database manuals and selected open-source Knowledge\nGraphs (KGs). 
Our evaluation of this methodology on this dataset demonstrates\nits promising efficacy and robustness.\n","authors":["Yuhang Zhou","Yu He","Siyu Tian","Yuchen Ni","Zhangyue Yin","Xiang Liu","Chuanjun Ji","Sen Liu","Xipeng Qiu","Guangnan Ye","Hongfeng Chai"],"pdf_url":"https://arxiv.org/pdf/2311.01862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10882v2","updated":"2024-07-01T14:55:01Z","published":"2024-06-16T10:10:37Z","title":"SCAR: Efficient Instruction-Tuning for Large Language Models via Style\n Consistency-Aware Response Ranking","summary":" Recent studies have shown that maintaining a consistent response style by\nhuman experts and enhancing data quality in training sets can significantly\nimprove the performance of fine-tuned Large Language Models (LLMs) while\nreducing the number of training examples needed. However, the precise\ndefinition of style and the relationship between style, data quality, and LLM\nperformance remains unclear. This research decomposes response style into\npresentation and composition styles and finds that, among training data of\nsimilar quality, those with higher style consistency lead to better LLM\nperformance. Inspired by this, we introduce Style Consistency-Aware Response\nRanking (SCAR), which automatically prioritizes instruction-response pairs in\nthe training set based on their response stylistic consistency. By selecting\nthe most style-consistent examples, ranging from the top 25% to 0.7% of the\nfull dataset, the fine-tuned LLMs can match or even surpass the performance of\nmodels trained on the entire dataset in coding and open-ended\nquestion-answering benchmarks. Code and data are available at\nhttps://github.com/zhuang-li/SCAR .\n","authors":["Zhuang Li","Yuncheng Hua","Thuy-Trang Vu","Haolan Zhan","Lizhen Qu","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2406.10882v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.15146v2","updated":"2024-07-01T14:43:11Z","published":"2024-04-23T15:49:37Z","title":"Rethinking LLM Memorization through the Lens of Adversarial Compression","summary":" Large language models (LLMs) trained on web-scale datasets raise substantial\nconcerns regarding permissible data usage. One major question is whether these\nmodels \"memorize\" all their training data or they integrate many data sources\nin some way more akin to how a human would learn and synthesize information.\nThe answer hinges, to a large degree, on how we define memorization. In this\nwork, we propose the Adversarial Compression Ratio (ACR) as a metric for\nassessing memorization in LLMs. A given string from the training data is\nconsidered memorized if it can be elicited by a prompt (much) shorter than the\nstring itself -- in other words, if these strings can be \"compressed\" with the\nmodel by computing adversarial prompts of fewer tokens. The ACR overcomes the\nlimitations of existing notions of memorization by (i) offering an adversarial\nview of measuring memorization, especially for monitoring unlearning and\ncompliance; and (ii) allowing for the flexibility to measure memorization for\narbitrary strings at a reasonably low compute. Our definition serves as a\npractical tool for determining when model owners may be violating terms around\ndata usage, providing a potential legal tool and a critical lens through which\nto address such scenarios.\n","authors":["Avi Schwarzschild","Zhili Feng","Pratyush Maini","Zachary C. Lipton","J. 
Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2404.15146v2.pdf","comment":"https://locuslab.github.io/acr-memorization"},{"id":"http://arxiv.org/abs/2406.06764v3","updated":"2024-07-01T14:37:09Z","published":"2024-06-10T19:50:16Z","title":"$Classi|Q\\rangle$ Towards a Translation Framework To Bridge The\n Classical-Quantum Programming Gap","summary":" Quantum computing, albeit readily available as hardware or emulated on the\ncloud, is still far from being available in general regarding complex\nprogramming paradigms and learning curves. This vision paper introduces\n$Classi|Q\\rangle$, a translation framework idea to bridge Classical and Quantum\nComputing by translating high-level programming languages, e.g., Python or C++,\ninto a low-level language, e.g., Quantum Assembly. Our idea paper serves as a\nblueprint for ongoing efforts in quantum software engineering, offering a\nroadmap for further $Classi|Q\\rangle$ development to meet the diverse needs of\nresearchers and practitioners. $Classi|Q\\rangle$ is designed to empower\nresearchers and practitioners with no prior quantum experience to harness the\npotential of hybrid quantum computation. We also discuss future enhancements to\n$Classi|Q\\rangle$, including support for additional quantum languages, improved\noptimization strategies, and integration with emerging quantum computing\nplatforms.\n","authors":["Matteo Esposito","Maryam Tavassoli Sabzevari","Boshuai Ye","Davide Falessi","Arif Ali Khan","Davide Taibi"],"pdf_url":"https://arxiv.org/pdf/2406.06764v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11464v2","updated":"2024-07-01T14:27:51Z","published":"2024-05-19T06:43:12Z","title":"Efficient Prompt Tuning by Multi-Space Projection and Prompt Fusion","summary":" Prompt tuning is a promising method to fine-tune a pre-trained language model\nwithout retraining its large-scale parameters. Instead, it attaches a soft\nprompt to the input text, whereby downstream tasks can be well adapted by\nmerely learning the embeddings of prompt tokens. Nevertheless, existing methods\nstill suffer from two challenges: (i) they are hard to balance accuracy and\nefficiency. A longer (shorter) soft prompt generally leads to a better(worse)\naccuracy but at the cost of more (less) training time. (ii)The performance may\nnot be consistent when adapting to different downstream tasks. We attribute it\nto the same embedding space but responsible for different requirements of\ndownstream tasks. To address these issues, we propose an Efficient Prompt\nTuning method (EPT) by multi-space projection and prompt fusion. Specifically,\nit decomposes a given soft prompt into a shorter prompt and two low-rank\nmatrices, significantly reducing the training time. Accuracy is also enhanced\nby leveraging low-rank matrices and the short prompt as additional knowledge\nsources to enrich the semantics of the original short prompt. In addition, we\nproject the soft prompt into multiple subspaces to improve the performance\nconsistency, and then adaptively learn the combination weights of different\nspaces through a gating network. 
Experiments on 13 natural language processing\ndownstream tasks show that our method significantly and consistently\noutperforms 11 comparison methods with the relative percentage of improvements\nup to 12.9%, and training time decreased by 14%.\n","authors":["Pengxiang Lan","Enneng Yang","Yuting Liu","Guibing Guo","Linying Jiang","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.11464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11720v2","updated":"2024-07-01T13:49:45Z","published":"2023-12-18T21:42:34Z","title":"Assessing Logical Reasoning Capabilities of Encoder-Only Transformer\n Models","summary":" Logical reasoning is central to complex human activities, such as thinking,\ndebating, and planning; it is also a central component of many AI systems as\nwell. In this paper, we investigate the extent to which encoder-only\ntransformer language models (LMs) can reason according to logical rules. We ask\nwhether those LMs can deduce theorems in propositional calculus and first-order\nlogic; if their relative success in these problems reflects general logical\ncapabilities; and which layers contribute the most to the task. First, we show\nfor several encoder-only LMs that they can be trained, to a reasonable degree,\nto determine logical validity on various datasets. Next, by cross-probing\nfine-tuned models on these datasets, we show that LMs have difficulty in\ntransferring their putative logical reasoning ability, which suggests that they\nmay have learned dataset-specific features, instead of a general capability.\nFinally, we conduct a layerwise probing experiment, which shows that the\nhypothesis classification task is mostly solved through higher layers.\n","authors":["Paulo Pirozelli","Marcos M. José","Paulo de Tarso P. Filho","Anarosa A. F. Brandão","Fabio G. Cozman"],"pdf_url":"https://arxiv.org/pdf/2312.11720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10625v2","updated":"2024-07-01T13:36:29Z","published":"2024-06-15T13:16:44Z","title":"On the Hardness of Faithful Chain-of-Thought Reasoning in Large Language\n Models","summary":" As Large Language Models (LLMs) are increasingly being employed in real-world\napplications in critical domains such as healthcare, it is important to ensure\nthat the Chain-of-Thought (CoT) reasoning generated by these models faithfully\ncaptures their underlying behavior.\n While LLMs are known to generate CoT reasoning that is appealing to humans,\nprior studies have shown that these explanations do not accurately reflect the\nactual behavior of the underlying LLMs. In this work, we explore the promise of\nthree broad approaches commonly employed to steer the behavior of LLMs to\nenhance the faithfulness of the CoT reasoning generated by LLMs: in-context\nlearning, fine-tuning, and activation editing. Specifically, we introduce novel\nstrategies for in-context learning, fine-tuning, and activation editing aimed\nat improving the faithfulness of the CoT reasoning. We then carry out extensive\nempirical analyses with multiple benchmark datasets to explore the promise of\nthese strategies. Our analyses indicate that these strategies offer limited\nsuccess in improving the faithfulness of the CoT reasoning, with only slight\nperformance enhancements in controlled scenarios. Activation editing\ndemonstrated minimal success, while fine-tuning and in-context learning\nachieved marginal improvements that failed to generalize across diverse\nreasoning and truthful question-answering benchmarks. 
In summary, our work\nunderscores the inherent difficulty in eliciting faithful CoT reasoning from\nLLMs, suggesting that the current array of approaches may not be sufficient to\naddress this complex challenge.\n","authors":["Sree Harsha Tanneru","Dan Ley","Chirag Agarwal","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2406.10625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07945v3","updated":"2024-07-01T13:25:57Z","published":"2023-11-14T06:45:31Z","title":"First-Step Advantage: Importance of Starting Right in Multi-Step Math\n Reasoning","summary":" Language models can solve complex reasoning tasks better by learning to\ngenerate rationales for their predictions. Often these models know how to solve\na task but their auto-regressive decoding nature leads to incorrect results if\nthey start incorrectly. We observe that smaller models in particular when\ncorrected, can solve a task that they would have otherwise struggled with. We\ndemonstrate this phenomenon by using a larger model to guide smaller models,\nwhich leads to significantly improved performance (up to +24 points on the\nGSM8K dataset by 7B models). To assist smaller models in initiating the\nstarting step, we propose QuestCoT, where a smaller model first asks itself how\nto start, before proceeding with a chain of reasoning. On various multistep\nmathematical reasoning datasets over multiple smaller models, we show that\ngetting the right start can lead to significant performance gains across all\nmodels (gains of up to +6 points on GSM8K, +9 on SVAMP, +5 on ASDiv, and +7 on\nMultiArith).\n","authors":["Kushal Jain","Moritz Miller","Niket Tandon","Kumar Shridhar"],"pdf_url":"https://arxiv.org/pdf/2311.07945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06371v2","updated":"2024-07-01T13:16:49Z","published":"2024-04-09T15:07:25Z","title":"Model Generation with LLMs: From Requirements to UML Sequence Diagrams","summary":" Complementing natural language (NL) requirements with graphical models can\nimprove stakeholders' communication and provide directions for system design.\nHowever, creating models from requirements involves manual effort. The advent\nof generative large language models (LLMs), ChatGPT being a notable example,\noffers promising avenues for automated assistance in model generation. This\npaper investigates the capability of ChatGPT to generate a specific type of\nmodel, i.e., UML sequence diagrams, from NL requirements. We conduct a\nqualitative study in which we examine the sequence diagrams generated by\nChatGPT for 28 requirements documents of various types and from different\ndomains. Observations from the analysis of the generated diagrams have\nsystematically been captured through evaluation logs, and categorized through\nthematic analysis. Our results indicate that, although the models generally\nconform to the standard and exhibit a reasonable level of understandability,\ntheir completeness and correctness with respect to the specified requirements\noften present challenges. This issue is particularly pronounced in the presence\nof requirements smells, such as ambiguity and inconsistency. 
The insights\nderived from this study can influence the practical utilization of LLMs in the\nRE process, and open the door to novel RE-specific prompting strategies\ntargeting effective model generation.\n","authors":["Alessio Ferrari","Sallam Abualhaija","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2404.06371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10208v2","updated":"2024-07-01T12:48:51Z","published":"2024-02-15T18:59:02Z","title":"Recovering the Pre-Fine-Tuning Weights of Generative Models","summary":" The dominant paradigm in generative modeling consists of two steps: i)\npre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained\nmodel with human values via fine-tuning. This practice is considered safe, as\nno current method can recover the unsafe, pre-fine-tuning model weights. In\nthis paper, we demonstrate that this assumption is often false. Concretely, we\npresent Spectral DeTuning, a method that can recover the weights of the\npre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In\ncontrast to previous attacks that attempt to recover pre-fine-tuning\ncapabilities, our method aims to recover the exact pre-fine-tuning weights. Our\napproach exploits this new vulnerability against large-scale models such as a\npersonalized Stable Diffusion and an aligned Mistral.\n","authors":["Eliahu Horwitz","Jonathan Kahana","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2402.10208v2.pdf","comment":"ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/"},{"id":"http://arxiv.org/abs/2406.13663v2","updated":"2024-07-01T12:39:26Z","published":"2024-06-19T16:10:26Z","title":"Model Internals-based Answer Attribution for Trustworthy\n Retrieval-Augmented Generation","summary":" Ensuring the verifiability of model answers is a fundamental challenge for\nretrieval-augmented generation (RAG) in the question answering (QA) domain.\nRecently, self-citation prompting was proposed to make large language models\n(LLMs) generate citations to supporting documents along with their answers.\nHowever, self-citing LLMs often struggle to match the required format, refer to\nnon-existent sources, and fail to faithfully reflect LLMs' context usage\nthroughout the generation. In this work, we present MIRAGE --Model\nInternals-based RAG Explanations -- a plug-and-play approach using model\ninternals for faithful answer attribution in RAG applications. MIRAGE detects\ncontext-sensitive answer tokens and pairs them with retrieved documents\ncontributing to their prediction via saliency methods. We evaluate our proposed\napproach on a multilingual extractive QA dataset, finding high agreement with\nhuman answer attribution. On open-ended QA, MIRAGE achieves citation quality\nand efficiency comparable to self-citation while also allowing for a\nfiner-grained control of attribution parameters. Our qualitative evaluation\nhighlights the faithfulness of MIRAGE's attributions and underscores the\npromising application of model internals for RAG answer attribution.\n","authors":["Jirui Qi","Gabriele Sarti","Raquel Fernández","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2406.13663v2.pdf","comment":"Under review. 
Code and data released at\n https://github.com/Betswish/MIRAGE"},{"id":"http://arxiv.org/abs/2310.14863v2","updated":"2024-07-01T12:32:37Z","published":"2023-10-23T12:32:41Z","title":"Paraphrase Types for Generation and Detection","summary":" Current approaches in paraphrase generation and detection heavily rely on a\nsingle general similarity score, ignoring the intricate linguistic properties\nof language. This paper introduces two new tasks to address this shortcoming by\nconsidering paraphrase types - specific linguistic perturbations at particular\ntext positions. We name these tasks Paraphrase Type Generation and Paraphrase\nType Detection. Our results suggest that while current techniques perform well\nin a binary classification scenario, i.e., paraphrased or not, the inclusion of\nfine-grained paraphrase types poses a significant challenge. While most\napproaches are good at generating and detecting general semantic similar\ncontent, they fail to understand the intrinsic linguistic variables they\nmanipulate. Models trained in generating and identifying paraphrase types also\nshow improvements in tasks without them. In addition, scaling these models\nfurther improves their ability to understand paraphrase types. We believe\nparaphrase types can unlock a new paradigm for developing paraphrase models and\nsolving tasks in the future.\n","authors":["Jan Philip Wahle","Bela Gipp","Terry Ruas"],"pdf_url":"https://arxiv.org/pdf/2310.14863v2.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.14870v2","updated":"2024-07-01T12:31:46Z","published":"2023-10-23T12:42:06Z","title":"We are Who We Cite: Bridges of Influence Between Natural Language\n Processing and Other Academic Fields","summary":" Natural Language Processing (NLP) is poised to substantially influence the\nworld. However, significant progress comes hand-in-hand with substantial risks.\nAddressing them requires broad engagement with various fields of study. Yet,\nlittle empirical work examines the state of such engagement (past or current).\nIn this paper, we quantify the degree of influence between 23 fields of study\nand NLP (on each other). We analyzed ~77k NLP papers, ~3.1m citations from NLP\npapers to other papers, and ~1.8m citations from other papers to NLP papers. We\nshow that, unlike most fields, the cross-field engagement of NLP, measured by\nour proposed Citation Field Diversity Index (CFDI), has declined from 0.58 in\n1980 to 0.31 in 2022 (an all-time low). In addition, we find that NLP has grown\nmore insular -- citing increasingly more NLP papers and having fewer papers\nthat act as bridges between fields. NLP citations are dominated by computer\nscience; Less than 8% of NLP citations are to linguistics, and less than 3% are\nto math and psychology. These findings underscore NLP's urgent need to reflect\non its engagement with various fields.\n","authors":["Jan Philip Wahle","Terry Ruas","Mohamed Abdalla","Bela Gipp","Saif M. Mohammad"],"pdf_url":"https://arxiv.org/pdf/2310.14870v2.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.02797v3","updated":"2024-07-01T12:30:57Z","published":"2023-05-04T12:57:18Z","title":"The Elephant in the Room: Analyzing the Presence of Big Tech in Natural\n Language Processing Research","summary":" Recent advances in deep learning methods for natural language processing\n(NLP) have created new business opportunities and made NLP research critical\nfor industry development. 
As one of the big players in the field of NLP,\ntogether with governments and universities, it is important to track the\ninfluence of industry on research. In this study, we seek to quantify and\ncharacterize industry presence in the NLP community over time. Using a corpus\nwith comprehensive metadata of 78,187 NLP publications and 701 resumes of NLP\npublication authors, we explore the industry presence in the field since the\nearly 90s. We find that industry presence among NLP authors has been steady\nbefore a steep increase over the past five years (180% growth from 2017 to\n2022). A few companies account for most of the publications and provide funding\nto academic researchers through grants and internships. Our study shows that\nthe presence and impact of the industry on natural language processing research\nare significant and fast-growing. This work calls for increased transparency of\nindustry influence in the field.\n","authors":["Mohamed Abdalla","Jan Philip Wahle","Terry Ruas","Aurélie Névéol","Fanny Ducel","Saif M. Mohammad","Karën Fort"],"pdf_url":"https://arxiv.org/pdf/2305.02797v3.pdf","comment":"Published at ACL 2023"},{"id":"http://arxiv.org/abs/2406.03855v2","updated":"2024-07-01T11:36:02Z","published":"2024-06-06T08:41:46Z","title":"Performance of large language models in numerical vs. semantic medical\n knowledge: Benchmarking on evidence-based Q&As","summary":" Clinical problem-solving requires processing of semantic medical knowledge\nsuch as illness scripts and numerical medical knowledge of diagnostic tests for\nevidence-based decision-making. As large language models (LLMs) show promising\nresults in many aspects of language-based clinical practice, their ability to\ngenerate non-language evidence-based answers to clinical questions is\ninherently limited by tokenization. Therefore, we evaluated LLMs' performance\non two question types: numeric (correlating findings) and semantic\n(differentiating entities) while examining differences within and between LLMs\nin medical aspects and comparing their performance to humans. To generate\nstraightforward multi-choice questions and answers (QAs) based on\nevidence-based medicine (EBM), we used a comprehensive medical knowledge graph\n(encompassed data from more than 50,00 peer-reviewed articles) and created the\n\"EBMQA\". EBMQA contains 105,000 QAs labeled with medical and non-medical topics\nand classified into numerical or semantic questions. We benchmarked this\ndataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and\nClaude3-Opus. We evaluated the LLMs accuracy on semantic and numerical question\ntypes and according to sub-labeled topics. For validation, six medical experts\nwere tested on 100 numerical EBMQA questions. We found that both LLMs excelled\nmore in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical\nQAs. However, both LLMs showed inter and intra gaps in different medical\naspects and remained inferior to humans. Thus, their medical advice should be\naddressed carefully.\n","authors":["Eden Avnat","Michal Levy","Daniel Herstain","Elia Yanko","Daniel Ben Joya","Michal Tzuchman Katz","Dafna Eshel","Sahar Laros","Yael Dagan","Shahar Barami","Joseph Mermelstein","Shahar Ovadia","Noam Shomron","Varda Shalev","Raja-Elie E. 
Abdulnour"],"pdf_url":"https://arxiv.org/pdf/2406.03855v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02177v2","updated":"2024-07-01T11:31:21Z","published":"2024-03-04T16:21:19Z","title":"ProTrix: Building Models for Planning and Reasoning over Tables with\n Sentence Context","summary":" Tables play a crucial role in conveying information in various domains. We\npropose a Plan-then-Reason framework to answer different types of user queries\nover tables with sentence context. The framework first plans the reasoning\npaths over the context, then assigns each step to program-based or textual\nreasoning to reach the final answer. This framework enhances the table\nreasoning abilities for both in-context learning and fine-tuning methods.\nGPT-3.5-Turbo following Plan-then-Reason framework surpasses other prompting\nbaselines without self-consistency while using less API calls and in-context\ndemonstrations. We also construct an instruction tuning set TrixInstruct to\nevaluate the effectiveness of fine-tuning with this framework. We present\nProTrix model family by finetuning models on TrixInstruct. Our experiments show\nthat ProTrix family generalizes to diverse unseen tabular tasks with only 6k\ntraining instances. We further demonstrate that ProTrix can generate accurate\nand faithful explanations to answer complex free-form questions. Our work\nunderscores the importance of the planning and reasoning abilities towards a\nmodel over tabular tasks with generalizability and interpretability. We\nopen-source our dataset and models at https://github.com/WilliamZR/ProTrix.\n","authors":["Zirui Wu","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2403.02177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11985v3","updated":"2024-07-01T10:40:19Z","published":"2023-12-19T09:26:46Z","title":"Climate Change from Large Language Models","summary":" Climate change poses grave challenges, demanding widespread understanding and\nlow-carbon lifestyle awareness. Large language models (LLMs) offer a powerful\ntool to address this crisis, yet comprehensive evaluations of their\nclimate-crisis knowledge are lacking. This paper proposes an automated\nevaluation framework to assess climate-crisis knowledge within LLMs. We adopt a\nhybrid approach for data acquisition, combining data synthesis and manual\ncollection, to compile a diverse set of questions encompassing various aspects\nof climate change. Utilizing prompt engineering based on the compiled\nquestions, we evaluate the model's knowledge by analyzing its generated\nanswers. Furthermore, we introduce a comprehensive set of metrics to assess\nclimate-crisis knowledge, encompassing indicators from 10 distinct\nperspectives. These metrics provide a multifaceted evaluation, enabling a\nnuanced understanding of the LLMs' climate crisis comprehension. The\nexperimental results demonstrate the efficacy of our proposed method. 
In our\nevaluation utilizing diverse high-performing LLMs, we discovered that while\nLLMs possess considerable climate-related knowledge, there are shortcomings in\nterms of timeliness, indicating a need for continuous updating and refinement\nof their climate-related content.\n","authors":["Hongyin Zhu","Prayag Tiwari"],"pdf_url":"https://arxiv.org/pdf/2312.11985v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02022v2","updated":"2024-07-01T10:38:59Z","published":"2024-04-02T15:10:11Z","title":"Improving Retrieval Augmented Open-Domain Question-Answering with\n Vectorized Contexts","summary":" In the era of large language models, applying techniques such as Retrieval\nAugmented Generation can better address Open-Domain Question-Answering\nproblems. Due to constraints including model sizes and computing resources, the\nlength of context is often limited, and it becomes challenging to empower the\nmodel to cover overlong contexts while answering questions from open domains.\nThis paper proposes a general and convenient method to covering longer contexts\nin Open-Domain Question-Answering tasks. It leverages a small encoder language\nmodel that effectively encodes contexts, and the encoding applies\ncross-attention with origin inputs. With our method, the origin language models\ncan cover several times longer contexts while keeping the computing\nrequirements close to the baseline. Our experiments demonstrate that after\nfine-tuning, there is improved performance across two held-in datasets, four\nheld-out datasets, and also in two In Context Learning settings.\n","authors":["Zhuo Chen","Xinyu Wang","Yong Jiang","Pengjun Xie","Fei Huang","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2404.02022v2.pdf","comment":"ACL2023 Findings"},{"id":"http://arxiv.org/abs/2406.10288v2","updated":"2024-07-01T10:17:58Z","published":"2024-06-12T18:33:11Z","title":"Mimicking User Data: On Mitigating Fine-Tuning Risks in Closed Large\n Language Models","summary":" Fine-tuning large language models on small, high-quality datasets can enhance\ntheir performance on specific downstream tasks. Recent research shows that\nfine-tuning on benign, instruction-following data can inadvertently undo the\nsafety alignment process and increase a model's propensity to comply with\nharmful queries. Although critical, understanding and mitigating safety risks\nin well-defined tasks remains distinct from the instruction-following context\ndue to structural differences in the data. Our work addresses the gap in our\nunderstanding of these risks across diverse types of data in closed models -\nwhere providers control how user data is utilized in the fine-tuning process.\nWe demonstrate how malicious actors can subtly manipulate the structure of\nalmost any task-specific dataset to foster significantly more dangerous model\nbehaviors, while maintaining an appearance of innocuity and reasonable\ndownstream task performance. To address this issue, we propose a novel\nmitigation strategy that mixes in safety data which mimics the task format and\nprompting style of the user data, showing this is more effective than existing\nbaselines at re-establishing safety alignment while maintaining similar task\nperformance.\n","authors":["Francisco Eiras","Aleksandar Petrov","Phillip H. S. Torr","M. 
Pawan Kumar","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2406.10288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11096v2","updated":"2024-07-01T10:04:09Z","published":"2024-06-16T22:59:18Z","title":"The Potential and Challenges of Evaluating Attitudes, Opinions, and\n Values in Large Language Models","summary":" Recent advances in Large Language Models (LLMs) have sparked wide interest in\nvalidating and comprehending the human-like cognitive-behavioral traits LLMs\nmay have. These cognitive-behavioral traits include typically Attitudes,\nOpinions, Values (AOV). However, measuring AOV embedded within LLMs remains\nopaque, and different evaluation methods may yield different results. This has\nled to a lack of clarity on how different studies are related to each other and\nhow they can be interpreted. This paper aims to bridge this gap by providing an\noverview of recent works on the evaluation of AOV in LLMs. Moreover, we survey\nrelated approaches in different stages of the evaluation pipeline in these\nworks. By doing so, we address the potential and challenges with respect to\nunderstanding the model, human-AI alignment, and downstream application in\nsocial sciences. Finally, we provide practical insights into evaluation\nmethods, model enhancement, and interdisciplinary collaboration, thereby\ncontributing to the evolving landscape of evaluating AOV in LLMs.\n","authors":["Bolei Ma","Xinpeng Wang","Tiancheng Hu","Anna-Carolina Haensch","Michael A. Hedderich","Barbara Plank","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2406.11096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04858v2","updated":"2024-07-01T10:03:33Z","published":"2024-02-07T13:55:27Z","title":"CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay","summary":" Large language models are increasingly solving tasks that are commonly\nbelieved to require human-level reasoning ability. However, these models still\nperform very poorly on benchmarks of general intelligence such as the\nAbstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a\nprogramming-by-examples problem, and introduce a novel and scalable method for\nlanguage model self-improvement called Code Iteration (CodeIt). Our method\niterates between 1) program sampling and hindsight relabeling, and 2) learning\nfrom prioritized experience replay. By relabeling the goal of an episode (i.e.,\nthe target program output given input) to the realized output produced by the\nsampled program, our method effectively deals with the extreme sparsity of\nrewards in program synthesis. Applying CodeIt to the ARC dataset, we\ndemonstrate that prioritized hindsight replay, along with pre-training and\ndata-augmentation, leads to successful inter-task generalization. CodeIt is the\nfirst neuro-symbolic approach that scales to the full ARC evaluation dataset.\nOur method solves 15% of ARC evaluation tasks, achieving state-of-the-art\nperformance and outperforming existing neural and symbolic baselines. Our code\nis available at https://github.com/Qualcomm-AI-research/codeit .\n","authors":["Natasha Butt","Blazej Manczak","Auke Wiggers","Corrado Rainone","David W. 
Zhang","Michaël Defferrard","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2402.04858v2.pdf","comment":"ICML'24 camera-ready version"},{"id":"http://arxiv.org/abs/2403.13583v2","updated":"2024-07-01T09:59:47Z","published":"2024-03-20T13:33:55Z","title":"CoCoST: Automatic Complex Code Generation with Online Searching and\n Correctness Testing","summary":" Large Language Models have revolutionized code generation ability by\nconverting natural language descriptions into executable code. However,\ngenerating complex code within real-world scenarios remains challenging due to\nintricate structures, subtle bugs, understanding of advanced data types, and\nlack of supplementary contents. To address these challenges, we introduce the\nCoCoST framework, which enhances complex code generation by online searching\nfor more information with planned queries and correctness testing for code\nrefinement. Moreover, CoCoST serializes the complex inputs and outputs to\nimprove comprehension and generates test cases to ensure the adaptability for\nreal-world applications. CoCoST is validated through rigorous experiments on\nthe DS-1000 and ClassEval datasets. Experimental results show that CoCoST\nsubstantially improves the quality of complex code generation, highlighting its\npotential to enhance the practicality of LLMs in generating complex code.\n","authors":["Xinyi He","Jiaru Zou","Yun Lin","Mengyu Zhou","Shi Han","Zejian Yuan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09022v3","updated":"2024-07-01T09:40:58Z","published":"2023-11-15T15:12:15Z","title":"Exploring the Potential of Large Language Models in Computational\n Argumentation","summary":" Computational argumentation has become an essential tool in various domains,\nincluding law, public policy, and artificial intelligence. It is an emerging\nresearch field in natural language processing that attracts increasing\nattention. Research on computational argumentation mainly involves two types of\ntasks: argument mining and argument generation. As large language models (LLMs)\nhave demonstrated impressive capabilities in understanding context and\ngenerating natural language, it is worthwhile to evaluate the performance of\nLLMs on diverse computational argumentation tasks. This work aims to embark on\nan assessment of LLMs, such as ChatGPT, Flan models, and LLaMA2 models, in both\nzero-shot and few-shot settings. We organize existing tasks into six main\ncategories and standardize the format of fourteen openly available datasets. In\naddition, we present a new benchmark dataset on counter speech generation that\naims to holistically evaluate the end-to-end performance of LLMs on argument\nmining and argument generation. Extensive experiments show that LLMs exhibit\ncommendable performance across most of the datasets, demonstrating their\ncapabilities in the field of argumentation. 
Our analysis offers valuable\nsuggestions for evaluating computational argumentation and its integration with\nLLMs in future research endeavors.\n","authors":["Guizhen Chen","Liying Cheng","Luu Anh Tuan","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2311.09022v3.pdf","comment":"Accepted at ACL 2024 Main"},{"id":"http://arxiv.org/abs/2402.11975v2","updated":"2024-07-01T09:38:06Z","published":"2024-02-19T09:19:50Z","title":"Compress to Impress: Unleashing the Potential of Compressive Memory in\n Real-World Long-Term Conversations","summary":" Existing retrieval-based methods have made significant strides in maintaining\nlong-term conversations. However, these approaches face challenges in memory\ndatabase management and accurate memory retrieval, hindering their efficacy in\ndynamic, real-world interactions. This study introduces a novel framework,\nCOmpressive Memory-Enhanced Dialogue sYstems (COMEDY), which eschews\ntraditional retrieval modules and memory databases. Instead, COMEDY adopts a\n\"One-for-All\" approach, utilizing a single language model to manage memory\ngeneration, compression, and response generation. Central to this framework is\nthe concept of compressive memory, which intergrates session-specific\nsummaries, user-bot dynamics, and past events into a concise memory format. To\nsupport COMEDY, we curated a large-scale Chinese instruction-tuning dataset,\nDolphin, derived from real user-chatbot interactions. Comparative evaluations\ndemonstrate COMEDY's superiority over traditional retrieval-based methods in\nproducing more nuanced and human-like conversational experiences. Our codes are\navailable at https://github.com/nuochenpku/COMEDY.\n","authors":["Nuo Chen","Hongguang Li","Juhua Huang","Baoyuan Wang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2402.11975v2.pdf","comment":"17pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.07440v2","updated":"2024-07-01T09:30:34Z","published":"2024-06-11T16:48:17Z","title":"Textual Similarity as a Key Metric in Machine Translation Quality\n Estimation","summary":" Machine Translation (MT) Quality Estimation (QE) assesses translation\nreliability without reference texts. This study introduces \"textual similarity\"\nas a new metric for QE, using sentence transformers and cosine similarity to\nmeasure semantic closeness. Analyzing data from the MLQE-PE dataset, we found\nthat textual similarity exhibits stronger correlations with human scores than\ntraditional metrics (hter, model evaluation, sentence probability etc.).\nEmploying GAMMs as a statistical tool, we demonstrated that textual similarity\nconsistently outperforms other metrics across multiple language pairs in\npredicting human scores. We also found that \"hter\" actually failed to predict\nhuman scores in QE. Our findings highlight the effectiveness of textual\nsimilarity as a robust QE metric, recommending its integration with other\nmetrics into QE frameworks and MT system training for improved accuracy and\nusability.\n","authors":["Kun Sun","Rong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.07440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13857v2","updated":"2024-07-01T09:23:10Z","published":"2023-05-23T09:24:53Z","title":"Revealing User Familiarity Bias in Task-Oriented Dialogue via\n Interactive Evaluation","summary":" Most task-oriented dialogue (TOD) benchmarks assume users that know exactly\nhow to use the system by constraining the user behaviors within the system's\ncapabilities via strict user goals, namely \"user familiarity\" bias. 
This data\nbias deepens when it combines with data-driven TOD systems, as it is impossible\nto fathom the effect of it with existing static evaluations. Hence, we conduct\nan interactive user study to unveil how vulnerable TOD systems are against\nrealistic scenarios. In particular, we compare users with 1) detailed goal\ninstructions that conform to the system boundaries (closed-goal) and 2) vague\ngoal instructions that are often unsupported but realistic (open-goal). Our\nstudy reveals that conversations in open-goal settings lead to catastrophic\nfailures of the system, in which 92% of the dialogues had significant issues.\nMoreover, we conduct a thorough analysis to identify distinctive features\nbetween the two settings through error annotation. From this, we discover a\nnovel \"pretending\" behavior, in which the system pretends to handle the user\nrequests even though they are beyond the system's capabilities. We discuss its\ncharacteristics and toxicity while showing recent large language models can\nalso suffer from this behavior.\n","authors":["Takyoung Kim","Jamin Shin","Young-Ho Kim","Sanghwan Bae","Sungdong Kim"],"pdf_url":"https://arxiv.org/pdf/2305.13857v2.pdf","comment":"NLP4ConvAI Workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2402.16029v3","updated":"2024-07-01T09:15:38Z","published":"2024-02-25T08:41:32Z","title":"GraphWiz: An Instruction-Following Language Model for Graph Problems","summary":" Large language models (LLMs) have achieved impressive success across several\nfields, but their proficiency in understanding and resolving complex graph\nproblems is less explored. To bridge this gap, we introduce GraphInstruct, a\nnovel and comprehensive instruction-tuning dataset designed to equip language\nmodels with the ability to tackle a broad spectrum of graph problems using\nexplicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an\nopen-source language model capable of resolving various graph problem types\nwhile generating clear reasoning processes. To enhance the model's capability\nand reliability, we incorporate the Direct Preference Optimization (DPO)\nframework into the graph problem-solving context. The enhanced model,\nGraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with\ndifferent complexity levels, surpassing GPT-4 which has an average accuracy of\n43.8%. Moreover, our research delves into the delicate balance between training\ndata volume and model performance, highlighting the potential for overfitting\nwith increased data. We also explore the transferability of the model's\nreasoning ability across different graph tasks, indicating the model's\nadaptability and practical application potential. Our investigation offers a\nnew blueprint and valuable insights for developing LLMs specialized in graph\nreasoning and problem-solving.\n","authors":["Nuo Chen","Yuhan Li","Jianheng Tang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2402.16029v3.pdf","comment":"27pages, 15 tables"},{"id":"http://arxiv.org/abs/2402.10770v2","updated":"2024-07-01T08:59:08Z","published":"2024-02-16T15:48:33Z","title":"How Reliable Are Automatic Evaluation Methods for Instruction-Tuned\n LLMs?","summary":" Work on instruction-tuned Large Language Models (LLMs) has used automatic\nmethods based on text overlap and LLM judgments as cost-effective alternatives\nto human evaluation. In this paper, we perform a meta-evaluation of such\nmethods and assess their reliability across a broad range of tasks. 
We observe\nthat while automatic evaluation methods can approximate human ratings under\nspecific conditions, their validity is highly context-dependent. Specifically,\nthe simple ROUGE-L metric correlates well with human ratings for short-answer\nEnglish tasks but is unreliable in free-form generation tasks and cross-lingual\ntransfer. The effectiveness of the more advanced method of using GPT-4 as a\njudge diminishes significantly if reference answers are not included in the\nprompt, which is the scenario where this method has the potential to provide\nthe most value compared to other metrics. Our findings enhance the\nunderstanding of how automatic methods should be applied and interpreted when\ndeveloping and evaluating instruction-tuned LLMs.\n","authors":["Ehsan Doostmohammadi","Oskar Holmström","Marco Kuhlmann"],"pdf_url":"https://arxiv.org/pdf/2402.10770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04923v3","updated":"2024-07-01T07:15:33Z","published":"2023-11-03T08:59:51Z","title":"Is one brick enough to break the wall of spoken dialogue state tracking?","summary":" In Task-Oriented Dialogue (TOD) systems, correctly updating the system's\nunderstanding of the user's requests (\textit{a.k.a} dialogue state tracking)\nis key to a smooth interaction. Traditionally, TOD systems perform this update\nin three steps: transcription of the user's utterance, semantic extraction of\nthe key concepts, and contextualization with the previously identified\nconcepts. Such cascade approaches suffer from cascading errors and separate\noptimization. End-to-End approaches have been proven helpful up to the\nturn-level semantic extraction step. This paper goes one step further and\nprovides (1) a novel approach for completely neural spoken DST, (2) an in-depth\ncomparison with a state-of-the-art cascade approach and (3) avenues towards\nbetter context propagation. Our study highlights that jointly-optimized\napproaches are also competitive for contextually dependent tasks, such as\nDialogue State Tracking (DST), especially in audio native settings. Context\npropagation in DST systems could benefit from training procedures accounting\nfor the previous context's inherent uncertainty.\n","authors":["Lucas Druart","Valentin Vielzeuf","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2311.04923v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18664v2","updated":"2024-07-01T07:12:45Z","published":"2024-06-26T18:09:46Z","title":"Evaluating Copyright Takedown Methods for Language Models","summary":" Language models (LMs) derive their capabilities from extensive training on\ndiverse data, including potentially copyrighted material. These models can\nmemorize and generate content similar to their training data, posing potential\nconcerns. Therefore, model creators are motivated to develop mitigation methods\nthat prevent generating protected content. We term this procedure as copyright\ntakedowns for LMs, noting the conceptual similarity to (but legal distinction\nfrom) the DMCA takedown. This paper introduces the first evaluation of the\nfeasibility and side effects of copyright takedowns for LMs. We propose\nCoTaEval, an evaluation framework to assess the effectiveness of copyright\ntakedown methods, the impact on the model's ability to retain uncopyrightable\nfactual knowledge from the training data whose recitation is embargoed, and how\nwell the model maintains its general utility and efficiency. 
We examine several\nstrategies, including adding system prompts, decoding-time filtering\ninterventions, and unlearning approaches. Our findings indicate that no tested\nmethod excels across all metrics, showing significant room for research in this\nunique problem setting and indicating potential unresolved challenges for live\npolicy proposals.\n","authors":["Boyi Wei","Weijia Shi","Yangsibo Huang","Noah A. Smith","Chiyuan Zhang","Luke Zettlemoyer","Kai Li","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2406.18664v2.pdf","comment":"31 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2402.05162v3","updated":"2024-07-01T07:11:17Z","published":"2024-02-07T18:34:38Z","title":"Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank\n Modifications","summary":" Large language models (LLMs) show inherent brittleness in their safety\nmechanisms, as evidenced by their susceptibility to jailbreaking and even\nnon-malicious fine-tuning. This study explores this brittleness of safety\nalignment by leveraging pruning and low-rank modifications. We develop methods\nto identify critical regions that are vital for safety guardrails, and that are\ndisentangled from utility-relevant regions at both the neuron and rank levels.\nSurprisingly, the isolated regions we find are sparse, comprising about $3\\%$\nat the parameter level and $2.5\\%$ at the rank level. Removing these regions\ncompromises safety without significantly impacting utility, corroborating the\ninherent brittleness of the model's safety mechanisms. Moreover, we show that\nLLMs remain vulnerable to low-cost fine-tuning attacks even when modifications\nto the safety-critical regions are restricted. These findings underscore the\nurgent need for more robust safety strategies in LLMs.\n","authors":["Boyi Wei","Kaixuan Huang","Yangsibo Huang","Tinghao Xie","Xiangyu Qi","Mengzhou Xia","Prateek Mittal","Mengdi Wang","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2402.05162v3.pdf","comment":"22 pages, 9 figures. Project page is available at\n https://boyiwei.com/alignment-attribution/"},{"id":"http://arxiv.org/abs/2406.17513v2","updated":"2024-07-01T06:48:34Z","published":"2024-06-25T12:51:06Z","title":"Benchmarking Mental State Representations in Language Models","summary":" While numerous works have assessed the generative performance of language\nmodels (LMs) on tasks requiring Theory of Mind reasoning, research into the\nmodels' internal representation of mental states remains limited. Recent work\nhas used probing to demonstrate that LMs can represent beliefs of themselves\nand others. However, these claims are accompanied by limited evaluation, making\nit difficult to assess how mental state representations are affected by model\ndesign and training choices. We report an extensive benchmark with various LM\ntypes with different model sizes, fine-tuning approaches, and prompt designs to\nstudy the robustness of mental state representations and memorisation issues\nwithin the probes. Our results show that the quality of models' internal\nrepresentations of the beliefs of others increases with model size and, more\ncrucially, with fine-tuning. We are the first to study how prompt variations\nimpact probing performance on theory of mind tasks. We demonstrate that models'\nrepresentations are sensitive to prompt variations, even when such variations\nshould be beneficial. 
Finally, we complement previous activation editing\nexperiments on Theory of Mind tasks and show that it is possible to improve\nmodels' reasoning performance by steering their activations without the need to\ntrain any probe.\n","authors":["Matteo Bortoletto","Constantin Ruhdorfer","Lei Shi","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2406.17513v2.pdf","comment":"ICML 2024 Workshop on Mechanistic Interpretability"},{"id":"http://arxiv.org/abs/2312.00738v2","updated":"2024-07-01T05:52:31Z","published":"2023-12-01T17:17:56Z","title":"SeaLLMs -- Large Language Models for Southeast Asia","summary":" Despite the remarkable achievements of large language models (LLMs) in\nvarious tasks, there remains a linguistic bias that favors high-resource\nlanguages, such as English, often at the expense of low-resource and regional\nlanguages. To address this imbalance, we introduce SeaLLMs, an innovative\nseries of language models that specifically focuses on Southeast Asian (SEA)\nlanguages. SeaLLMs are built upon the Llama-2 model and further advanced\nthrough continued pre-training with an extended vocabulary, specialized\ninstruction and alignment tuning to better capture the intricacies of regional\nlanguages. This allows them to respect and reflect local cultural norms,\ncustoms, stylistic preferences, and legal considerations. Our comprehensive\nevaluation demonstrates that SeaLLM-13b models exhibit superior performance\nacross a wide spectrum of linguistic tasks and assistant-style\ninstruction-following capabilities relative to comparable open-source models.\nMoreover, they outperform ChatGPT-3.5 in non-Latin languages, such as Thai,\nKhmer, Lao, and Burmese, by large margins while remaining lightweight and\ncost-effective to operate.\n","authors":["Xuan-Phi Nguyen","Wenxuan Zhang","Xin Li","Mahani Aljunied","Zhiqiang Hu","Chenhui Shen","Yew Ken Chia","Xingxuan Li","Jianyu Wang","Qingyu Tan","Liying Cheng","Guanzheng Chen","Yue Deng","Sen Yang","Chaoqun Liu","Hang Zhang","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2312.00738v2.pdf","comment":"Technical report, ACL 2024 DEMO TRACK"},{"id":"http://arxiv.org/abs/2406.18665v2","updated":"2024-07-01T05:38:08Z","published":"2024-06-26T18:10:22Z","title":"RouteLLM: Learning to Route LLMs with Preference Data","summary":" Large language models (LLMs) exhibit impressive capabilities across a wide\nrange of tasks, yet the choice of which model to use often involves a trade-off\nbetween performance and cost. More powerful models, though effective, come with\nhigher expenses, while less capable models are more cost-effective. To address\nthis dilemma, we propose several efficient router models that dynamically\nselect between a stronger and a weaker LLM during inference, aiming to optimize\nthe balance between cost and response quality. We develop a training framework\nfor these routers leveraging human preference data and data augmentation\ntechniques to enhance performance. Our evaluation on widely-recognized\nbenchmarks shows that our approach significantly reduces costs-by over 2 times\nin certain cases-without compromising the quality of responses. Interestingly,\nour router models also demonstrate significant transfer learning capabilities,\nmaintaining their performance even when the strong and weak models are changed\nat test time. 
This highlights the potential of these routers to provide a\ncost-effective yet high-performance solution for deploying LLMs.\n","authors":["Isaac Ong","Amjad Almahairi","Vincent Wu","Wei-Lin Chiang","Tianhao Wu","Joseph E. Gonzalez","M Waleed Kadous","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2406.18665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09296v3","updated":"2024-07-01T03:38:57Z","published":"2023-06-15T17:20:46Z","title":"KoLA: Carefully Benchmarking World Knowledge of Large Language Models","summary":" The unprecedented performance of large language models (LLMs) necessitates\nimprovements in evaluations. Rather than merely exploring the breadth of LLM\nabilities, we believe meticulous and thoughtful designs are essential to\nthorough, unbiased, and applicable evaluations. Given the importance of world\nknowledge to LLMs, we construct a Knowledge-oriented LLM Assessment benchmark\n(KoLA), in which we carefully design three crucial factors: (1) For\n\\textbf{ability modeling}, we mimic human cognition to form a four-level\ntaxonomy of knowledge-related abilities, covering $19$ tasks. (2) For\n\\textbf{data}, to ensure fair comparisons, we use both Wikipedia, a corpus\nprevalently pre-trained by LLMs, along with continuously collected emerging\ncorpora, aiming to evaluate the capacity to handle unseen data and evolving\nknowledge. (3) For \\textbf{evaluation criteria}, we adopt a contrastive system,\nincluding overall standard scores for better numerical comparability across\ntasks and models and a unique self-contrast metric for automatically evaluating\nknowledge-creating ability. We evaluate $28$ open-source and commercial LLMs\nand obtain some intriguing findings. The KoLA dataset and open-participation\nleaderboard are publicly released at https://kola.xlore.cn and will be\ncontinuously updated to provide references for developing LLMs and\nknowledge-related systems.\n","authors":["Jifan Yu","Xiaozhi Wang","Shangqing Tu","Shulin Cao","Daniel Zhang-Li","Xin Lv","Hao Peng","Zijun Yao","Xiaohan Zhang","Hanming Li","Chunyang Li","Zheyuan Zhang","Yushi Bai","Yantao Liu","Amy Xin","Nianyi Lin","Kaifeng Yun","Linlu Gong","Jianhui Chen","Zhili Wu","Yunjia Qi","Weikai Li","Yong Guan","Kaisheng Zeng","Ji Qi","Hailong Jin","Jinxin Liu","Yu Gu","Yuan Yao","Ning Ding","Lei Hou","Zhiyuan Liu","Bin Xu","Jie Tang","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2306.09296v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2311.07138v2","updated":"2024-07-01T03:17:42Z","published":"2023-11-13T08:09:01Z","title":"WaterBench: Towards Holistic Evaluation of Watermarks for Large Language\n Models","summary":" To mitigate the potential misuse of large language models (LLMs), recent\nresearch has developed watermarking algorithms, which restrict the generation\nprocess to leave an invisible trace for watermark detection. Due to the\ntwo-stage nature of the task, most studies evaluate the generation and\ndetection separately, thereby presenting a challenge in unbiased, thorough, and\napplicable evaluations. In this paper, we introduce WaterBench, the first\ncomprehensive benchmark for LLM watermarks, in which we design three crucial\nfactors: (1) For benchmarking procedure, to ensure an apples-to-apples\ncomparison, we first adjust each watermarking method's hyper-parameter to reach\nthe same watermarking strength, then jointly evaluate their generation and\ndetection performance. 
(2) For task selection, we diversify the input and\noutput length to form a five-category taxonomy, covering $9$ tasks. (3) For\nevaluation metric, we adopt the GPT4-Judge for automatically evaluating the\ndecline of instruction-following abilities after watermarking. We evaluate $4$\nopen-source watermarks on $2$ LLMs under $2$ watermarking strengths and observe\nthe common struggles for current methods on maintaining the generation quality.\nThe code and data are available at https://github.com/THU-KEG/WaterBench.\n","authors":["Shangqing Tu","Yuliang Sun","Yushi Bai","Jifan Yu","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2311.07138v2.pdf","comment":"26pages, 7 figures, accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2402.00367v2","updated":"2024-07-01T01:34:09Z","published":"2024-02-01T06:11:49Z","title":"Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM\n Collaboration","summary":" Despite efforts to expand the knowledge of large language models (LLMs),\nknowledge gaps -- missing or outdated information in LLMs -- might always\npersist given the evolving nature of knowledge. In this work, we study\napproaches to identify LLM knowledge gaps and abstain from answering questions\nwhen knowledge gaps are present. We first adapt existing approaches to model\ncalibration or adaptation through fine-tuning/prompting and analyze their\nability to abstain from generating low-confidence outputs. Motivated by their\nfailures in self-reflection and over-reliance on held-out sets, we propose two\nnovel approaches that are based on model collaboration, i.e., LLMs probing\nother LLMs for knowledge gaps, either cooperatively or competitively. Extensive\nexperiments with three LLMs on four QA tasks featuring diverse knowledge\ndomains demonstrate that both cooperative and competitive approaches to\nunveiling LLM knowledge gaps achieve up to 19.3% improvements on abstain\naccuracy against the strongest baseline. Further analysis reveals that our\nproposed mechanisms could help identify failure cases in retrieval augmentation\nand pinpoint knowledge gaps in multi-hop reasoning.\n","authors":["Shangbin Feng","Weijia Shi","Yike Wang","Wenxuan Ding","Vidhisha Balachandran","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2402.00367v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.01853v1","updated":"2024-07-01T23:47:09Z","published":"2024-07-01T23:47:09Z","title":"Improving Multilingual Instruction Finetuning via Linguistically Natural\n and Diverse Datasets","summary":" Advancements in Large Language Models (LLMs) have significantly enhanced\ninstruction-following capabilities. However, most Instruction Fine-Tuning (IFT)\ndatasets are predominantly in English, limiting model performance in other\nlanguages. Traditional methods for creating multilingual IFT datasets such as\ntranslating existing English IFT datasets or converting existing NLP datasets\ninto IFT datasets by templating, struggle to capture linguistic nuances and\nensure prompt (instruction) diversity. To address this issue, we propose a\nnovel method for collecting multilingual IFT datasets that preserves linguistic\nnaturalness and ensures prompt diversity. This approach leverages\nEnglish-focused LLMs, monolingual corpora, and a scoring function to create\nhigh-quality, diversified IFT datasets in multiple languages. 
Experiments\ndemonstrate that LLMs finetuned using these IFT datasets show notable\nimprovements in both generative and discriminative tasks, indicating enhanced\nlanguage comprehension by LLMs in non-English contexts. Specifically, on the\nmultilingual summarization task, LLMs using our IFT dataset achieved 17.57% and\n15.23% improvements over LLMs fine-tuned with translation-based and\ntemplate-based datasets, respectively.\n","authors":["Sathish Reddy Indurthi","Wenxuan Zhou","Shamil Chollampatt","Ravi Agrawal","Kaiqiang Song","Lingxiao Zhao","Chenguang Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.01853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01850v1","updated":"2024-07-01T23:25:30Z","published":"2024-07-01T23:25:30Z","title":"Purple-teaming LLMs with Adversarial Defender Training","summary":" Existing efforts in safeguarding LLMs are limited in actively exposing the\nvulnerabilities of the target LLM and readily adapting to newly emerging safety\nrisks. To address this, we present Purple-teaming LLMs with Adversarial\nDefender training (PAD), a pipeline designed to safeguard LLMs by novelly\nincorporating the red-teaming (attack) and blue-teaming (safety training)\ntechniques. In PAD, we automatically collect conversational data that cover the\nvulnerabilities of an LLM around specific safety risks in a self-play manner,\nwhere the attacker aims to elicit unsafe responses and the defender generates\nsafe responses to these attacks. We then update both modules in a generative\nadversarial network style by training the attacker to elicit more unsafe\nresponses and updating the defender to identify them and explain the unsafe\nreason. Experimental results demonstrate that PAD significantly outperforms\nexisting baselines in both finding effective attacks and establishing a robust\nsafe guardrail. Furthermore, our findings indicate that PAD excels in striking\na balance between safety and overall model quality. We also reveal key\nchallenges in safeguarding LLMs, including defending multi-turn attacks and the\nneed for more delicate strategies to identify specific risks.\n","authors":["Jingyan Zhou","Kun Li","Junan Li","Jiawen Kang","Minda Hu","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2407.01850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11277v2","updated":"2024-07-01T23:23:41Z","published":"2024-05-18T12:26:31Z","title":"Action Controlled Paraphrasing","summary":" Recent studies have demonstrated the potential to control paraphrase\ngeneration, such as through syntax, which has broad applications in various\ndownstream tasks. However, these methods often require detailed parse trees or\nsyntactic exemplars, countering human-like paraphrasing behavior in language\nuse. Furthermore, an inference gap exists, as control specifications are only\navailable during training but not during inference. In this work, we propose a\nnew setup for controlled paraphrase generation. Specifically, we represent user\nintent as action tokens, embedding and concatenating them with text embeddings,\nthus flowing together into a self-attention encoder for representation fusion.\nTo address the inference gap, we introduce an optional action token as a\nplaceholder that encourages the model to determine the appropriate action\nindependently when users' intended actions are not provided. 
Experimental\nresults show that our method successfully enables precise action-controlled\nparaphrasing and preserves or even enhances performance compared to\nconventional uncontrolled methods when actions are not given. Our findings\npromote the concept of action-controlled paraphrasing for a more user-centered\ndesign.\n","authors":["Ning Shi","Zijun Wu"],"pdf_url":"https://arxiv.org/pdf/2405.11277v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2310.11324v2","updated":"2024-07-01T22:28:01Z","published":"2023-10-17T15:03:30Z","title":"Quantifying Language Models' Sensitivity to Spurious Features in Prompt\n Design or: How I learned to start worrying about prompt formatting","summary":" As large language models (LLMs) are adopted as a fundamental component of\nlanguage technologies, it is crucial to accurately characterize their\nperformance. Because choices in prompt design can strongly influence model\nbehavior, this design process is critical in effectively using any modern\npre-trained generative language model. In this work, we focus on LLM\nsensitivity to a quintessential class of meaning-preserving design choices:\nprompt formatting. We find that several widely used open-source LLMs are\nextremely sensitive to subtle changes in prompt formatting in few-shot\nsettings, with performance differences of up to 76 accuracy points when\nevaluated using LLaMA-2-13B. Sensitivity remains even when increasing model\nsize, the number of few-shot examples, or performing instruction tuning. Our\nanalysis suggests that work evaluating LLMs with prompting-based methods would\nbenefit from reporting a range of performance across plausible prompt formats,\ninstead of the currently-standard practice of reporting performance on a single\nformat. We also show that format performance only weakly correlates between\nmodels, which puts into question the methodological validity of comparing\nmodels with an arbitrarily chosen, fixed prompt format. To facilitate\nsystematic analysis we propose FormatSpread, an algorithm that rapidly\nevaluates a sampled set of plausible prompt formats for a given task, and\nreports the interval of expected performance without accessing model weights.\nFurthermore, we present a suite of analyses that characterize the nature of\nthis sensitivity, including exploring the influence of particular atomic\nperturbations and the internal representation of particular formats.\n","authors":["Melanie Sclar","Yejin Choi","Yulia Tsvetkov","Alane Suhr"],"pdf_url":"https://arxiv.org/pdf/2310.11324v2.pdf","comment":"ICLR 2024 Camera Ready version. With respect to the original\n submission, we added text generation experiments, plots of entire accuracy\n distributions for each task + stdev computations, and prompt length\n correlation with spread analysis"},{"id":"http://arxiv.org/abs/2407.01834v1","updated":"2024-07-01T22:17:17Z","published":"2024-07-01T22:17:17Z","title":"A Study of Nationality Bias in Names and Perplexity using Off-the-Shelf\n Affect-related Tweet Classifiers","summary":" In this paper, we apply a method to quantify biases associated with named\nentities from various countries. We create counterfactual examples with small\nperturbations on target-domain data instead of relying on templates or specific\ndatasets for bias detection. 
On widely used classifiers for subjectivity\nanalysis, including sentiment, emotion, hate speech, and offensive text using\nTwitter data, our results demonstrate positive biases related to the language\nspoken in a country across all classifiers studied. Notably, the presence of\ncertain country names in a sentence can strongly influence predictions, up to a\n23\% change in hate speech detection and up to a 60\% change in the prediction\nof negative emotions such as anger. We hypothesize that these biases stem from\nthe training data of pre-trained language models (PLMs) and find correlations\nbetween affect predictions and PLMs' likelihood in English and unknown languages\nlike Basque and Maori, revealing distinct patterns with exacerbated\ncorrelations. Further, we followed these correlations between counterfactual\nexamples from the same sentence to remove the syntactical component, uncovering\ninteresting results suggesting the impact of the pre-training data was more\nimportant for English-speaking-country names. Our anonymized code is\n[available here](https://anonymous.4open.science/r/biases_ppl-576B/README.md).\n","authors":["Valentin Barriere","Sebastian Cifuentes"],"pdf_url":"https://arxiv.org/pdf/2407.01834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14373v2","updated":"2024-07-01T22:06:13Z","published":"2024-06-20T14:42:58Z","title":"Artificial Leviathan: Exploring Social Evolution of LLM Agents Through\n the Lens of Hobbesian Social Contract Theory","summary":" The emergence of Large Language Models (LLMs) and advancements in Artificial\nIntelligence (AI) offer an opportunity for computational social science\nresearch at scale. Building upon prior explorations of LLM agent design, our\nwork introduces a simulated agent society where complex social relationships\ndynamically form and evolve over time. Agents are imbued with psychological\ndrives and placed in a sandbox survival environment. We conduct an evaluation\nof the agent society through the lens of Thomas Hobbes's seminal Social\nContract Theory (SCT). We analyze whether, as the theory postulates, agents\nseek to escape a brutish \"state of nature\" by surrendering rights to an\nabsolute sovereign in exchange for order and security. Our experiments unveil\nan alignment: Initially, agents engage in unrestrained conflict, mirroring\nHobbes's depiction of the state of nature. However, as the simulation\nprogresses, social contracts emerge, leading to the authorization of an\nabsolute sovereign and the establishment of a peaceful commonwealth founded on\nmutual cooperation. This congruence between our LLM agent society's\nevolutionary trajectory and Hobbes's theoretical account indicates LLMs'\ncapability to model intricate social dynamics and potentially replicate forces\nthat shape human societies. 
By enabling such insights into group behavior and\nemergent societal phenomena, LLM-driven multi-agent simulations, while unable\nto simulate all the nuances of human behavior, may hold potential for advancing\nour understanding of social structures, group dynamics, and complex human\nsystems.\n","authors":["Gordon Dai","Weijia Zhang","Jinhan Li","Siqi Yang","Chidera Onochie lbe","Srihas Rao","Arthur Caetano","Misha Sra"],"pdf_url":"https://arxiv.org/pdf/2406.14373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15176v2","updated":"2024-07-01T22:06:00Z","published":"2023-09-26T18:19:51Z","title":"Robust Stance Detection: Understanding Public Perceptions in Social\n Media","summary":" The abundance of social media data has presented opportunities for accurately\ndetermining public and group-specific stances around policy proposals or\ncontroversial topics. In contrast with sentiment analysis which focuses on\nidentifying prevailing emotions, stance detection identifies precise positions\n(i.e., supportive, opposing, neutral) relative to a well-defined topic, such as\nperceptions toward specific global health interventions during the COVID-19\npandemic. Traditional stance detection models, while effective within their\nspecific domain (e.g., attitudes towards masking protocols during COVID-19),\noften lag in performance when applied to new domains and topics due to changes\nin data distribution. This limitation is compounded by the scarcity of\ndomain-specific, labeled datasets, which are expensive and labor-intensive to\ncreate. A solution we present in this paper combines counterfactual data\naugmentation with contrastive learning to enhance the robustness of stance\ndetection across domains and topics of interest. We evaluate the performance of\ncurrent state-of-the-art stance detection models, including a prompt-optimized\nlarge language model, relative to our proposed framework succinctly called\nSTANCE-C3 (domain-adaptive Cross-target STANCE detection via Contrastive\nlearning and Counterfactual generation). Empirical evaluations demonstrate\nSTANCE-C3's consistent improvements over the baseline models with respect to\naccuracy across domains and varying focal topics. Despite the increasing\nprevalence of general-purpose models such as generative AI, specialized models\nsuch as STANCE-C3 provide utility in safety-critical domains wherein precision\nis highly valued, especially when a nuanced understanding of the concerns of\ndifferent population segments could result in crafting more impactful public\npolicies.\n","authors":["Nayoung Kim","David Mosallanezhad","Lu Cheng","Michelle V. Mancenido","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2309.15176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01824v1","updated":"2024-07-01T21:46:30Z","published":"2024-07-01T21:46:30Z","title":"Empathic Grounding: Explorations using Multimodal Interaction and Large\n Language Models with Conversational Agents","summary":" We introduce the concept of \"empathic grounding\" in conversational agents as\nan extension of Clark's conceptualization of grounding in conversation in which\nthe grounding criterion includes listener empathy for the speaker's affective\nstate. Empathic grounding is generally required whenever the speaker's emotions\nare foregrounded and can make the grounding process more efficient and reliable\nby communicating both propositional and affective understanding. 
Both speaker\nexpressions of affect and listener empathic grounding can be multimodal,\nincluding facial expressions and other nonverbal displays. Thus, models of\nempathic grounding for embodied agents should be multimodal to facilitate\nnatural and efficient communication. We describe a multimodal model that takes\nas input user speech and facial expression to generate multimodal grounding\nmoves for a listening agent using a large language model. We also describe a\ntestbed to evaluate approaches to empathic grounding, in which a humanoid robot\ninterviews a user about a past episode of pain and then has the user rate their\nperception of the robot's empathy. We compare our proposed model to one that\nonly generates non-affective grounding cues in a between-subjects experiment.\nFindings demonstrate that empathic grounding increases user perceptions of\nempathy, understanding, emotional intelligence, and trust. Our work highlights\nthe role of emotion awareness and multimodality in generating appropriate\ngrounding moves for conversational agents.\n","authors":["Mehdi Arjmand","Farnaz Nouraei","Ian Steenstra","Timothy Bickmore"],"pdf_url":"https://arxiv.org/pdf/2407.01824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07237v3","updated":"2024-07-01T21:43:56Z","published":"2024-01-14T09:34:42Z","title":"Distilling Event Sequence Knowledge From Large Language Models","summary":" Event sequence models have been found to be highly effective in the analysis\nand prediction of events. Building such models requires availability of\nabundant high-quality event sequence data. In certain applications, however,\nclean structured event sequences are not available, and automated sequence\nextraction results in data that is too noisy and incomplete. In this work, we\nexplore the use of Large Language Models (LLMs) to generate event sequences\nthat can effectively be used for probabilistic event model construction. This\ncan be viewed as a mechanism of distilling event sequence knowledge from LLMs.\nOur approach relies on a Knowledge Graph (KG) of event concepts with partial\ncausal relations to guide the generative language model for causal event\nsequence generation. We show that our approach can generate high-quality event\nsequences, filling a knowledge gap in the input KG. Furthermore, we explore how\nthe generated sequences can be leveraged to discover useful and more complex\nstructured knowledge from pattern mining and probabilistic event models. We\nrelease our sequence generation code and evaluation framework, as well as\ncorpus of event sequence data.\n","authors":["Somin Wadhwa","Oktie Hassanzadeh","Debarun Bhattacharjya","Ken Barker","Jian Ni"],"pdf_url":"https://arxiv.org/pdf/2401.07237v3.pdf","comment":"In Proceedings of 23rd International Semantic Web Conference (ISWC),\n 2024"},{"id":"http://arxiv.org/abs/2407.01817v1","updated":"2024-07-01T21:34:51Z","published":"2024-07-01T21:34:51Z","title":"Race and Privacy in Broadcast Police Communications","summary":" Radios are essential for the operations of modern police departments, and\nthey function as both a collaborative communication technology and a\nsociotechnical system. However, little prior research has examined their usage\nor their connections to individual privacy and the role of race in policing,\ntwo growing topics of concern in the US. 
As a case study, we examine the\nChicago Police Department's (CPD's) use of broadcast police communications\n(BPC) to coordinate the activity of law enforcement officers (LEOs) in the\ncity. From a recently assembled archive of 80,775 hours of BPC associated with\nCPD operations, we analyze text transcripts of radio transmissions broadcast\n9:00 AM to 5:00 PM on August 10th, 2018 in one majority Black, one majority\nwhite, and one majority Hispanic area of the city (24 hours of audio) to\nexplore four research questions: (1) Do BPC reflect reported racial\ndisparities in policing? (2) How and when are gender, race/ethnicity, and age\nmentioned in BPC? (3) To what extent do BPC include sensitive information, and\nwho is put at most risk by this practice? (4) To what extent can large language\nmodels (LLMs) heighten this risk? We explore the vocabulary and speech acts\nused by police in BPC, comparing mentions of personal characteristics to local\ndemographics, the personal information shared over BPC, and the privacy\nconcerns that it poses. Analysis indicates (a) policing professionals in the\ncity of Chicago exhibit disproportionate attention to Black members of the\npublic regardless of context, (b) sociodemographic characteristics like gender,\nrace/ethnicity, and age are primarily mentioned in BPC about event information,\nand (c) disproportionate attention introduces disproportionate privacy risks\nfor Black members of the public.\n","authors":["Pranav Narayanan Venkit","Christopher Graziul","Miranda Ardith Goodman","Samantha Nicole Kenny","Shomir Wilson"],"pdf_url":"https://arxiv.org/pdf/2407.01817v1.pdf","comment":"Accepted in the 27th ACM Conference on Computer-Supported Cooperative\n Work and Social Computing (CSCW '24)"},{"id":"http://arxiv.org/abs/2311.09319v2","updated":"2024-07-01T21:08:18Z","published":"2023-11-15T19:25:29Z","title":"Spoken Word2Vec: Learning Skipgram Embeddings from Speech","summary":" Text word embeddings that encode distributional semantics work by modeling\ncontextual similarities of frequently occurring words. Acoustic word\nembeddings, on the other hand, typically encode low-level phonetic\nsimilarities. Semantic embeddings for spoken words have been previously\nexplored using analogous algorithms to Word2Vec, but the resulting vectors\nstill mainly encoded phonetic rather than semantic features. In this paper, we\nexamine the assumptions and architectures used in previous works and show\nexperimentally how shallow skipgram-like algorithms fail to encode\ndistributional semantics when the input units are acoustically correlated. We\nillustrate the potential of an alternative deep end-to-end variant of the model\nand examine the effects on the resulting embeddings, showing positive results\nof semantic relatedness in the embedding space.\n","authors":["Mohammad Amaan Sayeed","Hanan Aldarmaki"],"pdf_url":"https://arxiv.org/pdf/2311.09319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01796v1","updated":"2024-07-01T20:47:47Z","published":"2024-07-01T20:47:47Z","title":"Ground Every Sentence: Improving Retrieval-Augmented LLMs with\n Interleaved Reference-Claim Generation","summary":" Retrieval-Augmented Generation (RAG) has been widely adopted to enhance Large\nLanguage Models (LLMs) in knowledge-intensive tasks. Recently, Attributed Text\nGeneration (ATG) has attracted growing attention, which provides citations to\nsupport the model's responses in RAG, so as to enhance the credibility of\nLLM-generated content and facilitate verification. 
Prior methods mainly adopt\ncoarse-grained attributions, linking to passage-level references or providing\nparagraph-level citations. However, these methods still fall short in\nverifiability and require certain time costs for fact checking. This paper\nproposes a fine-grained ATG method called ReClaim(Refer & Claim), which\nalternates the generation of references and answers step by step. Unlike\ntraditional coarse-grained attribution, ReClaim allows the model to add\nsentence-level fine-grained citations to each answer sentence in long-form\nquestion-answering tasks. Our experiments encompass various training and\ninference methods and multiple LLMs, verifying the effectiveness of our\napproach.\n","authors":["Sirui Xia","Xintao Wang","Jiaqing Liang","Yifei Zhang","Weikang Zhou","Jiaji Deng","Fei Yu","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.01796v1.pdf","comment":"15 pages,2 figures"},{"id":"http://arxiv.org/abs/2407.01784v1","updated":"2024-07-01T20:25:20Z","published":"2024-07-01T20:25:20Z","title":"Analyzing Persuasive Strategies in Meme Texts: A Fusion of Language\n Models with Paraphrase Enrichment","summary":" This paper describes our approach to hierarchical multi-label detection of\npersuasion techniques in meme texts. Our model, developed as a part of the\nrecent SemEval task, is based on fine-tuning individual language models (BERT,\nXLM-RoBERTa, and mBERT) and leveraging a mean-based ensemble model in addition\nto dataset augmentation through paraphrase generation from ChatGPT. The scope\nof the study encompasses enhancing model performance through innovative\ntraining techniques and data augmentation strategies. The problem addressed is\nthe effective identification and classification of multiple persuasive\ntechniques in meme texts, a task complicated by the diversity and complexity of\nsuch content. The objective of the paper is to improve detection accuracy by\nrefining model training methods and examining the impact of balanced versus\nunbalanced training datasets. Novelty in the results and discussion lies in the\nfinding that training with paraphrases enhances model performance, yet a\nbalanced training set proves more advantageous than a larger unbalanced one.\nAdditionally, the analysis reveals the potential pitfalls of indiscriminate\nincorporation of paraphrases from diverse distributions, which can introduce\nsubstantial noise. Results with the SemEval 2024 data confirm these insights,\ndemonstrating improved model efficacy with the proposed methods.\n","authors":["Kota Shamanth Ramanath Nayak","Leila Kosseim"],"pdf_url":"https://arxiv.org/pdf/2407.01784v1.pdf","comment":"15 pages, 8 figures, 1 table, Proceedings of 5th International\n Conference on Natural Language Processing and Applications (NLPA 2024)"},{"id":"http://arxiv.org/abs/2406.14307v2","updated":"2024-07-01T19:38:03Z","published":"2024-06-20T13:37:10Z","title":"QuST-LLM: Integrating Large Language Models for Comprehensive Spatial\n Transcriptomics Analysis","summary":" In this paper, we introduce QuST-LLM, an innovative extension of QuPath that\nutilizes the capabilities of large language models (LLMs) to analyze and\ninterpret spatial transcriptomics (ST) data. 
In addition to simplifying the\nintricate and high-dimensional nature of ST data by offering a comprehensive\nworkflow that includes data loading, region selection, gene expression\nanalysis, and functional annotation, QuST-LLM employs LLMs to transform complex\nST data into understandable and detailed biological narratives based on gene\nontology annotations, thereby significantly improving the interpretability of\nST data. Consequently, users can interact with their own ST data using natural\nlanguage. Hence, QuST-LLM provides researchers with a potent functionality to\nunravel the spatial and functional complexities of tissues, fostering novel\ninsights and advancements in biomedical research. QuST-LLM is a part of the QuST\nproject. The source code is hosted on GitHub and documentation is available at\n(https://github.com/huangch/qust).\n","authors":["Chao Hui Huang"],"pdf_url":"https://arxiv.org/pdf/2406.14307v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.02765v2","updated":"2024-07-01T19:20:58Z","published":"2024-05-04T22:02:24Z","title":"Detecting Edited Knowledge in Language Models","summary":" Knowledge editing methods (KEs) can update language models' obsolete or\ninaccurate knowledge learned from pre-training. However, KEs can be used for\nmalicious applications, e.g., inserting misinformation and toxic content.\nKnowing whether a generated output is based on edited knowledge or first-hand\nknowledge from pre-training can increase users' trust in generative models and\nprovide more transparency. Driven by this, we propose a novel task: detecting\nedited knowledge in language models. Given an edited model and a fact retrieved\nby a prompt from an edited model, the objective is to classify the knowledge as\neither unedited (based on the pre-training), or edited (based on subsequent\nediting). We instantiate the task with four KEs, two LLMs, and two datasets.\nAdditionally, we propose using the hidden state representations and the\nprobability distributions as features for the detection. Our results reveal\nthat using these features as inputs to a simple AdaBoost classifier\nestablishes a strong baseline. This classifier requires only a limited amount\nof data and maintains its performance even in cross-domain settings. Last, we\nfind it more challenging to distinguish edited knowledge from unedited but\nrelated knowledge, highlighting the need for further research. Our work lays\nthe groundwork for addressing malicious model editing, which is a critical\nchallenge associated with the strong generative capabilities of LLMs.\n","authors":["Paul Youssef","Zhixue Zhao","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2405.02765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01725v1","updated":"2024-07-01T18:58:22Z","published":"2024-07-01T18:58:22Z","title":"DiscoveryBench: Towards Data-Driven Discovery with Large Language Models","summary":" Can the rapid advances in code generation, function calling, and data\nanalysis using large language models (LLMs) help automate the search and\nverification of hypotheses purely from a set of provided datasets? To evaluate\nthis question, we present DiscoveryBench, the first comprehensive benchmark\nthat formalizes the multi-step process of data-driven discovery. The benchmark\nis designed to systematically assess current model capabilities in discovery\ntasks and provide a useful resource for improving them. 
Our benchmark contains\n264 tasks collected across 6 diverse domains, such as sociology and\nengineering, by manually deriving discovery workflows from published papers to\napproximate the real-world challenges faced by researchers, where each task is\ndefined by a dataset, its metadata, and a discovery goal in natural language.\nWe additionally provide 903 synthetic tasks to conduct controlled evaluations\nacross task complexity. Furthermore, our structured formalism of data-driven\ndiscovery enables a facet-based evaluation that provides useful insights into\ndifferent failure modes. We evaluate several popular LLM-based reasoning\nframeworks using both open and closed LLMs as baselines on DiscoveryBench and\nfind that even the best system scores only 25%. Our benchmark, thus,\nillustrates the challenges in autonomous data-driven discovery and serves as a\nvaluable resource for the community to make progress.\n","authors":["Bodhisattwa Prasad Majumder","Harshit Surana","Dhruv Agarwal","Bhavana Dalvi Mishra","Abhijeetsingh Meena","Aryan Prakhar","Tirth Vora","Tushar Khot","Ashish Sabharwal","Peter Clark"],"pdf_url":"https://arxiv.org/pdf/2407.01725v1.pdf","comment":"Website: https://github.com/allenai/discoverybench"},{"id":"http://arxiv.org/abs/2406.00053v2","updated":"2024-07-01T18:23:43Z","published":"2024-05-28T21:38:20Z","title":"Dual Process Learning: Controlling Use of In-Context vs. In-Weights\n Strategies with Weight Forgetting","summary":" Language models have the ability to perform in-context learning (ICL),\nallowing them to flexibly adapt their behavior based on context. This contrasts\nwith in-weights learning, where information is statically encoded in model\nparameters from iterated observations of the data. Despite this apparent\nability to learn in-context, language models are known to struggle when faced\nwith unseen or rarely seen tokens. Hence, we study $\\textbf{structural\nin-context learning}$, which we define as the ability of a model to execute\nin-context learning on arbitrary tokens -- so called because the model must\ngeneralize on the basis of e.g. sentence structure or task structure, rather\nthan semantic content encoded in token embeddings. An ideal model would be able\nto do both: flexibly deploy in-weights operations (in order to robustly\naccommodate ambiguous or unknown contexts using encoded semantic information)\nand structural in-context operations (in order to accommodate novel tokens). We\nstudy structural in-context algorithms in a simple part-of-speech setting using\nboth practical and toy models. We find that active forgetting, a technique that\nwas recently introduced to help models generalize to new languages, forces\nmodels to adopt structural in-context learning solutions. Finally, we introduce\n$\\textbf{temporary forgetting}$, a straightforward extension of active\nforgetting that enables one to control how much a model relies on in-weights\nvs. in-context solutions. Importantly, temporary forgetting allows us to induce\na $\\textit{dual process strategy}$ where in-context and in-weights solutions\ncoexist within a single model.\n","authors":["Suraj Anand","Michael A. 
Lepori","Jack Merullo","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2406.00053v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03633v3","updated":"2024-07-01T18:13:24Z","published":"2023-12-06T17:29:45Z","title":"Exploring the Reversal Curse and Other Deductive Logical Reasoning in\n BERT and GPT-Based Large Language Models","summary":" The term \"Reversal Curse\" refers to the scenario where auto-regressive\ndecoder large language models (LLMs), such as ChatGPT, trained on \"A is B\" fail\nto learn \"B is A,\" assuming that B and A are distinct and can be uniquely\nidentified from each other, demonstrating a basic failure of logical deduction.\nThis raises a red flag in the use of GPT models for certain general tasks such\nas constructing knowledge graphs, considering their adherence to this symmetric\nprinciple. In our study, we examined a bidirectional LLM, BERT, and found that\nit is immune to the reversal curse. Driven by ongoing efforts to construct\nbiomedical knowledge graphs with LLMs, we also embarked on evaluating more\ncomplex but essential deductive reasoning capabilities. This process included\nfirst training encoder and decoder language models to master the intersection\nand union operations on two sets and then moving on to assess their capability\nto infer different combinations of union and intersection operations on three\nnewly created sets. The findings showed that while both encoder and decoder\nlanguage models, trained for tasks involving two sets (union/intersection),\nwere proficient in such scenarios, they encountered difficulties when dealing\nwith operations that included three sets (various combinations of union and\nintersection). Our research highlights the distinct characteristics of encoder\nand decoder models in simple and complex logical reasoning. In practice, the\nchoice between BERT and GPT should be guided by the specific requirements and\nnature of the task at hand, leveraging their respective strengths in\nbidirectional context comprehension and sequence prediction.\n","authors":["Da Wu","Jingye Yang","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03633v3.pdf","comment":"Final revision. To appear in Patterns"},{"id":"http://arxiv.org/abs/2407.01697v1","updated":"2024-07-01T18:08:17Z","published":"2024-07-01T18:08:17Z","title":"NLPGuard: A Framework for Mitigating the Use of Protected Attributes by\n NLP Classifiers","summary":" AI regulations are expected to prohibit machine learning models from using\nsensitive attributes during training. However, the latest Natural Language\nProcessing (NLP) classifiers, which rely on deep learning, operate as black-box\nsystems, complicating the detection and remediation of such misuse. Traditional\nbias mitigation methods in NLP aim for comparable performance across different\ngroups based on attributes like gender or race but fail to address the\nunderlying issue of reliance on protected attributes. To partly fix that, we\nintroduce NLPGuard, a framework for mitigating the reliance on protected\nattributes in NLP classifiers. NLPGuard takes an unlabeled dataset, an existing\nNLP classifier, and its training data as input, producing a modified training\ndataset that significantly reduces dependence on protected attributes without\ncompromising accuracy. 
NLPGuard is applied to three classification tasks:\nidentifying toxic language, sentiment analysis, and occupation classification.\nOur evaluation shows that current NLP classifiers heavily depend on protected\nattributes, with up to $23\\%$ of the most predictive words associated with\nthese attributes. However, NLPGuard effectively reduces this reliance by up to\n$79\\%$, while slightly improving accuracy.\n","authors":["Salvatore Greco","Ke Zhou","Licia Capra","Tania Cerquitelli","Daniele Quercia"],"pdf_url":"https://arxiv.org/pdf/2407.01697v1.pdf","comment":"Paper accepted at CSCW 2024"},{"id":"http://arxiv.org/abs/2407.01687v1","updated":"2024-07-01T18:01:07Z","published":"2024-07-01T18:01:07Z","title":"Deciphering the Factors Influencing the Efficacy of Chain-of-Thought:\n Probability, Memorization, and Noisy Reasoning","summary":" Chain-of-Thought (CoT) prompting has been shown to enhance the multi-step\nreasoning capabilities of Large Language Models (LLMs). However, debates\npersist about whether LLMs exhibit abstract generalization or rely on shallow\nheuristics when given CoT prompts. To understand the factors influencing CoT\nreasoning we provide a detailed case study of the symbolic reasoning task of\ndecoding shift ciphers, where letters are shifted forward some number of steps\nin the alphabet. GPT-4 achieves zero accuracy on most shift ciphers with\nstandard prompting, but with CoT its accuracy improves to an average of 32%. By\nfocusing on a single relatively simple task, we are able to identify three\nfactors that systematically affect CoT performance: the probability of the\ntask's expected output (probability), what the model has implicitly learned\nduring pre-training (memorization), and the number of intermediate operations\ninvolved in reasoning (noisy reasoning). We show that these factors can\ndrastically influence the task accuracy; e.g., varying the output's probability\nof occurrence can shift accuracy from 26% to 70%. We also demonstrate that it\nis essential for the model to explicitly produce intermediate steps as output\nthat can be conditioned on to increase the probability of the correct answer.\nOur experiments indicate that as long as the model does so, the validity of the\ndemonstrations in the prompt does not matter. Overall, we conclude that CoT\nprompting performance reflects both memorization and a probabilistic version of\ngenuine reasoning.\n","authors":["Akshara Prabhakar","Thomas L. Griffiths","R. Thomas McCoy"],"pdf_url":"https://arxiv.org/pdf/2407.01687v1.pdf","comment":"9 pages plus references and appendices"},{"id":"http://arxiv.org/abs/2407.01527v1","updated":"2024-07-01T17:59:47Z","published":"2024-07-01T17:59:47Z","title":"KV Cache Compression, But What Must We Give in Return? A Comprehensive\n Benchmark of Long Context Capable Approaches","summary":" Long context capability is a crucial competency for large language models\n(LLMs) as it mitigates the human struggle to digest long-form texts. 
This\ncapability enables complex task-solving scenarios such as book summarization,\ncode assistance, and many more tasks that are traditionally manpower-intensive.\nHowever, transformer-based LLMs face significant challenges with long context\ninput due to the growing size of the KV cache and the intrinsic complexity of\nattending to extended inputs; where multiple schools of efficiency-driven\napproaches -- such as KV cache quantization, token dropping, prompt\ncompression, linear-time sequence models, and hybrid architectures -- have been\nproposed to produce efficient yet long context-capable models. Despite these\nadvancements, no existing work has comprehensively benchmarked these methods in\na reasonably aligned environment. In this work, we fill this gap by providing a\ntaxonomy of current methods and evaluating 10+ state-of-the-art approaches\nacross seven categories of long context tasks. Our work reveals numerous\npreviously unknown phenomena and offers insights -- as well as a friendly\nworkbench -- for the future development of long context-capable LLMs. The\nsource code will be available at https://github.com/henryzhongsc/longctx_bench\n","authors":["Jiayi Yuan","Hongyi Liu"," Shaochen"," Zhong","Yu-Neng Chuang","Songchen Li","Guanchu Wang","Duy Le","Hongye Jin","Vipin Chaudhary","Zhaozhuo Xu","Zirui Liu","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2407.01527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01523v1","updated":"2024-07-01T17:59:26Z","published":"2024-07-01T17:59:26Z","title":"MMLongBench-Doc: Benchmarking Long-context Document Understanding with\n Visualizations","summary":" Understanding documents with rich layouts and multi-modal components is a\nlong-standing and practical task. Recent Large Vision-Language Models (LVLMs)\nhave made remarkable strides in various tasks, particularly in single-page\ndocument understanding (DU). However, their abilities on long-context DU remain\nan open problem. This work presents MMLongBench-Doc, a long-context,\nmulti-modal benchmark comprising 1,062 expert-annotated questions. Distinct\nfrom previous datasets, it is constructed upon 130 lengthy PDF-formatted\ndocuments with an average of 49.4 pages and 20,971 textual tokens. Towards\ncomprehensive evaluation, answers to these questions rely on pieces of evidence\nfrom (1) different sources (text, image, chart, table, and layout structure)\nand (2) various locations (i.e. page number). Moreover, 33.2% of the questions\nare cross-page questions requiring evidence across multiple pages. 22.8% of the\nquestions are designed to be unanswerable for detecting potential\nhallucinations. Experiments on 14 LVLMs demonstrate that long-context DU\ngreatly challenges current models. Notably, the best-performing model, GPT-4o,\nachieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores\n31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse\nperformance than their LLM counterparts which are fed with lossy-parsed OCR\ndocuments. These results validate the necessity of future research toward more\ncapable long-context LVLMs. 
Project Page:\nhttps://mayubo2333.github.io/MMLongBench-Doc\n","authors":["Yubo Ma","Yuhang Zang","Liangyu Chen","Meiqi Chen","Yizhu Jiao","Xinze Li","Xinyuan Lu","Ziyu Liu","Yan Ma","Xiaoyi Dong","Pan Zhang","Liangming Pan","Yu-Gang Jiang","Jiaqi Wang","Yixin Cao","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01523v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.01318v2","updated":"2024-07-01T17:58:21Z","published":"2023-07-31T16:04:17Z","title":"Framing image registration as a landmark detection problem for\n label-noise-aware task representation (HitR)","summary":" Accurate image registration is pivotal in biomedical image analysis, where\nselecting suitable registration algorithms demands careful consideration. While\nnumerous algorithms are available, the evaluation metrics to assess their\nperformance have remained relatively static. This study addresses this\nchallenge by introducing a novel evaluation metric termed Landmark Hit Rate\n(HitR), which focuses on the clinical relevance of image registration accuracy.\nUnlike traditional metrics such as Target Registration Error, which emphasize\nsubresolution differences, HitR considers whether registration algorithms\nsuccessfully position landmarks within defined confidence zones. This paradigm\nshift acknowledges the inherent annotation noise in medical images, allowing\nfor more meaningful assessments. To equip HitR with label-noise-awareness, we\npropose defining these confidence zones based on an Inter-rater Variance\nanalysis. Consequently, hit rate curves are computed for varying landmark zone\nsizes, enabling performance measurement for a task-specific level of accuracy.\nOur approach offers a more realistic and meaningful assessment of image\nregistration algorithms, reflecting their suitability for clinical and\nbiomedical applications.\n","authors":["Diana Waldmannstetter","Ivan Ezhov","Benedikt Wiestler","Francesco Campi","Ivan Kukuljan","Stefan Ehrlich","Shankeeth Vinayahalingam","Bhakti Baheti","Satrajit Chakrabarty","Ujjwal Baid","Spyridon Bakas","Julian Schwarting","Marie Metz","Jan S. Kirschke","Daniel Rueckert","Rolf A. Heckemann","Marie Piraud","Bjoern H. Menze","Florian Kofler"],"pdf_url":"https://arxiv.org/pdf/2308.01318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09233v3","updated":"2024-07-01T17:48:16Z","published":"2023-07-18T13:10:11Z","title":"Distilling Knowledge from Text-to-Image Generative Models Improves\n Visio-Linguistic Reasoning in CLIP","summary":" Image-text contrastive models like CLIP have wide applications in zero-shot\nclassification, image-text retrieval, and transfer learning. However, they\noften struggle on compositional visio-linguistic tasks (e.g., attribute-binding\nor object-relationships) where their performance is no better than random\nchance. To address this, we introduce SDS-CLIP, a lightweight and\nsample-efficient distillation method to enhance CLIP's compositional\nvisio-linguistic reasoning. Our approach fine-tunes CLIP using a distillation\nobjective borrowed from large text-to-image generative models like\nStable-Diffusion, which are known for their strong visio-linguistic reasoning\nabilities. On the challenging Winoground benchmark, SDS-CLIP improves the\nvisio-linguistic performance of various CLIP models by up to 7%, while on the\nARO dataset, it boosts performance by up to 3%. 
This work underscores the\npotential of well-designed distillation objectives from generative models to\nenhance contrastive image-text models with improved visio-linguistic reasoning\ncapabilities.\n","authors":["Samyadeep Basu","Shell Xu Hu","Maziar Sanjabi","Daniela Massiceti","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2307.09233v3.pdf","comment":"Short paper"},{"id":"http://arxiv.org/abs/2308.13320v3","updated":"2024-07-01T17:14:27Z","published":"2023-08-25T11:49:51Z","title":"Fine-tuning can cripple your foundation model; preserving features may\n be the solution","summary":" Pre-trained foundation models, due to their enormous capacity and exposure to\nvast amounts of data during pre-training, are known to have learned plenty of\nreal-world concepts. An important step in making these pre-trained models\neffective on downstream tasks is to fine-tune them on related datasets. While\nvarious fine-tuning methods have been devised and have been shown to be highly\neffective, we observe that a fine-tuned model's ability to recognize concepts\non tasks $\\textit{different}$ from the downstream one is reduced significantly\ncompared to its pre-trained counterpart. This is an undesirable effect of\nfine-tuning as a substantial amount of resources was used to learn these\npre-trained concepts in the first place. We call this phenomenon ''concept\nforgetting'' and via experiments show that most end-to-end fine-tuning\napproaches suffer heavily from this side effect. To this end, we propose a\nsimple fix to this problem by designing a new fine-tuning method called\n$\\textit{LDIFS}$ (short for $\\ell_2$ distance in feature space) that, while\nlearning new concepts related to the downstream task, allows a model to\npreserve its pre-trained knowledge as well. Through extensive experiments on 10\nfine-tuning tasks we show that $\\textit{LDIFS}$ significantly reduces concept\nforgetting. Additionally, we show that LDIFS is highly effective in performing\ncontinual fine-tuning on a sequence of tasks as well, in comparison with both\nfine-tuning as well as continual learning baselines.\n","authors":["Jishnu Mukhoti","Yarin Gal","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2308.13320v3.pdf","comment":"Published in TMLR: https://openreview.net/forum?id=kfhoeZCeW7"},{"id":"http://arxiv.org/abs/2311.02115v2","updated":"2024-07-01T16:30:53Z","published":"2023-11-03T01:37:28Z","title":"Towards objective and systematic evaluation of bias in artificial\n intelligence for medical imaging","summary":" Artificial intelligence (AI) models trained using medical images for clinical\ntasks often exhibit bias in the form of disparities in performance between\nsubgroups. Since not all sources of biases in real-world medical imaging data\nare easily identifiable, it is challenging to comprehensively assess how those\nbiases are encoded in models, and how capable bias mitigation methods are at\nameliorating performance disparities. In this article, we introduce a novel\nanalysis framework for systematically and objectively investigating the impact\nof biases in medical images on AI models. We developed and tested this\nframework for conducting controlled in silico trials to assess bias in medical\nimaging AI using a tool for generating synthetic magnetic resonance images with\nknown disease effects and sources of bias. 
The feasibility is showcased by\nusing three counterfactual bias scenarios to measure the impact of simulated\nbias effects on a convolutional neural network (CNN) classifier and the\nefficacy of three bias mitigation strategies. The analysis revealed that the\nsimulated biases resulted in expected subgroup performance disparities when the\nCNN was trained on the synthetic datasets. Moreover, reweighing was identified\nas the most successful bias mitigation strategy for this setup, and we\ndemonstrated how explainable AI methods can aid in investigating the\nmanifestation of bias in the model using this framework. Developing fair AI\nmodels is a considerable challenge given that many and often unknown sources of\nbiases can be present in medical imaging datasets. In this work, we present a\nnovel methodology to objectively study the impact of biases and mitigation\nstrategies on deep learning pipelines, which can support the development of\nclinical AI that is robust and responsible.\n","authors":["Emma A. M. Stanley","Raissa Souza","Anthony Winder","Vedant Gulve","Kimberly Amador","Matthias Wilms","Nils D. Forkert"],"pdf_url":"https://arxiv.org/pdf/2311.02115v2.pdf","comment":"Published in the Journal of the American Medical Informatics\n Association"},{"id":"http://arxiv.org/abs/2406.14220v2","updated":"2024-07-01T16:30:23Z","published":"2024-06-20T11:40:12Z","title":"Evaluation of Deep Learning Semantic Segmentation for Land Cover Mapping\n on Multispectral, Hyperspectral and High Spatial Aerial Imagery","summary":" With the rise of climate change, land cover mapping has become an urgent\nneed in environmental monitoring. The accuracy of land cover classification\nincreasingly depends on the quality of remote sensing data. Land cover\nclassification using satellite imagery has been explored and has become more\nprevalent in recent years, but the methodologies still suffer from being\nsubjective and time-consuming. Some deep learning techniques have been utilized\nto overcome these limitations. However, most studies implemented just one image\ntype to evaluate algorithms for land cover mapping. Therefore, our study\nconducted deep learning semantic segmentation on multispectral, hyperspectral,\nand high spatial aerial image datasets for land cover mapping. This research\nimplemented semantic segmentation methods such as Unet, Linknet, FPN, and\nPSPnet for categorizing vegetation, water, and others (i.e., soil and\nimpervious surface). The LinkNet model obtained high accuracy in IoU\n(Intersection Over Union) at 0.92 in all datasets, which is comparable with\nother mentioned techniques. In the evaluation across different image types, the\nmultispectral images showed higher performance, with IoU and F1-score of\n0.993 and 0.997, respectively. Our outcome highlights the efficiency and broad\napplicability of LinkNet and multispectral images for land cover classification.\nThis research contributes to establishing an open-source approach to land cover\nsegmentation for long-term future application.\n","authors":["Ilham Adi Panuntun","Ying-Nong Chen","Ilham Jamaluddin","Thi Linh Chi Tran"],"pdf_url":"https://arxiv.org/pdf/2406.14220v2.pdf","comment":"conference, This preprint is based on the following published\n conference article: Panuntun, I. A., Chen, Y.-N., Jamaluddin, I., & Tran, T.\n L. C., 2023. Evaluation of Deep Learning Semantic Segmentation for Land Cover\n Mapping on Multispectral, Hyperspectral and High Spatial Aerial Imagery. 
44th\n Asian Conference on Remote Sensing, ACRS 2023. Code 198676"},{"id":"http://arxiv.org/abs/2306.00238v2","updated":"2024-07-01T15:54:17Z","published":"2023-05-31T23:18:21Z","title":"Bytes Are All You Need: Transformers Operating Directly On File Bytes","summary":" Modern deep learning approaches usually utilize modality-specific processing.\nFor example, the most common deep learning approach to image classification\ninvolves decoding image file bytes into an RGB tensor which is passed into a\nneural network. Instead, we investigate modality-independent representation\nlearning by performing classification directly on file bytes, without the need\nfor decoding files at inference time. This enables models to operate on various\nmodalities without any hand-designed, modality-specific processing. Our model,\nByteFormer, improves ImageNet Top-1 classification accuracy by $5\\%$ (from\n$72.2\\%$ to $77.33\\%$) relative to DeIT models of similar size. Compared to\nPerceiver IO, our model requires absolutely no modality-specific processing at\ninference time, and uses an order of magnitude fewer parameters at equivalent\naccuracy on ImageNet. We demonstrate that the same ByteFormer architecture can\nperform audio classification without modifications or modality-specific\npreprocessing. We achieve $95.42\\%$ classification accuracy on the Speech\nCommands V2 dataset (comparable to the state-of-the-art accuracy of $98.7\\%$).\nAdditionally, we demonstrate that ByteFormer can operate jointly on images and\naudio, handling joint classification without explicit knowledge of the input\nmodality. We release our code at\nhttps://github.com/apple/corenet/tree/main/projects/byteformer.\n","authors":["Maxwell Horton","Sachin Mehta","Ali Farhadi","Mohammad Rastegari"],"pdf_url":"https://arxiv.org/pdf/2306.00238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12797v3","updated":"2024-07-01T15:46:25Z","published":"2024-02-20T08:14:53Z","title":"A Geometric Algorithm for Tubular Shape Reconstruction from Skeletal\n Representation","summary":" We introduce a novel approach for the reconstruction of tubular shapes from\nskeletal representations. Our method processes all skeletal points as a whole,\neliminating the need for splitting the input structure into multiple segments. We\nrepresent the tubular shape as a truncated signed distance function (TSDF) in a\nvoxel hashing manner, in which the signed distance between a voxel center and\nthe object is computed through a simple geometric algorithm. Our method does\nnot involve any surface sampling scheme or solving large matrix equations, and\ntherefore is a faster and more elegant solution for tubular shape\nreconstruction compared to other approaches. Experiments demonstrate the\nefficiency and effectiveness of the proposed method. Code is available at\nhttps://github.com/wlsdzyzl/Dragon.\n","authors":["Guoqing Zhang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2402.12797v3.pdf","comment":"9 pages (without reference), 6 figures"},{"id":"http://arxiv.org/abs/2303.09100v2","updated":"2024-07-01T15:29:45Z","published":"2023-03-16T06:09:15Z","title":"Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models","summary":" For downstream applications of vision-language pre-trained models, there has\nbeen significant interest in constructing effective prompts. 
Existing works on\nprompt engineering, which either require laborious manual designs or optimize\nthe prompt tuning as a point estimation problem, may fail to describe diverse\ncharacteristics of categories and limit their applications. We introduce a\nBayesian probabilistic resolution to prompt tuning, where the label-specific\nstochastic prompts are generated hierarchically by first sampling a latent\nvector from an underlying distribution and then employing a lightweight\ngenerative model. Importantly, we semantically regularize the tuning process by\nminimizing the statistical distance between the visual patches and linguistic\nprompts, which pushes the stochastic label representations to faithfully\ncapture diverse visual concepts, instead of overfitting the training\ncategories. We evaluate the effectiveness of our approach on four tasks:\nfew-shot image recognition, base-to-new generalization, dataset transfer\nlearning, and domain shifts. Extensive results over 15 datasets show promising\ntransferability and generalization performance of our proposed model, both\nquantitatively and qualitatively.\n","authors":["Xinyang Liu","Dongsheng Wang","Bowei Fang","Miaoge Li","Zhibin Duan","Yishi Xu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.09100v2.pdf","comment":"Accepted by UAI 2024"},{"id":"http://arxiv.org/abs/2403.08477v3","updated":"2024-07-01T15:29:16Z","published":"2024-03-13T12:46:03Z","title":"Unleashing the Power of Meta-tuning for Few-shot Generalization Through\n Sparse Interpolated Experts","summary":" Recent successes suggest parameter-efficient fine-tuning of foundation\nmodels as the state-of-the-art method for transfer learning in vision,\nreplacing the rich literature of alternatives such as meta-learning. In trying\nto harness the best of both worlds, meta-tuning introduces a subsequent\noptimization stage of foundation models but has so far only shown limited\nsuccess and crucially tends to underperform on out-of-distribution (OOD) tasks.\nIn this paper, we introduce Sparse MetA-Tuning (SMAT), a method inspired by\nsparse mixture-of-experts approaches and trained to isolate subsets of\npre-trained parameters automatically for meta-tuning on each task. SMAT\nsuccessfully overcomes OOD sensitivity and delivers on the promise of enhancing\nthe transfer abilities of vision foundation models beyond parameter-efficient\nfine-tuning. We establish new state-of-the-art results on a challenging\ncombination of Meta-Dataset augmented with additional OOD tasks in both\nzero-shot and gradient-based adaptation settings. In addition, we provide a\nthorough analysis of the superiority of learned over hand-designed sparsity\npatterns for sparse expert methods and the pivotal importance of the sparsity\nlevel in balancing between in-distribution and out-of-distribution\ngeneralization. Our code is publicly available.\n","authors":["Shengzhuang Chen","Jihoon Tack","Yunqiao Yang","Yee Whye Teh","Jonathan Richard Schwarz","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2403.08477v3.pdf","comment":"The Forty-first International Conference on Machine Learning, 2024"},{"id":"http://arxiv.org/abs/2401.08174v3","updated":"2024-07-01T15:16:02Z","published":"2024-01-16T07:33:22Z","title":"An Efficient Instance Segmentation Framework Based on Oriented Bounding\n Boxes","summary":" Instance segmentation for completely occluded objects and for dense objects in\nrobot vision measurement are two challenging tasks. 
To uniformly deal with\nthem, this paper proposes a unified coarse-to-fine instance segmentation\nframework, CFNet, which uses box prompt-based segmentation foundation models\n(BSMs), e.g., Segment Anything Model. Specifically, CFNet first detects\noriented bounding boxes (OBBs) to distinguish instances and provide coarse\nlocalization information. Then, it predicts OBB prompt-related masks for fine\nsegmentation. CFNet performs instance segmentation with OBBs that only contain\npartial object boundaries on occluders to predict occluded object instances,\nwhich overcomes the difficulty of existing amodal instance segmentation methods\nin directly predicting occluded objects. In addition, since OBBs only serve as\nprompts, CFNet alleviates the over-dependence on bounding box detection\nperformance of current instance segmentation methods using OBBs for dense\nobjects. Moreover, to enable BSMs to handle OBB prompts, we propose a novel OBB\nprompt encoder. To make CFNet more lightweight, we perform knowledge\ndistillation on it and introduce a Gaussian label smoothing method for teacher\nmodel outputs. Experiments demonstrate that CFNet outperforms current instance\nsegmentation methods on both industrial and public datasets. The code is\navailable at https://github.com/zhen6618/OBBInstanceSegmentation.\n","authors":["Zhen Zhou","Junfeng Fan","Yunkai Ma","Sihan Zhao","Fengshui Jing","Min Tan"],"pdf_url":"https://arxiv.org/pdf/2401.08174v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14676v2","updated":"2024-07-01T14:43:15Z","published":"2024-04-23T02:04:53Z","title":"DreamPBR: Text-driven Generation of High-resolution SVBRDF with\n Multi-modal Guidance","summary":" Prior material creation methods had limitations in producing diverse results\nmainly because reconstruction-based methods relied on real-world measurements\nand generation-based methods were trained on relatively small material\ndatasets. To address these challenges, we propose DreamPBR, a novel\ndiffusion-based generative framework designed to create spatially-varying\nappearance properties guided by text and multi-modal controls, providing high\ncontrollability and diversity in material generation. Key to achieving diverse\nand high-quality PBR material generation lies in integrating the capabilities\nof recent large-scale vision-language models trained on billions of text-image\npairs, along with material priors derived from hundreds of PBR material\nsamples. We utilize a novel material Latent Diffusion Model (LDM) to establish\nthe mapping between albedo maps and the corresponding latent space. The latent\nrepresentation is then decoded into full SVBRDF parameter maps using a\nrendering-aware PBR decoder. Our method supports tileable generation through\nconvolution with circular padding. 
Furthermore, we introduce a multi-modal\nguidance module, which includes pixel-aligned guidance, style image guidance,\nand 3D shape guidance, to enhance the control capabilities of the material LDM.\nWe demonstrate the effectiveness of DreamPBR in material creation, showcasing\nits versatility and user-friendliness on a wide range of controllable\ngeneration and editing applications.\n","authors":["Linxuan Xin","Zheng Zhang","Jinfu Wei","Wei Gao","Duan Gao"],"pdf_url":"https://arxiv.org/pdf/2404.14676v2.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2406.04031v2","updated":"2024-07-01T14:25:23Z","published":"2024-06-06T13:00:42Z","title":"Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt","summary":" In the realm of large vision language models (LVLMs), jailbreak attacks serve\nas a red-teaming approach to bypass guardrails and uncover safety implications.\nExisting jailbreaks predominantly focus on the visual modality, perturbing\nsolely visual inputs in the prompt for attacks. However, they fall short when\nconfronted with aligned models that fuse visual and textual features\nsimultaneously for generation. To address this limitation, this paper\nintroduces the Bi-Modal Adversarial Prompt Attack (BAP), which executes\njailbreaks by optimizing textual and visual prompts cohesively. Initially, we\nadversarially embed universally harmful perturbations in an image, guided by a\nfew-shot query-agnostic corpus (e.g., affirmative prefixes and negative\ninhibitions). This process ensures that the image prompts LVLMs to respond\npositively to any harmful queries. Subsequently, leveraging the adversarial\nimage, we optimize textual prompts with specific harmful intent. In particular,\nwe utilize a large language model to analyze jailbreak failures and employ\nchain-of-thought reasoning to refine textual prompts in a\nfeedback-iteration manner. To validate the efficacy of our approach, we\nconducted extensive evaluations on various datasets and LVLMs, demonstrating\nthat our method significantly outperforms other methods by large margins\n(+29.03% in attack success rate on average). Additionally, we showcase the\npotential of our attacks on black-box commercial LVLMs, such as Gemini and\nChatGLM.\n","authors":["Zonghao Ying","Aishan Liu","Tianyuan Zhang","Zhengmin Yu","Siyuan Liang","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.04031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00440v2","updated":"2024-07-01T14:15:37Z","published":"2024-06-01T13:37:51Z","title":"Topo4D: Topology-Preserving Gaussian Splatting for High-Fidelity 4D Head\n Capture","summary":" 4D head capture aims to generate dynamic topological meshes and corresponding\ntexture maps from videos, which is widely utilized in movies and games for its\nability to simulate facial muscle movements and recover dynamic textures in\npore-squeezing. The industry often adopts the method involving multi-view\nstereo and non-rigid alignment. However, this approach is prone to errors and\nheavily reliant on time-consuming manual processing by artists. To simplify\nthis process, we propose Topo4D, a novel framework for automatic geometry and\ntexture generation, which optimizes densely aligned 4D heads and 8K texture\nmaps directly from calibrated multi-view time-series images. 
Specifically, we\nfirst represent the time-series faces as a set of dynamic 3D Gaussians with\nfixed topology in which the Gaussian centers are bound to the mesh vertices.\nAfterward, we perform alternating geometry and texture optimization\nframe-by-frame for high-quality geometry and texture learning while maintaining\ntemporal topology stability. Finally, we can extract dynamic facial meshes in\nregular wiring arrangement and high-fidelity textures with pore-level details\nfrom the learned Gaussians. Extensive experiments show that our method achieves\nresults superior to the current SOTA face reconstruction methods both in the\nquality of meshes and textures. Project page:\nhttps://xuanchenli.github.io/Topo4D/.\n","authors":["Xuanchen Li","Yuhao Cheng","Xingyu Ren","Haozhe Jia","Di Xu","Wenhan Zhu","Yichao Yan"],"pdf_url":"https://arxiv.org/pdf/2406.00440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17851v2","updated":"2024-07-01T14:06:26Z","published":"2024-01-31T14:13:01Z","title":"Instruction-Guided Scene Text Recognition","summary":" Multi-modal models have recently shown appealing performance in visual\nrecognition tasks, as free-form text-guided training evokes the ability to understand\nfine-grained visual content. However, current models are either inefficient or\ncannot be trivially upgraded to scene text recognition (STR) due to the\ncomposition difference between natural and text images. We propose a novel\ninstruction-guided scene text recognition (IGTR) paradigm that formulates STR\nas an instruction learning problem and understands text images by predicting\ncharacter attributes, e.g., character frequency, position, etc. IGTR first\ndevises $\\left \\langle condition,question,answer\\right \\rangle$ instruction\ntriplets, providing rich and diverse descriptions of character attributes. To\neffectively learn these attributes through question-answering, IGTR develops a\nlightweight instruction encoder, a cross-modal feature fusion module, and a\nmulti-task answer head, which guides nuanced text image understanding.\nFurthermore, IGTR realizes different recognition pipelines simply by using\ndifferent instructions, enabling a character-understanding-based text reasoning\nparadigm that considerably differs from current methods. Experiments on English\nand Chinese benchmarks show that IGTR outperforms existing models by\nsignificant margins, while maintaining a small model size and efficient\ninference speed. Moreover, by adjusting the sampling of instructions, IGTR\noffers an elegant way to tackle the recognition of both rarely appearing and\nmorphologically similar characters, which were previous challenges. Code at\n\\href{https://github.com/Topdu/OpenOCR}{this http URL}.\n","authors":["Yongkun Du","Zhineng Chen","Yuchen Su","Caiyan Jia","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.17851v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04821v3","updated":"2024-07-01T13:50:35Z","published":"2022-09-11T09:43:42Z","title":"Local-Aware Global Attention Network for Person Re-Identification Based\n on Body and Hand Images","summary":" Learning representative, robust and discriminative information from images is\nessential for effective person re-identification (Re-Id). In this paper, we\npropose a compound approach for end-to-end discriminative deep feature learning\nfor person Re-Id based on both body and hand images. 
We carefully design the\nLocal-Aware Global Attention Network (LAGA-Net), a multi-branch deep network\narchitecture consisting of one branch for spatial attention, one branch for\nchannel attention, one branch for global feature representations and another\nbranch for local feature representations. The attention branches focus on the\nrelevant features of the image while suppressing the irrelevant backgrounds. In\norder to overcome the weakness of the attention mechanisms, which are equivariant to\npixel shuffling, we integrate relative positional encodings into the spatial\nattention module to capture the spatial positions of pixels. The global branch\nintends to preserve the global context or structural information. For the\nlocal branch, which intends to capture the fine-grained information, we perform\nuniform partitioning to generate stripes on the conv-layer horizontally. We\nretrieve the parts by conducting a soft partition without explicitly\npartitioning the images or requiring external cues such as pose estimation. A\nset of ablation studies shows that each component contributes to the increased\nperformance of the LAGA-Net. Extensive evaluations on four popular body-based\nperson Re-Id benchmarks and two publicly available hand datasets demonstrate\nthat our proposed method consistently outperforms existing state-of-the-art\nmethods.\n","authors":["Nathanael L. Baisa"],"pdf_url":"https://arxiv.org/pdf/2209.04821v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2108.02234"},{"id":"http://arxiv.org/abs/2312.01758v2","updated":"2024-07-01T13:31:40Z","published":"2023-12-04T09:35:36Z","title":"CILF-CIAE: CLIP-driven Image-Language Fusion for Correcting Inverse Age\n Estimation","summary":" The age estimation task aims to predict the age of an individual by analyzing\nfacial features in an image. The development of age estimation can improve the\nefficiency and accuracy of various applications (e.g., age verification and\nsecure access control, etc.). In recent years, contrastive language-image\npre-training (CLIP) has been widely used in various multimodal tasks and has\nmade some progress in the field of age estimation. However, existing CLIP-based\nage estimation methods require high memory usage (quadratic complexity) when\nglobally modeling images, and lack an error feedback mechanism to prompt the\nmodel about the quality of age prediction results. To tackle the above issues,\nwe propose a novel CLIP-driven Image-Language Fusion for Correcting Inverse Age\nEstimation (CILF-CIAE). Specifically, we first introduce the CLIP model to\nextract image features and text semantic information respectively, and map them\ninto a highly semantically aligned high-dimensional feature space. Next, we\ndesign a new Transformer architecture (i.e., FourierFormer) to achieve\nchannel evolution and spatial interaction of images, and to fuse image and text\nsemantic information. Compared with the quadratic complexity of the attention\nmechanism, the proposed FourierFormer is of linear-log complexity. To further\nnarrow the semantic gap between image and text features, we utilize an\nefficient contrastive multimodal learning module that supervises the multimodal\nfusion process of FourierFormer through contrastive loss for image-text\nmatching, thereby improving the interaction effect between different\nmodalities. Finally, we introduce reversible age estimation, which uses\nend-to-end error feedback to reduce the error rate of age predictions. 
Through\nextensive experiments on multiple data sets, CILF-CIAE has achieved better age\nprediction results.\n","authors":["Yuntao Shou","Wei Ai","Tao Meng","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2312.01758v2.pdf","comment":"14 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2403.11672v3","updated":"2024-07-01T12:56:40Z","published":"2024-03-18T11:20:11Z","title":"WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT\n Denoising","summary":" In clinical examinations and diagnoses, low-dose computed tomography (LDCT)\nis crucial for minimizing health risks compared with normal-dose computed\ntomography (NDCT). However, reducing the radiation dose compromises the\nsignal-to-noise ratio, leading to degraded quality of CT images. To address\nthis, we analyze the LDCT denoising task based on experimental results from the\nfrequency perspective, and then introduce a novel self-supervised CT image\ndenoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND\ncomprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware\nMulti-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by\nmainly adding noise to the high-frequency components, which is the main\ndifference between LDCT and NDCT. Second, to better capture high-frequency\ncomponents and detailed information, Frequency-Aware Multi-scale Loss (FAM) is\nproposed by effectively utilizing multi-scale feature space. Extensive\nexperiments on two public LDCT denoising datasets demonstrate that our\nWIA-LD2ND, which only uses NDCT, outperforms several existing state-of-the-art\nweakly-supervised and self-supervised methods. Source code is available at\nhttps://github.com/zhaohaoyu376/WI-LD2ND.\n","authors":["Haoyu Zhao","Yuliang Gu","Zhou Zhao","Bo Du","Yongchao Xu","Rui Yu"],"pdf_url":"https://arxiv.org/pdf/2403.11672v3.pdf","comment":"MICCAI2024"},{"id":"http://arxiv.org/abs/2403.11689v3","updated":"2024-07-01T12:52:58Z","published":"2024-03-18T11:38:47Z","title":"MoreStyle: Relax Low-frequency Constraint of Fourier-based Image\n Reconstruction in Generalizable Medical Image Segmentation","summary":" The task of single-source domain generalization (SDG) in medical image\nsegmentation is crucial due to frequent domain shifts in clinical image\ndatasets. To address the challenge of poor generalization across different\ndomains, we introduce a Plug-and-Play module for data augmentation called\nMoreStyle. MoreStyle diversifies image styles by relaxing low-frequency\nconstraints in Fourier space, guiding the image reconstruction network. With\nthe help of adversarial learning, MoreStyle further expands the style range and\npinpoints the most intricate style combinations within latent features. To\nhandle significant style variations, we introduce an uncertainty-weighted loss.\nThis loss emphasizes hard-to-classify pixels resulting only from style shifts\nwhile mitigating true hard-to-classify pixels in both MoreStyle-generated and\noriginal images. Extensive experiments on two widely used benchmarks\ndemonstrate that the proposed MoreStyle effectively helps to achieve good\ndomain generalization ability, and has the potential to further boost the\nperformance of some state-of-the-art SDG methods. 
Source code is available at\nhttps://github.com/zhaohaoyu376/morestyle.\n","authors":["Haoyu Zhao","Wenhui Dong","Rui Yu","Zhou Zhao","Du Bo","Yongchao Xu"],"pdf_url":"https://arxiv.org/pdf/2403.11689v3.pdf","comment":"MICCAI2024"},{"id":"http://arxiv.org/abs/2402.10208v2","updated":"2024-07-01T12:48:51Z","published":"2024-02-15T18:59:02Z","title":"Recovering the Pre-Fine-Tuning Weights of Generative Models","summary":" The dominant paradigm in generative modeling consists of two steps: i)\npre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained\nmodel with human values via fine-tuning. This practice is considered safe, as\nno current method can recover the unsafe, pre-fine-tuning model weights. In\nthis paper, we demonstrate that this assumption is often false. Concretely, we\npresent Spectral DeTuning, a method that can recover the weights of the\npre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In\ncontrast to previous attacks that attempt to recover pre-fine-tuning\ncapabilities, our method aims to recover the exact pre-fine-tuning weights. Our\napproach exploits this new vulnerability against large-scale models such as a\npersonalized Stable Diffusion and an aligned Mistral.\n","authors":["Eliahu Horwitz","Jonathan Kahana","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2402.10208v2.pdf","comment":"ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/"},{"id":"http://arxiv.org/abs/2405.02179v3","updated":"2024-07-01T12:25:45Z","published":"2024-05-03T15:27:11Z","title":"Training-Free Deepfake Voice Recognition by Leveraging Large-Scale\n Pre-Trained Models","summary":" Generalization is a main issue for current audio deepfake detectors, which\nstruggle to provide reliable results on out-of-distribution data. Given the\nspeed at which more and more accurate synthesis methods are developed, it is\nvery important to design techniques that work well also on data they were not\ntrained for. In this paper we study the potential of large-scale pre-trained\nmodels for audio deepfake detection, with special focus on generalization\nability. To this end, the detection problem is reformulated in a speaker\nverification framework and fake audios are exposed by the mismatch between the\nvoice sample under test and the voice of the claimed identity. With this\nparadigm, no fake speech sample is necessary in training, cutting off any link\nwith the generation method at the root, and ensuring full generalization\nability. Features are extracted by general-purpose large pre-trained models,\nwith no need for training or fine-tuning on specific fake detection or speaker\nverification datasets. At detection time only a limited set of voice fragments\nof the identity under test is required. Experiments on several datasets\nwidespread in the community show that detectors based on pre-trained models\nachieve excellent performance and show strong generalization ability, rivaling\nsupervised methods on in-distribution data and largely overcoming them on\nout-of-distribution data.\n","authors":["Alessandro Pianese","Davide Cozzolino","Giovanni Poggi","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2405.02179v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08037v2","updated":"2024-07-01T12:03:55Z","published":"2024-06-12T09:39:18Z","title":"Adaptively Bypassing Vision Transformer Blocks for Efficient Visual\n Tracking","summary":" Empowered by transformer-based models, visual tracking has advanced\nsignificantly. 
However, the slow speed of current trackers limits their\napplicability on devices with constrained computational resources. To address\nthis challenge, we introduce ABTrack, an adaptive computation framework that\nadaptively bypasses transformer blocks for efficient visual tracking. The\nrationale behind ABTrack is rooted in the observation that semantic features or\nrelations do not uniformly impact the tracking task across all abstraction\nlevels. Instead, this impact varies based on the characteristics of the target\nand the scene it occupies. Consequently, disregarding insignificant semantic\nfeatures or relations at certain abstraction levels may not significantly\naffect the tracking accuracy. We propose a Bypass Decision Module (BDM) to\ndetermine if a transformer block should be bypassed, which adaptively\nsimplifies the architecture of ViTs and thus speeds up the inference process.\nTo counteract the time cost incurred by the BDMs and further enhance the\nefficiency of ViTs, we introduce a novel ViT pruning method to reduce the\ndimension of the latent representation of tokens in each transformer block.\nExtensive experiments on multiple tracking benchmarks validate the\neffectiveness and generality of the proposed method and show that it achieves\nstate-of-the-art performance. Code is released at:\nhttps://github.com/xyyang317/ABTrack.\n","authors":["Xiangyang Yang","Dan Zeng","Xucheng Wang","You Wu","Hengzhou Ye","Qijun Zhao","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2406.08037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13113v3","updated":"2024-07-01T11:57:06Z","published":"2023-03-23T09:00:38Z","title":"AdaCL:Adaptive Continual Learning","summary":" Class-Incremental Learning aims to update a deep classifier to learn new\ncategories while maintaining or improving its accuracy on previously observed\nclasses. Common methods to prevent forgetting previously learned classes\ninclude regularizing the neural network updates and storing exemplars in\nmemory, which come with hyperparameters such as the learning rate,\nregularization strength, or the number of exemplars. However, these\nhyperparameters are usually only tuned at the start and then kept fixed\nthroughout the learning sessions, ignoring the fact that newly encountered\ntasks may have varying levels of novelty or difficulty. This study investigates\nthe necessity of hyperparameter `adaptivity' in Class-Incremental Learning: the\nability to dynamically adjust hyperparameters such as the learning rate,\nregularization strength, and memory size according to the properties of the new\ntask at hand. We propose AdaCL, a Bayesian Optimization-based approach to\nautomatically and efficiently determine the optimal values for those parameters\nwith each learning task. We show that adapting hyperparameters on each new\ntask leads to improvements in accuracy, forgetting and memory. Code is available\nat https://github.com/ElifCerenGokYildirim/AdaCL.\n","authors":["Elif Ceren Gok Yildirim","Murat Onur Yildirim","Mert Kilickaya","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2303.13113v3.pdf","comment":"Published in 1st ContinualAI Unconference"},{"id":"http://arxiv.org/abs/2406.19398v2","updated":"2024-07-01T11:38:19Z","published":"2024-05-04T14:28:09Z","title":"Woven Fabric Capture with a Reflection-Transmission Photo Pair","summary":" Digitizing woven fabrics would be valuable for many applications, from\ndigital humans to interior design. 
Previous work introduces a lightweight woven\nfabric acquisition approach by capturing a single reflection image and\nestimating the fabric parameters with a differentiable geometric and shading\nmodel. The renderings of the estimated fabric parameters can closely match the\nphoto; however, the captured reflection image is insufficient to fully\ncharacterize the fabric sample reflectance. For instance, fabrics with\ndifferent thicknesses might have similar reflection images but lead to\nsignificantly different transmission. We propose to recover the woven fabric\nparameters from two captured images: reflection and transmission. At the core\nof our method is a differentiable bidirectional scattering distribution\nfunction (BSDF) model, handling reflection and transmission, including single\nand multiple scattering. We propose a two-layer model, where the single\nscattering uses an SGGX phase function as in previous work, and multiple\nscattering uses a new azimuthally-invariant microflake definition, which we\nterm ASGGX. This new fabric BSDF model closely matches real woven fabrics in\nboth reflection and transmission. We use a simple setup for capturing\nreflection and transmission photos with a cell phone camera and two point\nlights, and estimate the fabric parameters via a lightweight network, together\nwith a differentiable optimization. We also model the out-of-focus effects\nexplicitly with a simple solution to match the thin-lens camera better. As a\nresult, the renderings of the estimated parameters can agree with the input\nimages on both reflection and transmission for the first time. The code for\nthis paper is at https://github.com/lxtyin/FabricBTDF-Recovery.\n","authors":["Yingjie Tang","Zixuan Li","Miloš Hašan","Jian Yang","Beibei Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19398v2.pdf","comment":"10 pages, 16 figures (in the main paper). Accepted by SIGGRAPH 2024\n conference"},{"id":"http://arxiv.org/abs/2405.05553v3","updated":"2024-07-01T10:27:42Z","published":"2024-05-09T05:23:34Z","title":"Towards Robust Physical-world Backdoor Attacks on Lane Detection","summary":" Deep learning-based lane detection (LD) plays a critical role in autonomous\ndriving systems, such as adaptive cruise control. However, it is vulnerable to\nbackdoor attacks. Existing backdoor attack methods on LD exhibit limited\neffectiveness in dynamic real-world scenarios, primarily because they fail to\nconsider dynamic scene factors, including changes in driving perspectives\n(e.g., viewpoint transformations) and environmental conditions (e.g., weather\nor lighting changes). To tackle this issue, this paper introduces BadLANE, a\ndynamic scene adaptation backdoor attack for LD designed to withstand changes\nin real-world dynamic scene factors. To address the challenges posed by\nchanging driving perspectives, we propose an amorphous trigger pattern composed\nof shapeless pixels. This trigger design allows the backdoor to be activated by\nvarious forms or shapes of mud spots or pollution on the road or lens, enabling\nadaptation to changes in vehicle observation viewpoints during driving. To\nmitigate the effects of environmental changes, we design a meta-learning\nframework to train meta-generators tailored to different environmental\nconditions. These generators produce meta-triggers that incorporate diverse\nenvironmental information, such as weather or lighting conditions, as the\ninitialization of the trigger patterns for backdoor implantation, thus enabling\nadaptation to dynamic environments. 
Extensive experiments on various commonly\nused LD models in both digital and physical domains validate the effectiveness\nof our attacks, outperforming other baselines significantly (+25.15% on average\nin Attack Success Rate). Our codes will be available upon paper publication.\n","authors":["Xinwei Zhang","Aishan Liu","Tianyuan Zhang","Siyuan Liang","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.05553v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02331v2","updated":"2024-07-01T10:16:38Z","published":"2023-03-04T05:34:25Z","title":"Training-Free Acceleration of ViTs with Delayed Spatial Merging","summary":" Token merging has emerged as a new paradigm that can accelerate the inference\nof Vision Transformers (ViTs) without any retraining or fine-tuning. To push\nthe frontier of training-free acceleration in ViTs, we improve token merging by\nadding the perspectives of 1) activation outliers and 2) hierarchical\nrepresentations. Through a careful analysis of the attention behavior in ViTs,\nwe characterize a delayed onset of the convergent attention phenomenon, which\nmakes token merging undesirable in the bottom blocks of ViTs. Moreover, we\naugment token merging with a hierarchical processing scheme to capture\nmulti-scale redundancy between visual tokens. Combining these two insights, we\nbuild a unified inference framework called DSM: Delayed Spatial Merging. We\nextensively evaluate DSM on various ViT model scales (Tiny to Huge) and tasks\n(ImageNet-1k and transfer learning), achieving up to 1.8$\\times$ FLOP reduction\nand 1.6$\\times$ throughput speedup at a negligible loss while being two orders\nof magnitude faster than existing methods.\n","authors":["Jung Hwan Heo","Seyedarmin Azizi","Arash Fayyazi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2303.02331v2.pdf","comment":"ICML 2024 ES-FoMo Workshop"},{"id":"http://arxiv.org/abs/2406.11650v2","updated":"2024-07-01T09:57:32Z","published":"2024-06-17T15:31:54Z","title":"Multimodal Learning With Intraoperative CBCT & Variably Aligned\n Preoperative CT Data To Improve Segmentation","summary":" Cone-beam computed tomography (CBCT) is an important tool facilitating\ncomputer aided interventions, despite often suffering from artifacts that pose\nchallenges for accurate interpretation. While the degraded image quality can\naffect downstream segmentation, the availability of high quality, preoperative\nscans represents potential for improvements. Here we consider a setting where\npreoperative CT and intraoperative CBCT scans are available, however, the\nalignment (registration) between the scans is imperfect. We propose a\nmultimodal learning method that fuses roughly aligned CBCT and CT scans and\ninvestigate the effect of CBCT quality and misalignment on the final\nsegmentation performance. For that purpose, we make use of a synthetically\ngenerated data set containing real CT and synthetic CBCT volumes. As an\napplication scenario, we focus on liver and liver tumor segmentation. We show\nthat the fusion of preoperative CT and simulated, intraoperative CBCT mostly\nimproves segmentation performance (compared to using intraoperative CBCT only)\nand that even clearly misaligned preoperative data has the potential to improve\nsegmentation performance.\n","authors":["Maximilian E. 
Tschuchnig","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2406.11650v2.pdf","comment":"Submitted to SASHIMI2024 (MICCAI workshop)"},{"id":"http://arxiv.org/abs/2406.16189v2","updated":"2024-07-01T09:34:22Z","published":"2024-06-23T18:47:51Z","title":"Fuzzy Attention-based Border Rendering Network for Lung Organ\n Segmentation","summary":" Automatic lung organ segmentation on CT images is crucial for lung disease\ndiagnosis. However, the unlimited voxel values and class imbalance of lung\norgans can lead to false-negative/positive and leakage issues in advanced\nmethods. Additionally, some slender lung organs are easily lost during the\nrecycled down/up-sample procedure, e.g., bronchioles & arterioles, causing\nsevere discontinuity issue. Inspired by these, this paper introduces an\neffective lung organ segmentation method called Fuzzy Attention-based Border\nRendering (FABR) network. Since fuzzy logic can handle the uncertainty in\nfeature extraction, hence the fusion of deep networks and fuzzy sets should be\na viable solution for better performance. Meanwhile, unlike prior top-tier\nmethods that operate on all regular dense points, our FABR depicts lung organ\nregions as cube-trees, focusing only on recycle-sampled border vulnerable\npoints, rendering the severely discontinuous, false-negative/positive organ\nregions with a novel Global-Local Cube-tree Fusion (GLCF) module. All\nexperimental results, on four challenging datasets of airway & artery,\ndemonstrate that our method can achieve the favorable performance\nsignificantly.\n","authors":["Sheng Zhang","Yang Nan","Yingying Fang","Shiyi Wang","Xiaodan Xing","Zhifan Gao","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2406.16189v2.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2310.04671v4","updated":"2024-07-01T09:29:39Z","published":"2023-10-07T03:16:30Z","title":"Exploring the Potential of Multi-Modal AI for Driving Hazard Prediction","summary":" This paper addresses the problem of predicting hazards that drivers may\nencounter while driving a car. We formulate it as a task of anticipating\nimpending accidents using a single input image captured by car dashcams. Unlike\nexisting approaches to driving hazard prediction that rely on computational\nsimulations or anomaly detection from videos, this study focuses on high-level\ninference from static images. The problem needs predicting and reasoning about\nfuture events based on uncertain observations, which falls under visual\nabductive reasoning. To enable research in this understudied area, a new\ndataset named the DHPR (Driving Hazard Prediction and Reasoning) dataset is\ncreated. The dataset consists of 15K dashcam images of street scenes, and each\nimage is associated with a tuple containing car speed, a hypothesized hazard\ndescription, and visual entities present in the scene. These are annotated by\nhuman annotators, who identify risky scenes and provide descriptions of\npotential accidents that could occur a few seconds later. We present several\nbaseline methods and evaluate their performance on our dataset, identifying\nremaining issues and discussing future directions. 
This study contributes to\nthe field by introducing a novel problem formulation and dataset, enabling\nresearchers to explore the potential of multi-modal AI for driving hazard\nprediction.\n","authors":["Korawat Charoenpitaks","Van-Quang Nguyen","Masanori Suganuma","Masahiro Takahashi","Ryoma Niihara","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2310.04671v4.pdf","comment":"Main Paper: 11 pages, Supplementary Materials: 25 pages"},{"id":"http://arxiv.org/abs/2406.02884v2","updated":"2024-07-01T09:05:58Z","published":"2024-06-05T03:05:52Z","title":"PosterLLaVa: Constructing a Unified Multi-modal Layout Generator with\n LLM","summary":" Layout generation is the keystone in achieving automated graphic design,\nwhich requires arranging the position and size of various multi-modal design\nelements in a visually pleasing and constraint-following manner. Previous\napproaches are either inefficient for large-scale applications or lack\nflexibility for varying design requirements. Our research introduces a unified\nframework for automated graphic layout generation, leveraging the multi-modal\nlarge language model (MLLM) to accommodate diverse design tasks. In contrast to\nthese approaches, our data-driven method employs structured text (JSON format) and visual\ninstruction tuning to generate layouts under specific visual and textual\nconstraints, including user-defined natural language specifications. We\nconducted extensive experiments and achieved state-of-the-art (SOTA)\nperformance on public multi-modal layout generation benchmarks, demonstrating\nthe effectiveness of our method. Moreover, recognizing existing datasets'\nlimitations in capturing the complexity of real-world graphic designs, we\npropose two new datasets for much more challenging tasks (user-constrained\ngeneration and complicated poster), further validating our model's utility in\nreal-life settings. Marked by its superior accessibility and adaptability,\nthis approach further automates large-scale graphic design tasks. The code and\ndatasets will be publicly available on\nhttps://github.com/posterllava/PosterLLaVA.\n","authors":["Tao Yang","Yingmin Luo","Zhongang Qi","Yang Wu","Ying Shan","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2406.02884v2.pdf","comment":"10 pages; typos corrected, appendix added"},{"id":"http://arxiv.org/abs/2403.11370v3","updated":"2024-07-01T09:04:31Z","published":"2024-03-17T23:23:40Z","title":"DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic\n Environments using Graph Neural Networks","summary":" The assumption of a static environment is common in many geometric computer\nvision tasks like SLAM but limits their applicability in highly dynamic scenes.\nSince these tasks rely on identifying point correspondences between input\nimages within the static part of the environment, we propose a graph neural\nnetwork-based sparse feature matching network designed to perform robust\nmatching under challenging conditions while excluding keypoints on moving\nobjects. We employ a similar scheme of attentional aggregation over graph edges\nto enhance keypoint representations as state-of-the-art feature-matching\nnetworks but augment the graph with epipolar and temporal information and\nvastly reduce the number of graph edges. Furthermore, we introduce a\nself-supervised training scheme to extract pseudo labels for image pairs in\ndynamic environments from exclusively unprocessed visual-inertial data. 
A\nseries of experiments show the superior performance of our network as it\nexcludes keypoints on moving objects compared to state-of-the-art feature\nmatching networks while still achieving similar results regarding conventional\nmatching metrics. When integrated into a SLAM system, our network significantly\nimproves performance, especially in highly dynamic scenes.\n","authors":["Theresa Huber","Simon Schaefer","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2403.11370v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14250v3","updated":"2024-07-01T08:37:14Z","published":"2024-06-20T12:22:05Z","title":"E-ANT: A Large-Scale Dataset for Efficient Automatic GUI NavigaTion","summary":" Online GUI navigation on mobile devices has drawn a lot of attention in recent\nyears since it contributes to many real-world applications. With the rapid\ndevelopment of large language models (LLM), multimodal large language models\n(MLLM) have tremendous potential for this task. However, existing MLLMs need\nhigh-quality data to improve their ability to make correct navigation\ndecisions according to human user inputs. In this paper, we developed a\nnovel and highly valuable dataset, named \\textbf{E-ANT}, as the first Chinese\nGUI navigation dataset that contains real human behaviour and high quality\nscreenshots with annotations, containing nearly 40,000 real human traces over\n5000+ different tinyAPPs. Furthermore, we evaluate various powerful MLLMs on\nE-ANT and show their experimental results with sufficient ablations. We believe\nthat our proposed dataset will be beneficial for both the evaluation and\ndevelopment of GUI navigation and LLM/MLLM decision-making capabilities.\n","authors":["Ke Wang","Tianyu Xia","Zhangxuan Gu","Yi Zhao","Shuheng Shen","Changhua Meng","Weiqiang Wang","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2406.14250v3.pdf","comment":"9 pages, 5 figures, Under review"},{"id":"http://arxiv.org/abs/2406.18176v2","updated":"2024-07-01T07:59:13Z","published":"2024-06-26T08:50:51Z","title":"VIPriors 4: Visual Inductive Priors for Data-Efficient Deep Learning\n Challenges","summary":" The fourth edition of the \"VIPriors: Visual Inductive Priors for\nData-Efficient Deep Learning\" workshop features two data-impaired challenges.\nThese challenges address the problem of training deep learning models for\ncomputer vision tasks with limited data. Participants are limited to training\nmodels from scratch using a low number of training samples and are not allowed\nto use any form of transfer learning. We aim to stimulate the development of\nnovel approaches that incorporate inductive biases to improve the data\nefficiency of deep learning models. Significant advancements are made compared\nto the provided baselines, where winning solutions surpass the baselines by a\nconsiderable margin in both tasks. As in previous editions, these achievements\nare primarily attributed to heavy use of data augmentation policies and large\nmodel ensembles, though novel prior-based methods seem to contribute more to\nsuccessful solutions compared to last year. 
This report highlights the key\naspects of the challenges and their outcomes.\n","authors":["Robert-Jan Bruintjes","Attila Lengyel","Marcos Baptista Rios","Osman Semih Kayhan","Davide Zambrano","Nergis Tomen","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2406.18176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12975v2","updated":"2024-07-01T07:40:03Z","published":"2024-02-05T12:11:15Z","title":"Training morphological neural networks with gradient descent: some\n theoretical insights","summary":" Morphological neural networks, or layers, can be a powerful tool to boost the\nprogress in mathematical morphology, either on theoretical aspects such as the\nrepresentation of complete lattice operators, or in the development of image\nprocessing pipelines. However, these architectures turn out to be difficult to\ntrain when they count more than a few morphological layers, at least within\npopular machine learning frameworks which use gradient descent based\noptimization algorithms. In this paper we investigate the potential and\nlimitations of differentiation based approaches and back-propagation applied to\nmorphological networks, in light of the non-smooth optimization concept of\nBouligand derivative. We provide insights and first theoretical guidelines, in\nparticular regarding initialization and learning rates.\n","authors":["Samy Blusseau"],"pdf_url":"https://arxiv.org/pdf/2403.12975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19407v2","updated":"2024-07-01T07:04:51Z","published":"2024-06-12T06:41:23Z","title":"YOLOv10 to Its Genesis: A Decadal and Comprehensive Review of The You\n Only Look Once Series","summary":" This review systematically examines the progression of the You Only Look Once\n(YOLO) object detection algorithms from YOLOv1 to the recently unveiled\nYOLOv10. Employing a reverse chronological analysis, this study examines the\nadvancements introduced by YOLO algorithms, beginning with YOLOv10 and\nprogressing through YOLOv9, YOLOv8, and subsequent versions to explore each\nversion's contributions to enhancing speed, accuracy, and computational\nefficiency in real-time object detection. The study highlights the\ntransformative impact of YOLO across five critical application areas:\nautomotive safety, healthcare, industrial manufacturing, surveillance, and\nagriculture. By detailing the incremental technological advancements in\nsubsequent YOLO versions, this review chronicles the evolution of YOLO, and\ndiscusses the challenges and limitations in each earlier versions. The\nevolution signifies a path towards integrating YOLO with multimodal,\ncontext-aware, and General Artificial Intelligence (AGI) systems for the next\nYOLO decade, promising significant implications for future developments in\nAI-driven applications.\n","authors":["Ranjan Sapkota","Rizwan Qureshi","Marco Flores Calero","Chetan Badjugar","Upesh Nepal","Alwin Poulose","Peter Zeno","Uday Bhanu Prakash Vaddevolu","Hong Yan","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2406.19407v2.pdf","comment":"11 Figures, 7 Tables"},{"id":"http://arxiv.org/abs/2406.16085v2","updated":"2024-07-01T06:33:12Z","published":"2024-06-23T11:57:08Z","title":"A Simple Framework for Open-Vocabulary Zero-Shot Segmentation","summary":" Zero-shot classification capabilities naturally arise in models trained\nwithin a vision-language contrastive framework. Despite their classification\nprowess, these models struggle in dense tasks like zero-shot open-vocabulary\nsegmentation. 
This deficiency is often attributed to the absence of\nlocalization cues in captions and the intertwined nature of the learning\nprocess, which encompasses both image representation learning and\ncross-modality alignment. To tackle these issues, we propose SimZSS, a Simple\nframework for open-vocabulary Zero-Shot Segmentation. The method is founded on\ntwo key principles: i) leveraging frozen vision-only models that exhibit\nspatial awareness while exclusively aligning the text encoder and ii)\nexploiting the discrete nature of text and linguistic knowledge to pinpoint\nlocal concepts within captions. By capitalizing on the quality of the visual\nrepresentations, our method requires only image-caption pairs datasets and\nadapts to both small curated and large-scale noisy datasets. When trained on\nCOCO Captions across 8 GPUs, SimZSS achieves state-of-the-art results on 7 out\nof 8 benchmark datasets in less than 15 minutes.\n","authors":["Thomas Stegmüller","Tim Lebailly","Nikola Dukic","Behzad Bozorgtabar","Tinne Tuytelaars","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2406.16085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13382v2","updated":"2024-07-01T06:14:04Z","published":"2024-05-22T06:31:42Z","title":"VTG-LLM: Integrating Timestamp Knowledge into Video LLMs for Enhanced\n Video Temporal Grounding","summary":" Video Temporal Grounding (VTG) focuses on accurately identifying event\ntimestamps within a particular video based on a linguistic query, playing a\nvital role in downstream tasks such as video browsing and editing. While Video\nLarge Language Models (video LLMs) have made significant progress in\nunderstanding video content, they often face challenges in accurately\npinpointing timestamps within videos, which limits their performance on VTG\ntasks. Therefore, to improve video LLMs' ability to effectively locate\ntimestamps, we argue that two critical aspects need to be enhanced. First, it\nis essential to have high-quality instructional tuning datasets that encompass\nmainstream VTG tasks. Second, directly incorporating timestamp knowledge into\nvideo LLMs is crucial, as it enables models to efficiently comprehend timestamp\ninformation. To address these needs, we first introduce VTG-IT-120K, a\nhigh-quality and comprehensive instruction tuning dataset that covers VTG tasks\nsuch as moment retrieval, dense video captioning, video summarization, and\nvideo highlight detection. Furthermore, we propose a specially designed video\nLLM model for VTG tasks, VTG-LLM, which (1) effectively integrates timestamp\nknowledge into visual tokens; (2) incorporates absolute-time tokens that\nspecifically handle timestamp knowledge, thereby avoiding concept shifts; and\n(3) introduces a lightweight, high-performance slot-based token compression\nmethod to facilitate the sampling of more video frames. Comprehensive\nexperiments showcase the superior performance of VTG-LLM in comparison to other\nvideo LLM methods across various VTG tasks. 
Our code and datasets are available\nat \\url{https://github.com/gyxxyg/VTG-LLM}.\n","authors":["Yongxin Guo","Jingyu Liu","Mingda Li","Xiaoying Tang","Xi Chen","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.13382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10356v4","updated":"2024-07-01T06:01:29Z","published":"2023-09-19T06:32:19Z","title":"RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene\n Parsing","summary":" The recent advancements in deep convolutional neural networks have shown\nsignificant promise in the domain of road scene parsing. Nevertheless, the\nexisting works focus primarily on freespace detection, with little attention\ngiven to hazardous road defects that could compromise both driving safety and\ncomfort. In this paper, we introduce RoadFormer, a novel Transformer-based\ndata-fusion network developed for road scene parsing. RoadFormer utilizes a\nduplex encoder architecture to extract heterogeneous features from both RGB\nimages and surface normal information. The encoded features are subsequently\nfed into a novel heterogeneous feature synergy block for effective feature\nfusion and recalibration. The pixel decoder then learns multi-scale long-range\ndependencies from the fused and recalibrated heterogeneous features, which are\nsubsequently processed by a Transformer decoder to produce the final semantic\nprediction. Additionally, we release SYN-UDTIRI, the first large-scale road\nscene parsing dataset that contains over 10,407 RGB images, dense depth images,\nand the corresponding pixel-level annotations for both freespace and road\ndefects of different shapes and sizes. Extensive experimental evaluations\nconducted on our SYN-UDTIRI dataset, as well as on three public datasets,\nincluding KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer\noutperforms all other state-of-the-art networks for road scene parsing.\nSpecifically, RoadFormer ranks first on the KITTI road benchmark. Our source\ncode, created dataset, and demo video are publicly available at\nmias.group/RoadFormer.\n","authors":["Jiahang Li","Yikang Zhang","Peng Yun","Guangliang Zhou","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2309.10356v4.pdf","comment":"10 pages 7 figures. Accepted by Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2303.11726v4","updated":"2024-07-01T05:20:37Z","published":"2023-03-21T10:30:43Z","title":"3D Human Mesh Estimation from Virtual Markers","summary":" Inspired by the success of volumetric 3D pose estimation, some recent human\nmesh estimators propose to estimate 3D skeletons as intermediate\nrepresentations, from which, the dense 3D meshes are regressed by exploiting\nthe mesh topology. However, body shape information is lost in extracting\nskeletons, leading to mediocre performance. The advanced motion capture systems\nsolve the problem by placing dense physical markers on the body surface, which\nallows to extract realistic meshes from their non-rigid motions. However, they\ncannot be applied to wild images without markers. In this work, we present an\nintermediate representation, named virtual markers, which learns 64 landmark\nkeypoints on the body surface based on the large-scale mocap data in a\ngenerative style, mimicking the effects of physical markers. The virtual\nmarkers can be accurately detected from wild images and can reconstruct the\nintact meshes with realistic shapes by simple interpolation. Our approach\noutperforms the state-of-the-art methods on three datasets. 
In particular, it\nsurpasses the existing methods by a notable margin on the SURREAL dataset,\nwhich has diverse body shapes. Code is available at\nhttps://github.com/ShirleyMaxx/VirtualMarker\n","authors":["Xiaoxuan Ma","Jiajun Su","Chunyu Wang","Wentao Zhu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2303.11726v4.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2404.19379v3","updated":"2024-07-01T04:51:21Z","published":"2024-04-30T09:11:04Z","title":"SemanticFormer: Holistic and Semantic Traffic Scene Representation for\n Trajectory Prediction using Knowledge Graphs","summary":" Trajectory prediction in autonomous driving relies on accurate representation\nof all relevant contexts of the driving scene, including traffic participants,\nroad topology, traffic signs, as well as their semantic relations to each\nother. Despite increased attention to this issue, most approaches in trajectory\nprediction do not consider all of these factors sufficiently. We present\nSemanticFormer, an approach for predicting multimodal trajectories by reasoning\nover a semantic traffic scene graph using a hybrid approach. It utilizes\nhigh-level information in the form of meta-paths, i.e. trajectories on which an\nagent is allowed to drive from a knowledge graph which is then processed by a\nnovel pipeline based on multiple attention mechanisms to predict accurate\ntrajectories. SemanticFormer comprises a hierarchical heterogeneous graph\nencoder to capture spatio-temporal and relational information across agents as\nwell as between agents and road elements. Further, it includes a predictor to\nfuse different encodings and decode trajectories with probabilities. Finally, a\nrefinement module assesses permitted meta-paths of trajectories and speed\nprofiles to obtain final predicted trajectories. Evaluation of the nuScenes\nbenchmark demonstrates improved performance compared to several SOTA methods.\nIn addition, we demonstrate that our knowledge graph can be easily added to two\ngraph-based existing SOTA methods, namely VectorNet and Laformer, replacing\ntheir original homogeneous graphs. The evaluation results suggest that by\nadding our knowledge graph the performance of the original methods is enhanced\nby 5% and 4%, respectively.\n","authors":["Zhigang Sun","Zixu Wang","Lavdim Halilaj","Juergen Luettin"],"pdf_url":"https://arxiv.org/pdf/2404.19379v3.pdf","comment":"8 pages, 7 figures, has been accepted for publication in the IEEE\n Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2406.03017v3","updated":"2024-07-01T04:36:08Z","published":"2024-06-05T07:32:29Z","title":"DifAttack++: Query-Efficient Black-Box Adversarial Attack via\n Hierarchical Disentangled Feature Space in Cross-Domain","summary":" This work investigates efficient score-based black-box adversarial attacks\nwith a high Attack Success Rate (\\textbf{ASR}) and good generalizability. We\ndesign a novel attack method based on a hierarchical DIsentangled Feature\nspace, called \\textbf{DifAttack++}, which differs significantly from the\nexisting ones operating over the entire feature space. Specifically,\nDifAttack++ firstly disentangles an image's latent feature into an Adversarial\nFeature (\\textbf{AF}) and a Visual Feature (\\textbf{VF}) via an autoencoder\nequipped with our specially designed Hierarchical Decouple-Fusion\n(\\textbf{HDF}) module, where the AF dominates the adversarial capability of an\nimage, while the VF largely determines its visual appearance. 
We train such two\nautoencoders for the clean and adversarial image domains (i.e., cross-domain)\nrespectively to achieve image reconstructions and feature disentanglement, by\nusing pairs of clean images and their Adversarial Examples (\\textbf{AE}s)\ngenerated from available surrogate models via white-box attack methods.\nEventually, in the black-box attack stage, DifAttack++ iteratively optimizes\nthe AF according to the query feedback from the victim model until a successful\nAE is generated, while keeping the VF unaltered. Extensive experimental results\ndemonstrate that our DifAttack++ leads to superior ASR and query efficiency\nthan state-of-the-art methods, meanwhile exhibiting much better visual quality\nof AEs. The code is available at https://github.com/csjunjun/DifAttack.git.\n","authors":["Jun Liu","Jiantao Zhou","Jiandian Zeng","Jinyu Tian","Zheng Li"],"pdf_url":"https://arxiv.org/pdf/2406.03017v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2309.14585 An\n extension of the AAAI24 paper \"DifAttack: Query-Efficient Black-Box Attack\n via Disentangled Feature Space.\""},{"id":"http://arxiv.org/abs/2402.04324v2","updated":"2024-07-01T03:57:55Z","published":"2024-02-06T19:08:18Z","title":"ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation","summary":" Image-to-video (I2V) generation aims to use the initial frame (alongside a\ntext prompt) to create a video sequence. A grand challenge in I2V generation is\nto maintain visual consistency throughout the video: existing methods often\nstruggle to preserve the integrity of the subject, background, and style from\nthe first frame, as well as ensure a fluid and logical progression within the\nvideo narrative. To mitigate these issues, we propose ConsistI2V, a\ndiffusion-based method to enhance visual consistency for I2V generation.\nSpecifically, we introduce (1) spatiotemporal attention over the first frame to\nmaintain spatial and motion consistency, (2) noise initialization from the\nlow-frequency band of the first frame to enhance layout consistency. These two\napproaches enable ConsistI2V to generate highly consistent videos. We also\nextend the proposed approaches to show their potential to improve consistency\nin auto-regressive long video generation and camera motion control. To verify\nthe effectiveness of our method, we propose I2V-Bench, a comprehensive\nevaluation benchmark for I2V generation. Our automatic and human evaluation\nresults demonstrate the superiority of ConsistI2V over existing methods.\n","authors":["Weiming Ren","Huan Yang","Ge Zhang","Cong Wei","Xinrun Du","Wenhao Huang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04324v2.pdf","comment":"Project Page: https://tiger-ai-lab.github.io/ConsistI2V/"},{"id":"http://arxiv.org/abs/2308.13201v2","updated":"2024-07-01T03:17:51Z","published":"2023-08-25T06:45:02Z","title":"Deep Active Audio Feature Learning in Resource-Constrained Environments","summary":" The scarcity of labelled data makes training Deep Neural Network (DNN) models\nin bioacoustic applications challenging. In typical bioacoustics applications,\nmanually labelling the required amount of data can be prohibitively expensive.\nTo effectively identify both new and current classes, DNN models must continue\nto learn new features from a modest amount of fresh data. Active Learning (AL)\nis an approach that can help with this learning while requiring little\nlabelling effort. 
Nevertheless, the use of fixed feature extraction approaches\nlimits feature quality, resulting in underutilization of the benefits of AL. We\ndescribe an AL framework that addresses this issue by incorporating feature\nextraction into the AL loop and refining the feature extractor after each round\nof manual annotation. In addition, we use raw audio processing rather than\nspectrograms, which is a novel approach. Experiments reveal that the proposed\nAL framework requires 14.3%, 66.7%, and 47.4% less labelling effort on\nbenchmark audio datasets ESC-50, UrbanSound8k, and InsectWingBeat,\nrespectively, for a large DNN model and similar savings on a\nmicrocontroller-based counterpart. Furthermore, we showcase the practical\nrelevance of our study by incorporating data from conservation biology\nprojects. All codes are publicly available on GitHub.\n","authors":["Md Mohaimenuzzaman","Christoph Bergmeir","Bernd Meyer"],"pdf_url":"https://arxiv.org/pdf/2308.13201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09410v2","updated":"2024-07-01T03:10:43Z","published":"2024-06-13T17:59:51Z","title":"Scene Graph Generation in Large-Size VHR Satellite Imagery: A\n Large-Scale Dataset and A Context-Aware Approach","summary":" Scene graph generation (SGG) in satellite imagery (SAI) benefits promoting\nintelligent understanding of geospatial scenarios from perception to cognition.\nIn SAI, objects exhibit great variations in scales and aspect ratios, and there\nexist rich relationships between objects (even between spatially disjoint\nobjects), which makes it necessary to holistically conduct SGG in large-size\nvery-high-resolution (VHR) SAI. However, the lack of SGG datasets with\nlarge-size VHR SAI has constrained the advancement of SGG in SAI. Due to the\ncomplexity of large-size VHR SAI, mining triplets in large-size VHR SAI heavily relies on long-range contextual\nreasoning. Consequently, SGG models designed for small-size natural imagery are\nnot directly applicable to large-size VHR SAI. To address the scarcity of\ndatasets, this paper constructs a large-scale dataset for SGG in large-size VHR\nSAI with image sizes ranging from 512 x 768 to 27,860 x 31,096 pixels, named\nRSG, encompassing over 210,000 objects and more than 400,000 triplets. To\nrealize SGG in large-size VHR SAI, we propose a context-aware cascade cognition\n(CAC) framework to understand SAI at three levels: object detection (OBD), pair\npruning and relationship prediction. As a fundamental prerequisite for SGG in\nlarge-size SAI, a holistic multi-class object detection network (HOD-Net) that\ncan flexibly integrate multi-scale contexts is proposed. With the consideration\nthat there exist a huge amount of object pairs in large-size SAI but only a\nminority of object pairs contain meaningful relationships, we design a pair\nproposal generation (PPG) network via adversarial reconstruction to select\nhigh-value pairs. 
Furthermore, a relationship prediction network with\ncontext-aware messaging (RPCM) is proposed to predict the relationship types of\nthese pairs.\n","authors":["Yansheng Li","Linlin Wang","Tingzhu Wang","Xue Yang","Junwei Luo","Qi Wang","Youming Deng","Wenbin Wang","Xian Sun","Haifeng Li","Bo Dang","Yongjun Zhang","Yi Yu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2406.09410v2.pdf","comment":"This paper releases a SAI-oriented SGG toolkit with about 30 OBD\n methods and 10 SGG methods, and develops a benchmark based on RSG where our\n HOD-Net and RPCM significantly outperform the state-of-the-art methods in\n both OBD and SGG tasks. The RSG dataset and SAI-oriented toolkit will be made\n publicly available at https://linlin-dev.github.io/project/RSG"},{"id":"http://arxiv.org/abs/2406.16852v2","updated":"2024-07-01T02:59:29Z","published":"2024-06-24T17:58:06Z","title":"Long Context Transfer from Language to Vision","summary":" Video sequences offer valuable temporal information, but existing large\nmultimodal models (LMMs) fall short in understanding extremely long videos.\nMany works address this by reducing the number of visual tokens using visual\nresamplers. Alternatively, in this paper, we approach this problem from the\nperspective of the language model. By simply extrapolating the context length\nof the language backbone, we enable LMMs to comprehend orders of magnitude more\nvisual tokens without any video training. We call this phenomenon long context\ntransfer and carefully ablate its properties. To effectively measure LMMs'\nability to generalize to long contexts in the vision modality, we develop\nV-NIAH (Visual Needle-In-A-Haystack), a purely synthetic long vision benchmark\ninspired by the language model's NIAH test. Our proposed Long Video Assistant\n(LongVA) can process 2000 frames or over 200K visual tokens without additional\ncomplexities. With its extended context length, LongVA achieves\nstate-of-the-art performance on Video-MME among 7B-scale models by densely\nsampling more input frames. Our work is open-sourced at\nhttps://github.com/EvolvingLMMs-Lab/LongVA.\n","authors":["Peiyuan Zhang","Kaichen Zhang","Bo Li","Guangtao Zeng","Jingkang Yang","Yuanhan Zhang","Ziyue Wang","Haoran Tan","Chunyuan Li","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.16852v2.pdf","comment":"Code, demo, and models are available at\n https://github.com/EvolvingLMMs-Lab/LongVA"},{"id":"http://arxiv.org/abs/2406.18070v4","updated":"2024-07-01T02:44:11Z","published":"2024-06-26T05:01:37Z","title":"EgoVideo: Exploring Egocentric Foundation Model and Downstream\n Adaptation","summary":" In this report, we present our solutions to the EgoVis Challenges in CVPR\n2024, including five tracks in the Ego4D challenge and three tracks in the\nEPIC-Kitchens challenge. Building upon the video-language two-tower model and\nleveraging our meticulously organized egocentric video data, we introduce a\nnovel foundation model called EgoVideo. This model is specifically designed to\ncater to the unique characteristics of egocentric videos and provides strong\nsupport for our competition submissions. In the Ego4D challenges, we tackle\nvarious tasks including Natural Language Queries, Step Grounding, Moment\nQueries, Short-term Object Interaction Anticipation, and Long-term Action\nAnticipation. In addition, we also participate in the EPIC-Kitchens challenge,\nwhere we engage in the Action Recognition, Multiple Instance Retrieval, and\nDomain Adaptation for Action Recognition tracks. 
By adapting EgoVideo to these\ndiverse tasks, we showcase its versatility and effectiveness in different\negocentric video analysis scenarios, demonstrating the powerful representation\nability of EgoVideo as an egocentric foundation model. Our codebase and\npretrained models are publicly available at\nhttps://github.com/OpenGVLab/EgoVideo.\n","authors":["Baoqi Pei","Guo Chen","Jilan Xu","Yuping He","Yicheng Liu","Kanghua Pan","Yifei Huang","Yali Wang","Tong Lu","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2406.18070v4.pdf","comment":"Champion solutions in the EgoVis CVPR 2024 workshop"},{"id":"http://arxiv.org/abs/2405.19387v2","updated":"2024-07-01T02:31:53Z","published":"2024-05-29T17:56:31Z","title":"Video Anomaly Detection in 10 Years: A Survey and Outlook","summary":" Video anomaly detection (VAD) holds immense importance across diverse domains\nsuch as surveillance, healthcare, and environmental monitoring. While numerous\nsurveys focus on conventional VAD methods, they often lack depth in exploring\nspecific approaches and emerging trends. This survey explores deep\nlearning-based VAD, expanding beyond traditional supervised training paradigms\nto encompass emerging weakly supervised, self-supervised, and unsupervised\napproaches. A prominent feature of this review is the investigation of core\nchallenges within the VAD paradigms including large-scale datasets, features\nextraction, learning methods, loss functions, regularization, and anomaly score\nprediction. Moreover, this review also investigates the vision language models\n(VLMs) as potent feature extractors for VAD. VLMs integrate visual data with\ntextual descriptions or spoken language from videos, enabling a nuanced\nunderstanding of scenes crucial for anomaly detection. By addressing these\nchallenges and proposing future research directions, this review aims to foster\nthe development of robust and efficient VAD systems leveraging the capabilities\nof VLMs for enhanced anomaly detection in complex real-world scenarios. This\ncomprehensive analysis seeks to bridge existing knowledge gaps, provide\nresearchers with valuable insights, and contribute to shaping the future of VAD\nresearch.\n","authors":["Moshira Abdalla","Sajid Javed","Muaz Al Radi","Anwaar Ulhaq","Naoufel Werghi"],"pdf_url":"https://arxiv.org/pdf/2405.19387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20469v2","updated":"2024-07-01T02:28:26Z","published":"2024-05-30T20:37:34Z","title":"Is Synthetic Data all We Need? Benchmarking the Robustness of Models\n Trained with Synthetic Images","summary":" A long-standing challenge in developing machine learning approaches has been\nthe lack of high-quality labeled data. Recently, models trained with purely\nsynthetic data, here termed synthetic clones, generated using large-scale\npre-trained diffusion models have shown promising results in overcoming this\nannotation bottleneck. As these synthetic clone models progress, they are\nlikely to be deployed in challenging real-world settings, yet their suitability\nremains understudied. Our work addresses this gap by providing the first\nbenchmark for three classes of synthetic clone models, namely supervised,\nself-supervised, and multi-modal ones, across a range of robustness measures.\nWe show that existing synthetic self-supervised and multi-modal clones are\ncomparable to or outperform state-of-the-art real-image baselines for a range\nof robustness metrics - shape bias, background bias, calibration, etc. 
However,\nwe also find that synthetic clones are much more susceptible to adversarial and\nreal-world noise than models trained with real data. To address this, we find\nthat combining both real and synthetic data further increases the robustness,\nand that the choice of prompt used for generating synthetic images plays an\nimportant part in the robustness of synthetic clones.\n","authors":["Krishnakant Singh","Thanush Navaratnam","Jannik Holmer","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2405.20469v2.pdf","comment":"Accepted at CVPR 2024 Workshop: SyntaGen-Harnessing Generative Models\n for Synthetic Visual Datasets. Project page at\n https://synbenchmark.github.io/SynCloneBenchmark Comments: Fix typo in Fig. 1"},{"id":"http://arxiv.org/abs/2405.18334v3","updated":"2024-07-01T02:10:50Z","published":"2024-05-28T16:28:51Z","title":"SketchQL Demonstration: Zero-shot Video Moment Querying with Sketches","summary":" In this paper, we will present SketchQL, a video database management system\n(VDBMS) for retrieving video moments with a sketch-based query interface. This\nnovel interface allows users to specify object trajectory events with simple\nmouse drag-and-drop operations. Users can use trajectories of single objects as\nbuilding blocks to compose complex events. Using a pre-trained model that\nencodes trajectory similarity, SketchQL achieves zero-shot video moments\nretrieval by performing similarity searches over the video to identify clips\nthat are the most similar to the visual query. In this demonstration, we\nintroduce the graphic user interface of SketchQL and detail its functionalities\nand interaction mechanisms. We also demonstrate the end-to-end usage of\nSketchQL from query composition to video moments retrieval using real-world\nscenarios.\n","authors":["Renzhi Wu","Pramod Chunduri","Dristi J Shah","Ashmitha Julius Aravind","Ali Payani","Xu Chu","Joy Arulraj","Kexin Rong"],"pdf_url":"https://arxiv.org/pdf/2405.18334v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19602v2","updated":"2024-07-01T02:10:16Z","published":"2024-06-28T02:18:16Z","title":"A Survey on Deep Clustering: From the Prior Perspective","summary":" Facilitated by the powerful feature extraction ability of neural networks,\ndeep clustering has achieved great success in analyzing high-dimensional and\ncomplex real-world data. The performance of deep clustering methods is affected\nby various factors such as network structures and learning objectives. However,\nas pointed out in this survey, the essence of deep clustering lies in the\nincorporation and utilization of prior knowledge, which is largely ignored by\nexisting works. From pioneering deep clustering methods based on data structure\nassumptions to recent contrastive clustering methods based on data augmentation\ninvariances, the development of deep clustering intrinsically corresponds to\nthe evolution of prior knowledge. In this survey, we provide a comprehensive\nreview of deep clustering methods by categorizing them into six types of prior\nknowledge. We find that in general the prior innovation follows two trends,\nnamely, i) from mining to constructing, and ii) from internal to external.\nBesides, we provide a benchmark on five widely-used datasets and analyze the\nperformance of methods with diverse priors. 
By providing a novel prior\nknowledge perspective, we hope this survey could provide some novel insights\nand inspire future research in the deep clustering community.\n","authors":["Yiding Lu","Haobin Li","Yunfan Li","Yijie Lin","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2406.19602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04940v3","updated":"2024-07-01T02:06:31Z","published":"2024-05-08T10:15:04Z","title":"Harnessing the Power of MLLMs for Transferable Text-to-Image Person ReID","summary":" Text-to-image person re-identification (ReID) retrieves pedestrian images\naccording to textual descriptions. Manually annotating textual descriptions is\ntime-consuming, restricting the scale of existing datasets and therefore the\ngeneralization ability of ReID models. As a result, we study the transferable\ntext-to-image ReID problem, where we train a model on our proposed large-scale\ndatabase and directly deploy it to various datasets for evaluation. We obtain\nsubstantial training data via Multi-modal Large Language Models (MLLMs).\nMoreover, we identify and address two key challenges in utilizing the obtained\ntextual descriptions. First, an MLLM tends to generate descriptions with\nsimilar structures, causing the model to overfit specific sentence patterns.\nThus, we propose a novel method that uses MLLMs to caption images according to\nvarious templates. These templates are obtained using a multi-turn dialogue\nwith a Large Language Model (LLM). Therefore, we can build a large-scale\ndataset with diverse textual descriptions. Second, an MLLM may produce\nincorrect descriptions. Hence, we introduce a novel method that automatically\nidentifies words in a description that do not correspond with the image. This\nmethod is based on the similarity between one text and all patch token\nembeddings in the image. Then, we mask these words with a larger probability in\nthe subsequent training epoch, alleviating the impact of noisy textual\ndescriptions. The experimental results demonstrate that our methods\nsignificantly boost the direct transfer text-to-image ReID performance.\nBenefiting from the pre-trained model weights, we also achieve state-of-the-art\nperformance in the traditional evaluation settings.\n","authors":["Wentao Tan","Changxing Ding","Jiayu Jiang","Fei Wang","Yibing Zhan","Dapeng Tao"],"pdf_url":"https://arxiv.org/pdf/2405.04940v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2406.16695v2","updated":"2024-07-01T01:55:12Z","published":"2024-06-24T14:58:17Z","title":"Geometry-Aware Score Distillation via 3D Consistent Noising and Gradient\n Consistency Modeling","summary":" Score distillation sampling (SDS), the methodology in which the score from\npretrained 2D diffusion models is distilled into 3D representation, has\nrecently brought significant advancements in text-to-3D generation task.\nHowever, this approach is still confronted with critical geometric\ninconsistency problems such as the Janus problem. Starting from a hypothesis\nthat such inconsistency problems may be induced by multiview inconsistencies\nbetween 2D scores predicted from various viewpoints, we introduce GSD, a simple\nand general plug-and-play framework for incorporating 3D consistency and\ntherefore geometry awareness into the SDS process. 
Our methodology is composed\nof three components: 3D consistent noising, designed to produce 3D consistent\nnoise maps that perfectly follow the standard Gaussian distribution,\ngeometry-based gradient warping for identifying correspondences between\npredicted gradients of different viewpoints, and novel gradient consistency\nloss to optimize the scene geometry toward producing more consistent gradients.\nWe demonstrate that our method significantly improves performance, successfully\naddressing the geometric inconsistency problems in text-to-3D generation task\nwith minimal computation cost and being compatible with existing score\ndistillation-based models. Our project page is available at\nhttps://ku-cvlab.github.io/GSD/.\n","authors":["Min-Seop Kwak","Donghoon Ahn","Ines Hyeonsu Kim","Jin-Hwa Kim","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2406.16695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19941v2","updated":"2024-07-01T01:06:59Z","published":"2024-06-28T14:17:16Z","title":"GRACE: Graph-Regularized Attentive Convolutional Entanglement with\n Laplacian Smoothing for Robust DeepFake Video Detection","summary":" As DeepFake video manipulation techniques escalate, posing profound threats,\nthe urgent need to develop efficient detection strategies is underscored.\nHowever, one particular issue lies with facial images being mis-detected, often\noriginating from degraded videos or adversarial attacks, leading to unexpected\ntemporal artifacts that can undermine the efficacy of DeepFake video detection\ntechniques. This paper introduces a novel method for robust DeepFake video\ndetection, harnessing the power of the proposed Graph-Regularized Attentive\nConvolutional Entanglement (GRACE) based on the graph convolutional network\nwith graph Laplacian to address the aforementioned challenges. First,\nconventional Convolution Neural Networks are deployed to perform spatiotemporal\nfeatures for the entire video. Then, the spatial and temporal features are\nmutually entangled by constructing a graph with sparse constraint, enforcing\nessential features of valid face images in the noisy face sequences remaining,\nthus augmenting stability and performance for DeepFake video detection.\nFurthermore, the Graph Laplacian prior is proposed in the graph convolutional\nnetwork to remove the noise pattern in the feature space to further improve the\nperformance. Comprehensive experiments are conducted to illustrate that our\nproposed method delivers state-of-the-art performance in DeepFake video\ndetection under noisy face sequences. The source code is available at\nhttps://github.com/ming053l/GRACE.\n","authors":["Chih-Chung Hsu","Shao-Ning Chen","Mei-Hsuan Wu","Yi-Fang Wang","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2406.19941v2.pdf","comment":"Submitted to TPAMI 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.18984v2","updated":"2024-07-01T12:04:39Z","published":"2024-06-27T08:26:20Z","title":"Amplify Graph Learning for Recommendation via Sparsity Completion","summary":" Graph learning models have been widely deployed in collaborative filtering\n(CF) based recommendation systems. Due to the issue of data sparsity, the graph\nstructure of the original input lacks potential positive preference edges,\nwhich significantly reduces the performance of recommendations. In this paper,\nwe study how to enhance the graph structure for CF more effectively, thereby\noptimizing the representation of graph nodes. 
Previous works introduced matrix\ncompletion techniques into CF, proposing the use of either stochastic\ncompletion methods or superficial structure completion to address this issue.\nHowever, most of these approaches employ random numerical filling that lack\ncontrol over noise perturbations and limit the in-depth exploration of\nhigher-order interaction features of nodes, resulting in biased graph\nrepresentations.\n In this paper, we propose an Amplify Graph Learning framework based on\nSparsity Completion (called AGL-SC). First, we utilize graph neural network to\nmine direct interaction features between user and item nodes, which are used as\nthe inputs of the encoder. Second, we design a factorization-based method to\nmine higher-order interaction features. These features serve as perturbation\nfactors in the latent space of the hidden layer to facilitate generative\nenhancement. Finally, by employing the variational inference, the above\nmulti-order features are integrated to implement the completion and enhancement\nof missing graph structures. We conducted benchmark and strategy experiments on\nfour real-world datasets related to recommendation tasks. The experimental\nresults demonstrate that AGL-SC significantly outperforms the state-of-the-art\nmethods.\n","authors":["Peng Yuan","Haojie Li","Minying Fang","Xu Yu","Yongjing Hao","Junwei Du"],"pdf_url":"https://arxiv.org/pdf/2406.18984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05074v2","updated":"2024-07-01T03:11:08Z","published":"2023-05-08T22:23:32Z","title":"Autumn: A Scalable Read Optimized LSM-tree based Key-Value Stores with\n Fast Point and Range Read Speed","summary":" The Log Structured Merge Trees (LSM-tree) based key-value stores are widely\nused in many storage systems to support a variety of operations such as\nupdates, point reads, and range reads. Traditionally, LSM-tree's merge policy\norganizes data into multiple levels of exponentially increasing capacity to\nsupport high-speed writes. However, we contend that the traditional merge\npolicies are not optimized for reads. In this work, we present Autumn, a\nscalable and read optimized LSM-tree based key-value stores with minimal point\nand range read cost. The key idea in improving the read performance is to\ndynamically adjust the capacity ratio between two adjacent levels as more data\nare stored. As a result, smaller levels gradually increase their capacities and\nmerge more often. In particular, the point and range read cost improves from\nthe previous best known $O(logN)$ complexity to $O(\\sqrt{logN})$ in Autumn by\napplying the novel Garnering merge policy. While Garnering merge policy\noptimizes for both point reads and range reads, it maintains high performance\nfor updates. Moreover, to further improve the update costs, Autumn uses a small\namount of bounded space of DRAM to pin/keep the first level of LSM-tree. We\nimplemented Autumn on top of LevelDB and experimentally showcases the gain in\nperformance for real world workloads.\n","authors":["Fuheng Zhao","Zach Miller","Leron Reznikov","Divyakant Agrawal","Amr El Abbadi"],"pdf_url":"https://arxiv.org/pdf/2305.05074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14572v3","updated":"2024-07-01T01:33:10Z","published":"2024-06-13T17:53:29Z","title":"Bioptic -- A Target-Agnostic Potency-Based Small Molecules Search Engine","summary":" Recent successes in virtual screening have been made possible by large models\nand extensive chemical libraries. 
However, combining these elements is\nchallenging: the larger the model, the more expensive it is to run, making\nultra-large libraries unfeasible. To address this, we developed a\ntarget-agnostic, efficacy-based molecule search model, which allows us to find\nstructurally dissimilar molecules with similar biological activities. We used\nthe best practices to design fast retrieval system, based on\nprocessor-optimized SIMD instructions, enabling us to screen the ultra-large\n40B Enamine REAL library with 100\\% recall rate. We extensively benchmarked our\nmodel and several state-of-the-art models for both speed performance and\nretrieval quality of novel molecules.\n","authors":["Vlad Vinogradov","Ivan Izmailov","Simon Steshin","Kong T. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.14572v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01732v1","updated":"2024-07-01T19:05:40Z","published":"2024-07-01T19:05:40Z","title":"Investigating Nudges toward Related Sellers on E-commerce Marketplaces:\n A Case Study on Amazon","summary":" E-commerce marketplaces provide business opportunities to millions of sellers\nworldwide. Some of these sellers have special relationships with the\nmarketplace by virtue of using their subsidiary services (e.g., fulfillment\nand/or shipping services provided by the marketplace) -- we refer to such\nsellers collectively as Related Sellers. When multiple sellers offer to sell\nthe same product, the marketplace helps a customer in selecting an offer (by a\nseller) through (a) a default offer selection algorithm, (b) showing features\nabout each of the offers and the corresponding sellers (price, seller\nperformance metrics, seller's number of ratings etc.), and (c) finally\nevaluating the sellers along these features. In this paper, we perform an\nend-to-end investigation into how the above apparatus can nudge customers\ntoward the Related Sellers on Amazon's four different marketplaces in India,\nUSA, Germany and France. We find that given explicit choices, customers'\npreferred offers and algorithmically selected offers can be significantly\ndifferent. We highlight that Amazon is adopting different performance metric\nevaluation policies for different sellers, potentially benefiting Related\nSellers. For instance, such policies result in notable discrepancy between the\nactual performance metric and the presented performance metric of Related\nSellers. We further observe that among the seller-centric features visible to\ncustomers, sellers' number of ratings influences their decisions the most, yet\nit may not reflect the true quality of service by the seller, rather reflecting\nthe scale at which the seller operates, thereby implicitly steering customers\ntoward larger Related Sellers. Moreover, when customers are shown the rectified\nmetrics for the different sellers, their preference toward Related Sellers is\nalmost halved.\n","authors":["Abhisek Dash","Abhijnan Chakraborty","Saptarshi Ghosh","Animesh Mukherjee","Krishna P. Gummadi"],"pdf_url":"https://arxiv.org/pdf/2407.01732v1.pdf","comment":"This work has been accepted for presentation at the ACM Conference on\n Computer-Supported Cooperative Work and Social Computing (CSCW) 2024. 
It will\n appear in Proceedings of the ACM on Human-Computer Interaction"},{"id":"http://arxiv.org/abs/2407.01433v1","updated":"2024-07-01T16:23:45Z","published":"2024-07-01T16:23:45Z","title":"POST: Email Archival, Processing and Flagging Stack for Incident\n Responders","summary":" Phishing is one of the main points of compromise, with email security and\nawareness being estimated at \\$50-100B in 2022. There is great need for email\nforensics capability to quickly search for malicious content. A novel solution\nPOST is proposed. POST is an API driven serverless email archival, processing,\nand flagging workflow for both large and small organizations that collects and\nparses all email, flags emails using state of the art Natural Language\nProcessing and Machine Learning, allows full email searching on every aspect of\nan email, and provides a cost savings of up to 68.6%.\n","authors":["Jeffrey Fairbanks"],"pdf_url":"https://arxiv.org/pdf/2407.01433v1.pdf","comment":"This work was performed under the auspices of the U.S. Department of\n Energy by Lawrence Livermore National Laboratory under Contract\n DE-AC52-07NA27344. For further information or questions please reach out to\n fairbanks6@llnl.gov"},{"id":"http://arxiv.org/abs/2407.01424v1","updated":"2024-07-01T16:14:25Z","published":"2024-07-01T16:14:25Z","title":"A Global-Local Attention Mechanism for Relation Classification","summary":" Relation classification, a crucial component of relation extraction, involves\nidentifying connections between two entities. Previous studies have\npredominantly focused on integrating the attention mechanism into relation\nclassification at a global scale, overlooking the importance of the local\ncontext. To address this gap, this paper introduces a novel global-local\nattention mechanism for relation classification, which enhances global\nattention with a localized focus. Additionally, we propose innovative hard and\nsoft localization mechanisms to identify potential keywords for local\nattention. By incorporating both hard and soft localization strategies, our\napproach offers a more nuanced and comprehensive understanding of the\ncontextual cues that contribute to effective relation classification. Our\nexperimental results on the SemEval-2010 Task 8 dataset highlight the superior\nperformance of our method compared to previous attention-based approaches in\nrelation classification.\n","authors":["Yiping Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01424v1.pdf","comment":"This paper has been accepted by the 2024 20th International\n Conference on Natural Computation, Fuzzy Systems and Knowledge Discovery\n (ICNC-FSKD)"},{"id":"http://arxiv.org/abs/2407.01403v1","updated":"2024-07-01T15:53:29Z","published":"2024-07-01T15:53:29Z","title":"Optimization of Retrieval-Augmented Generation Context with Outlier\n Detection","summary":" In this paper, we focus on methods to reduce the size and improve the quality\nof the prompt context required for question-answering systems. Attempts to\nincrease the number of retrieved chunked documents and thereby enlarge the\ncontext related to the query can significantly complicate the processing and\ndecrease the performance of a Large Language Model (LLM) when generating\nresponses to queries. It is well known that a large set of documents retrieved\nfrom a database in response to a query may contain irrelevant information,\nwhich often leads to hallucinations in the resulting answers. 
Our goal is to\nselect the most semantically relevant documents, treating the discarded ones as\noutliers. We propose and evaluate several methods for identifying outliers by\ncreating features that utilize the distances of embedding vectors, retrieved\nfrom the vector database, to both the centroid and the query vectors. The\nmethods were evaluated by comparing the similarities of the retrieved LLM\nresponses to ground-truth answers obtained using the OpenAI GPT-4o model. It\nwas found that the greatest improvements were achieved with increasing\ncomplexity of the questions and answers.\n","authors":["Vitaly Bulgakov"],"pdf_url":"https://arxiv.org/pdf/2407.01403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01373v1","updated":"2024-07-01T15:25:31Z","published":"2024-07-01T15:25:31Z","title":"Evaluation of Temporal Change in IR Test Collections","summary":" Information retrieval systems have been evaluated using the Cranfield\nparadigm for many years. This paradigm allows a systematic, fair, and\nreproducible evaluation of different retrieval methods in fixed experimental\nenvironments. However, real-world retrieval systems must cope with dynamic\nenvironments and temporal changes that affect the document collection, topical\ntrends, and the individual user's perception of what is considered relevant.\nYet, the temporal dimension in IR evaluations is still understudied.\n To this end, this work investigates how the temporal generalizability of\neffectiveness evaluations can be assessed. As a conceptual model, we generalize\nCranfield-type experiments to the temporal context by classifying the change in\nthe essential components according to the create, update, and delete operations\nof persistent storage known from CRUD. From the different types of change\ndifferent evaluation scenarios are derived and it is outlined what they imply.\nBased on these scenarios, renowned state-of-the-art retrieval systems are\ntested and it is investigated how the retrieval effectiveness changes on\ndifferent levels of granularity.\n We show that the proposed measures can be well adapted to describe the\nchanges in the retrieval results. The experiments conducted confirm that the\nretrieval effectiveness strongly depends on the evaluation scenario\ninvestigated. We find that not only the average retrieval performance of single\nsystems but also the relative system performance are strongly affected by the\ncomponents that change and to what extent these components changed.\n","authors":["Jüri Keller","Timo Breuer","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2407.01373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01102v1","updated":"2024-07-01T09:09:27Z","published":"2024-07-01T09:09:27Z","title":"BERGEN: A Benchmarking Library for Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation allows to enhance Large Language Models with\nexternal knowledge. In response to the recent popularity of generative LLMs,\nmany RAG approaches have been proposed, which involve an intricate number of\ndifferent configurations such as evaluation datasets, collections, metrics,\nretrievers, and LLMs. Inconsistent benchmarking poses a major challenge in\ncomparing approaches and understanding the impact of each component in the\npipeline. In this work, we study best practices that lay the groundwork for a\nsystematic evaluation of RAG and present BERGEN, an end-to-end library for\nreproducible research standardizing RAG experiments. 
In an extensive study\nfocusing on QA, we benchmark different state-of-the-art retrievers, rerankers,\nand LLMs. Additionally, we analyze existing RAG metrics and datasets. Our\nopen-source library BERGEN is available under\n\\url{https://github.com/naver/bergen}.\n","authors":["David Rau","Hervé Déjean","Nadezhda Chirkova","Thibault Formal","Shuai Wang","Vassilina Nikoulina","Stéphane Clinchant"],"pdf_url":"https://arxiv.org/pdf/2407.01102v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2407.01069v1","updated":"2024-07-01T08:19:19Z","published":"2024-07-01T08:19:19Z","title":"Deep Domain Specialisation for single-model multi-domain learning to\n rank","summary":" Information Retrieval (IR) practitioners often train separate ranking models\nfor different domains (geographic regions, languages, stores, websites,...) as\nit is believed that exclusively training on in-domain data yields the best\nperformance when sufficient data is available. Despite their performance gains,\ntraining multiple models comes at a higher cost to train, maintain and update\ncompared to having only a single model responsible for all domains. Our work\nexplores consolidated ranking models that serve multiple domains. Specifically,\nwe propose a novel architecture of Deep Domain Specialisation (DDS) to\nconsolidate multiple domains into a single model. We compare our proposal\nagainst Deep Domain Adaptation (DDA) and a set of baseline for multi-domain\nmodels. In our experiments, DDS performed the best overall while requiring\nfewer parameters per domain as other baselines. We show the efficacy of our\nmethod both with offline experimentation and on a large-scale online experiment\non Amazon customer traffic.\n","authors":["Paul Missault","Abdelmaseeh Felfel"],"pdf_url":"https://arxiv.org/pdf/2407.01069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00942v1","updated":"2024-07-01T03:50:23Z","published":"2024-07-01T03:50:23Z","title":"ProductAgent: Benchmarking Conversational Product Search Agent with\n Asking Clarification Questions","summary":" This paper introduces the task of product demand clarification within an\ne-commercial scenario, where the user commences the conversation with ambiguous\nqueries and the task-oriented agent is designed to achieve more accurate and\ntailored product searching by asking clarification questions. To address this\ntask, we propose ProductAgent, a conversational information seeking agent\nequipped with abilities of strategic clarification question generation and\ndynamic product retrieval. Specifically, we develop the agent with strategies\nfor product feature summarization, query generation, and product retrieval.\nFurthermore, we propose the benchmark called PROCLARE to evaluate the agent's\nperformance both automatically and qualitatively with the aid of a LLM-driven\nuser simulator. Experiments show that ProductAgent interacts positively with\nthe user and enhances retrieval performance with increasing dialogue turns,\nwhere user demands become gradually more explicit and detailed. All the source\ncodes will be released after the review anonymity period.\n","authors":["Jingheng Ye","Yong Jiang","Xiaobin Wang","Yinghui Li","Yangning Li","Hai-Tao Zheng","Pengjun Xie","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2407.00942v1.pdf","comment":"17 pages, 13 tables, 6 figures. 
Under review"},{"id":"http://arxiv.org/abs/2407.00912v1","updated":"2024-07-01T02:36:03Z","published":"2024-07-01T02:36:03Z","title":"Unified Dual-Intent Translation for Joint Modeling of Search and\n Recommendation","summary":" Recommendation systems, which assist users in discovering their preferred\nitems among numerous options, have served billions of users across various\nonline platforms. Intuitively, users' interactions with items are highly driven\nby their unchanging inherent intents (e.g., always preferring high-quality\nitems) and changing demand intents (e.g., wanting a T-shirt in summer but a\ndown jacket in winter). However, both types of intents are implicitly expressed\nin recommendation scenario, posing challenges in leveraging them for accurate\nintent-aware recommendations. Fortunately, in search scenario, often found\nalongside recommendation on the same online platform, users express their\ndemand intents explicitly through their query words. Intuitively, in both\nscenarios, a user shares the same inherent intent and the interactions may be\ninfluenced by the same demand intent. It is therefore feasible to utilize the\ninteraction data from both scenarios to reinforce the dual intents for joint\nintent-aware modeling. But the joint modeling should deal with two problems: 1)\naccurately modeling users' implicit demand intents in recommendation; 2)\nmodeling the relation between the dual intents and the interactive items. To\naddress these problems, we propose a novel model named Unified Dual-Intents\nTranslation for joint modeling of Search and Recommendation (UDITSR). To\naccurately simulate users' demand intents in recommendation, we utilize real\nqueries from search data as supervision information to guide its generation. To\nexplicitly model the relation among the triplet , we propose a dual-intent translation propagation\nmechanism to learn the triplet in the same semantic space via embedding\ntranslations. Extensive experiments demonstrate that UDITSR outperforms SOTA\nbaselines both in search and recommendation tasks.\n","authors":["Yuting Zhang","Yiqing Wu","Ruidong Han","Ying Sun","Yongchun Zhu","Xiang Li","Wei Lin","Fuzhen Zhuang","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2407.00912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00909v1","updated":"2024-07-01T02:27:54Z","published":"2024-07-01T02:27:54Z","title":"Heterogeneous Graph-based Framework with Disentangled Representations\n Learning for Multi-target Cross Domain Recommendation","summary":" CDR (Cross-Domain Recommendation), i.e., leveraging information from multiple\ndomains, is a critical solution to data sparsity problem in recommendation\nsystem. The majority of previous research either focused on single-target CDR\n(STCDR) by utilizing data from the source domains to improve the model's\nperformance on the target domain, or applied dual-target CDR (DTCDR) by\nintegrating data from the source and target domains. In addition, multi-target\nCDR (MTCDR) is a generalization of DTCDR, which is able to capture the link\namong different domains. In this paper we present HGDR (Heterogeneous\nGraph-based Framework with Disentangled Representations Learning), an\nend-to-end heterogeneous network architecture where graph convolutional layers\nare applied to model relations among different domains, meanwhile utilizes the\nidea of disentangling representation for domain-shared and domain-specifc\ninformation. 
First, a shared heterogeneous graph is generated by gathering\nusers and items from several domains without any further side information.\nSecond, we use HGDR to compute disentangled representations for users and items\nin all domains. Experiments on real-world datasets and online A/B tests prove\nthat our proposed model can transmit information among domains effectively and\nreach SOTA performance.\n","authors":["Xiaopeng Liu","Juan Zhang","Chongqi Ren","Shenghui Xu","Zhaoming Pan","Zhimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.00909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02536v1","updated":"2024-07-01T21:03:04Z","published":"2024-07-01T21:03:04Z","title":"Reducing False Discoveries in Statistically-Significant\n Regional-Colocation Mining: A Summary of Results","summary":" Given a set \\emph{S} of spatial feature types, its feature instances, a study\narea, and a neighbor relationship, the goal is to find pairs $<$a region\n($r_{g}$), a subset \\emph{C} of \\emph{S}$>$ such that \\emph{C} is a\nstatistically significant regional-colocation pattern in $r_{g}$. This problem\nis important for applications in various domains including ecology, economics,\nand sociology. The problem is computationally challenging due to the\nexponential number of regional colocation patterns and candidate regions.\nPreviously, we proposed a miner \\cite{10.1145/3557989.3566158} that finds\nstatistically significant regional colocation patterns. However, the numerous\nsimultaneous statistical inferences raise the risk of false discoveries (also\nknown as the multiple comparisons problem) and carry a high computational cost.\nWe propose a novel algorithm, namely, multiple comparisons regional colocation\nminer (MultComp-RCM) which uses a Bonferroni correction. Theoretical analysis,\nexperimental evaluation, and case study results show that the proposed method\nreduces both the false discovery rate and computational cost.\n","authors":["Subhankar Ghosh","Jayant Gupta","Arun Sharma","Shuai An","Shashi Shekhar"],"pdf_url":"https://arxiv.org/pdf/2407.02536v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.16793v4","updated":"2024-07-01T17:46:19Z","published":"2024-06-24T16:56:41Z","title":"Adam-mini: Use Fewer Learning Rates To Gain More","summary":" We propose Adam-mini, an optimizer that achieves on-par or better performance\nthan AdamW with 45% to 50% less memory footprint. Adam-mini reduces memory by\ncutting down the learning rate resources in Adam (i.e., $1/\\sqrt{v}$). We find\nthat $\\geq$ 90% of these learning rates in $v$ could be harmlessly removed if\nwe (1) carefully partition the parameters into blocks following our proposed\nprinciple on Hessian structure; (2) assign a single but good learning rate to\neach parameter block. We further find that, for each of these parameter blocks,\nthere exists a single high-quality learning rate that can outperform Adam,\nprovided that sufficient resources are available to search it out. We then\nprovide one cost-effective way to find good learning rates and propose\nAdam-mini. Empirically, we verify that Adam-mini performs on par or better than\nAdamW on various language models sized from 125M to 7B for pre-training,\nsupervised fine-tuning, and RLHF. The reduced memory footprint of Adam-mini\nalso alleviates communication overheads among GPUs and CPUs, thereby increasing\nthroughput. 
For instance, Adam-mini achieves 49.6% higher throughput than AdamW\nwhen pre-training Llama2-7B on $2\\times$ A800-80GB GPUs, which saves 33%\nwall-clock time for pre-training.\n","authors":["Yushun Zhang","Congliang Chen","Ziniu Li","Tian Ding","Chenwei Wu","Yinyu Ye","Zhi-Quan Luo","Ruoyu Sun"],"pdf_url":"https://arxiv.org/pdf/2406.16793v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17055v2","updated":"2024-07-01T17:29:54Z","published":"2024-06-24T18:15:27Z","title":"Large Language Models Assume People are More Rational than We Really are","summary":" In order for AI systems to communicate effectively with people, they must\nunderstand how we make decisions. However, people's decisions are not always\nrational, so the implicit internal models of human decision-making in Large\nLanguage Models (LLMs) must account for this. Previous empirical evidence seems\nto suggest that these implicit models are accurate -- LLMs offer believable\nproxies of human behavior, acting how we expect humans would in everyday\ninteractions. However, by comparing LLM behavior and predictions to a large\ndataset of human decisions, we find that this is actually not the case: when\nboth simulating and predicting people's choices, a suite of cutting-edge LLMs\n(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more\nrational than we really are. Specifically, these models deviate from human\nbehavior and align more closely with a classic model of rational choice --\nexpected value theory. Interestingly, people also tend to assume that other\npeople are rational when interpreting their behavior. As a consequence, when we\ncompare the inferences that LLMs and people draw from the decisions of others\nusing another psychological dataset, we find that these inferences are highly\ncorrelated. Thus, the implicit decision-making models of LLMs appear to be\naligned with the human expectation that other people will act rationally,\nrather than with how people actually act.\n","authors":["Ryan Liu","Jiayi Geng","Joshua C. Peterson","Ilia Sucholutsky","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.17055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19917v3","updated":"2024-07-01T17:26:23Z","published":"2023-10-30T18:29:15Z","title":"Unmasking Bias in AI: A Systematic Review of Bias Detection and\n Mitigation Strategies in Electronic Health Record-based Models","summary":" Objectives: Leveraging artificial intelligence (AI) in conjunction with\nelectronic health records (EHRs) holds transformative potential to improve\nhealthcare. Yet, addressing bias in AI, which risks worsening healthcare\ndisparities, cannot be overlooked. This study reviews methods to detect and\nmitigate diverse forms of bias in AI models developed using EHR data. Methods:\nWe conducted a systematic review following the Preferred Reporting Items for\nSystematic Reviews and Meta-analyses (PRISMA) guidelines, analyzing articles\nfrom PubMed, Web of Science, and IEEE published between January 1, 2010, and\nDec 17, 2023. The review identified key biases, outlined strategies for\ndetecting and mitigating bias throughout the AI model development process, and\nanalyzed metrics for bias assessment. Results: Of the 450 articles retrieved,\n20 met our criteria, revealing six major bias types: algorithmic, confounding,\nimplicit, measurement, selection, and temporal. The AI models were primarily\ndeveloped for predictive tasks in healthcare settings. 
Four studies\nconcentrated on the detection of implicit and algorithmic biases employing\nfairness metrics like statistical parity, equal opportunity, and predictive\nequity. Sixty proposed various strategies for mitigating biases, especially\ntargeting implicit and selection biases. These strategies, evaluated through\nboth performance (e.g., accuracy, AUROC) and fairness metrics, predominantly\ninvolved data collection and preprocessing techniques like resampling,\nreweighting, and transformation. Discussion: This review highlights the varied\nand evolving nature of strategies to address bias in EHR-based AI models,\nemphasizing the urgent needs for the establishment of standardized,\ngeneralizable, and interpretable methodologies to foster the creation of\nethical AI systems that promote fairness and equity in healthcare.\n","authors":["Feng Chen","Liqin Wang","Julie Hong","Jiaqi Jiang","Li Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.19917v3.pdf","comment":"Published in JAMIA Volume 31, Issue 5, May 2024"},{"id":"http://arxiv.org/abs/2308.13320v3","updated":"2024-07-01T17:14:27Z","published":"2023-08-25T11:49:51Z","title":"Fine-tuning can cripple your foundation model; preserving features may\n be the solution","summary":" Pre-trained foundation models, due to their enormous capacity and exposure to\nvast amounts of data during pre-training, are known to have learned plenty of\nreal-world concepts. An important step in making these pre-trained models\neffective on downstream tasks is to fine-tune them on related datasets. While\nvarious fine-tuning methods have been devised and have been shown to be highly\neffective, we observe that a fine-tuned model's ability to recognize concepts\non tasks $\\textit{different}$ from the downstream one is reduced significantly\ncompared to its pre-trained counterpart. This is an undesirable effect of\nfine-tuning as a substantial amount of resources was used to learn these\npre-trained concepts in the first place. We call this phenomenon ''concept\nforgetting'' and via experiments show that most end-to-end fine-tuning\napproaches suffer heavily from this side effect. To this end, we propose a\nsimple fix to this problem by designing a new fine-tuning method called\n$\\textit{LDIFS}$ (short for $\\ell_2$ distance in feature space) that, while\nlearning new concepts related to the downstream task, allows a model to\npreserve its pre-trained knowledge as well. Through extensive experiments on 10\nfine-tuning tasks we show that $\\textit{LDIFS}$ significantly reduces concept\nforgetting. Additionally, we show that LDIFS is highly effective in performing\ncontinual fine-tuning on a sequence of tasks as well, in comparison with both\nfine-tuning as well as continual learning baselines.\n","authors":["Jishnu Mukhoti","Yarin Gal","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2308.13320v3.pdf","comment":"Published in TMLR: https://openreview.net/forum?id=kfhoeZCeW7"},{"id":"http://arxiv.org/abs/2309.05196v3","updated":"2024-07-01T16:36:30Z","published":"2023-09-11T02:16:47Z","title":"Does Writing with Language Models Reduce Content Diversity?","summary":" Large language models (LLMs) have led to a surge in collaborative writing\nwith model assistance. As different users incorporate suggestions from the same\nmodel, there is a risk of decreased diversity in the produced content,\npotentially limiting diverse perspectives in public discourse. 
In this work, we\nmeasure the impact of co-writing on diversity via a controlled experiment,\nwhere users write argumentative essays in three setups -- using a base LLM\n(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We\ndevelop a set of diversity metrics and find that writing with InstructGPT (but\nnot the GPT3) results in a statistically significant reduction in diversity.\nSpecifically, it increases the similarity between the writings of different\nauthors and reduces the overall lexical and content diversity. We additionally\nfind that this effect is mainly attributable to InstructGPT contributing less\ndiverse text to co-written essays. In contrast, the user-contributed text\nremains unaffected by model collaboration. This suggests that the recent\nimprovement in generation quality from adapting models to human feedback might\ncome at the cost of more homogeneous and less diverse content.\n","authors":["Vishakh Padmakumar","He He"],"pdf_url":"https://arxiv.org/pdf/2309.05196v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2311.02115v2","updated":"2024-07-01T16:30:53Z","published":"2023-11-03T01:37:28Z","title":"Towards objective and systematic evaluation of bias in artificial\n intelligence for medical imaging","summary":" Artificial intelligence (AI) models trained using medical images for clinical\ntasks often exhibit bias in the form of disparities in performance between\nsubgroups. Since not all sources of biases in real-world medical imaging data\nare easily identifiable, it is challenging to comprehensively assess how those\nbiases are encoded in models, and how capable bias mitigation methods are at\nameliorating performance disparities. In this article, we introduce a novel\nanalysis framework for systematically and objectively investigating the impact\nof biases in medical images on AI models. We developed and tested this\nframework for conducting controlled in silico trials to assess bias in medical\nimaging AI using a tool for generating synthetic magnetic resonance images with\nknown disease effects and sources of bias. The feasibility is showcased by\nusing three counterfactual bias scenarios to measure the impact of simulated\nbias effects on a convolutional neural network (CNN) classifier and the\nefficacy of three bias mitigation strategies. The analysis revealed that the\nsimulated biases resulted in expected subgroup performance disparities when the\nCNN was trained on the synthetic datasets. Moreover, reweighing was identified\nas the most successful bias mitigation strategy for this setup, and we\ndemonstrated how explainable AI methods can aid in investigating the\nmanifestation of bias in the model using this framework. Developing fair AI\nmodels is a considerable challenge given that many and often unknown sources of\nbiases can be present in medical imaging datasets. In this work, we present a\nnovel methodology to objectively study the impact of biases and mitigation\nstrategies on deep learning pipelines, which can support the development of\nclinical AI that is robust and responsible.\n","authors":["Emma A. M. Stanley","Raissa Souza","Anthony Winder","Vedant Gulve","Kimberly Amador","Matthias Wilms","Nils D. 
Forkert"],"pdf_url":"https://arxiv.org/pdf/2311.02115v2.pdf","comment":"Published in the Journal of the American Medical Informatics\n Association"},{"id":"http://arxiv.org/abs/2406.14220v2","updated":"2024-07-01T16:30:23Z","published":"2024-06-20T11:40:12Z","title":"Evaluation of Deep Learning Semantic Segmentation for Land Cover Mapping\n on Multispectral, Hyperspectral and High Spatial Aerial Imagery","summary":" In the rise of climate change, land cover mapping has become such an urgent\nneed in environmental monitoring. The accuracy of land cover classification has\ngotten increasingly based on the improvement of remote sensing data. Land cover\nclassification using satellite imageries has been explored and become more\nprevalent in recent years, but the methodologies remain some drawbacks of\nsubjective and time-consuming. Some deep learning techniques have been utilized\nto overcome these limitations. However, most studies implemented just one image\ntype to evaluate algorithms for land cover mapping. Therefore, our study\nconducted deep learning semantic segmentation in multispectral, hyperspectral,\nand high spatial aerial image datasets for landcover mapping. This research\nimplemented a semantic segmentation method such as Unet, Linknet, FPN, and\nPSPnet for categorizing vegetation, water, and others (i.e., soil and\nimpervious surface). The LinkNet model obtained high accuracy in IoU\n(Intersection Over Union) at 0.92 in all datasets, which is comparable with\nother mentioned techniques. In evaluation with different image types, the\nmultispectral images showed higher performance with the IoU, and F1-score are\n0.993 and 0.997, respectively. Our outcome highlighted the efficiency and broad\napplicability of LinkNet and multispectral image on land cover classification.\nThis research contributes to establishing an approach on landcover segmentation\nvia open source for long-term future application.\n","authors":["Ilham Adi Panuntun","Ying-Nong Chen","Ilham Jamaluddin","Thi Linh Chi Tran"],"pdf_url":"https://arxiv.org/pdf/2406.14220v2.pdf","comment":"conference, This preprint is based on the following published\n conference article: Panuntun, I. A., Chen, Y.-N., Jamaluddin, I., & Tran, T.\n L. C., 2023. Evaluation of Deep Learning Semantic Segmentation for Land Cover\n Mapping on Multispectral, Hyperspectral and High Spatial Aerial Imagery. 44th\n Asian Conference on Remote Sensing, ACRS 2023. Code 198676"},{"id":"http://arxiv.org/abs/2404.19100v2","updated":"2024-07-01T16:16:34Z","published":"2024-04-29T20:43:42Z","title":"Predicting Fairness of ML Software Configurations","summary":" This paper investigates the relationships between hyperparameters of machine\nlearning and fairness. Data-driven solutions are increasingly used in critical\nsocio-technical applications where ensuring fairness is important. Rather than\nexplicitly encoding decision logic via control and data structures, the ML\ndevelopers provide input data, perform some pre-processing, choose ML\nalgorithms, and tune hyperparameters (HPs) to infer a program that encodes the\ndecision logic. Prior works report that the selection of HPs can significantly\ninfluence fairness. However, tuning HPs to find an ideal trade-off between\naccuracy, precision, and fairness has remained an expensive and tedious task.\nCan we predict fairness of HP configuration for a given dataset? Are the\npredictions robust to distribution shifts?\n We focus on group fairness notions and investigate the HP space of 5 training\nalgorithms. 
We first find that tree regressors and XGBoost significantly\noutperformed deep neural networks and support vector machines in accurately\npredicting the fairness of HPs. When predicting the fairness of ML\nhyperparameters under temporal distribution shift, the tree regressors\noutperform the other algorithms with reasonable accuracy. However, the\nprecision depends on the ML training algorithm, dataset, and protected\nattributes. For example, the tree regressor model was robust for training data\nshift from 2014 to 2018 on logistic regression and discriminant analysis HPs\nwith sex as the protected attribute; but not for race and other training\nalgorithms. Our method provides a sound framework to efficiently perform\nfine-tuning of ML training algorithms and understand the relationships between\nHPs and fairness.\n","authors":["Salvador Robles Herrera","Verya Monjezi","Vladik Kreinovich","Ashutosh Trivedi","Saeid Tizpaz-Niari"],"pdf_url":"https://arxiv.org/pdf/2404.19100v2.pdf","comment":"To Appear in the 20th International Conference on Predictive Models\n and Data Analytics in Software Engineering (PROMISE'24)"},{"id":"http://arxiv.org/abs/2404.17701v2","updated":"2024-07-01T16:07:16Z","published":"2024-04-26T20:59:23Z","title":"Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning\n in Particle Detector Readout","summary":" Embedded field programmable gate array (eFPGA) technology allows the\nimplementation of reconfigurable logic within the design of an\napplication-specific integrated circuit (ASIC). This approach offers the low\npower and efficiency of an ASIC along with the ease of FPGA configuration,\nparticularly beneficial for the use case of machine learning in the data\npipeline of next-generation collider experiments. An open-source framework\ncalled \"FABulous\" was used to design eFPGAs using 130 nm and 28 nm CMOS\ntechnology nodes, which were subsequently fabricated and verified through\ntesting. The capability of an eFPGA to act as a front-end readout chip was\nassessed using simulation of high energy particles passing through a silicon\npixel sensor. A machine learning-based classifier, designed for reduction of\nsensor data at the source, was synthesized and configured onto the eFPGA. A\nsuccessful proof-of-concept was demonstrated through reproduction of the\nexpected algorithm result on the eFPGA with perfect accuracy. Further\ndevelopment of the eFPGA technology and its application to collider detector\nreadout is discussed.\n","authors":["Julia Gonski","Aseem Gupta","Haoyi Jia","Hyunjoon Kim","Lorenzo Rota","Larry Ruckman","Angelo Dragone","Ryan Herbst"],"pdf_url":"https://arxiv.org/pdf/2404.17701v2.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2405.03672v3","updated":"2024-07-01T15:57:59Z","published":"2024-05-06T17:48:24Z","title":"Cutting through buggy adversarial example defenses: fixing 1 line of\n code breaks Sabre","summary":" Sabre is a defense to adversarial examples that was accepted at IEEE S&P\n2024. We first reveal significant flaws in the evaluation that point to clear\nsigns of gradient masking. We then show the cause of this gradient masking: a\nbug in the original evaluation code. By fixing a single line of code in the\noriginal repository, we reduce Sabre's robust accuracy to 0%. In response to\nthis, the authors modify the defense and introduce a new defense component not\ndescribed in the original paper. 
But this fix contains a second bug; modifying\none more line of code reduces robust accuracy to below baseline levels. After\nwe released the first version of our paper online, the authors introduced\nanother change to the defense; by commenting out one line of code during the attack\nwe reduce the robust accuracy to 0% again.\n","authors":["Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2405.03672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04742v2","updated":"2024-07-01T15:55:16Z","published":"2023-09-09T10:01:51Z","title":"Affine Invariant Ensemble Transform Methods to Improve Predictive\n Uncertainty in Neural Networks","summary":" We consider the problem of performing Bayesian inference for logistic\nregression using appropriate extensions of the ensemble Kalman filter. Two\ninteracting particle systems are proposed that sample from an approximate\nposterior, and we prove quantitative convergence rates of these interacting\nparticle systems to their mean-field limit as the number of particles tends to\ninfinity. Furthermore, we apply these techniques and examine their\neffectiveness as methods of Bayesian approximation for quantifying predictive\nuncertainty in neural networks.\n","authors":["Diksha Bhandari","Jakiw Pidstrigach","Sebastian Reich"],"pdf_url":"https://arxiv.org/pdf/2309.04742v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09100v2","updated":"2024-07-01T15:29:45Z","published":"2023-03-16T06:09:15Z","title":"Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models","summary":" For downstream applications of vision-language pre-trained models, there has\nbeen significant interest in constructing effective prompts. Existing works on\nprompt engineering, which either require laborious manual designs or optimize\nthe prompt tuning as a point estimation problem, may fail to describe diverse\ncharacteristics of categories and limit their applications. We introduce a\nBayesian probabilistic resolution to prompt tuning, where the label-specific\nstochastic prompts are generated hierarchically by first sampling a latent\nvector from an underlying distribution and then employing a lightweight\ngenerative model. Importantly, we semantically regularize the tuning process by\nminimizing the statistical distance between the visual patches and linguistic\nprompts, which pushes the stochastic label representations to faithfully\ncapture diverse visual concepts, instead of overfitting the training\ncategories. We evaluate the effectiveness of our approach on four tasks:\nfew-shot image recognition, base-to-new generalization, dataset transfer\nlearning, and domain shifts. Extensive results over 15 datasets show promising\ntransferability and generalization performance of our proposed model, both\nquantitatively and qualitatively.\n","authors":["Xinyang Liu","Dongsheng Wang","Bowei Fang","Miaoge Li","Zhibin Duan","Yishi Xu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.09100v2.pdf","comment":"Accepted by UAI 2024"},{"id":"http://arxiv.org/abs/2403.08477v3","updated":"2024-07-01T15:29:16Z","published":"2024-03-13T12:46:03Z","title":"Unleashing the Power of Meta-tuning for Few-shot Generalization Through\n Sparse Interpolated Experts","summary":" Recent successes suggest that parameter-efficient fine-tuning of foundation\nmodels has emerged as the state-of-the-art method for transfer learning in vision,\nreplacing the rich literature of alternatives such as meta-learning. 
In trying\nto harness the best of both worlds, meta-tuning introduces a subsequent\noptimization stage of foundation models but has so far only shown limited\nsuccess and crucially tends to underperform on out-of-distribution (OOD) tasks.\nIn this paper, we introduce Sparse MetA-Tuning (SMAT), a method inspired by\nsparse mixture-of-experts approaches and trained to isolate subsets of\npre-trained parameters automatically for meta-tuning on each task. SMAT\nsuccessfully overcomes OOD sensitivity and delivers on the promise of enhancing\nthe transfer abilities of vision foundation models beyond parameter-efficient\nfine-tuning. We establish new state-of-the-art results on a challenging\ncombination of Meta-Dataset augmented with additional OOD tasks in both\nzero-shot and gradient-based adaptation settings. In addition, we provide a\nthorough analysis of the superiority of learned over hand-designed sparsity\npatterns for sparse expert methods and the pivotal importance of the sparsity\nlevel in balancing between in-distribution and out-of-distribution\ngeneralization. Our code is publicly available.\n","authors":["Shengzhuang Chen","Jihoon Tack","Yunqiao Yang","Yee Whye Teh","Jonathan Richard Schwarz","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2403.08477v3.pdf","comment":"The Forty-first International Conference on Machine Learning, 2024"},{"id":"http://arxiv.org/abs/2406.16740v2","updated":"2024-07-01T15:27:50Z","published":"2024-06-24T15:45:37Z","title":"Learning the boundary-to-domain mapping using Lifting Product Fourier\n Neural Operators for partial differential equations","summary":" Neural operators such as the Fourier Neural Operator (FNO) have been shown to\nprovide resolution-independent deep learning models that can learn mappings\nbetween function spaces. For example, an initial condition can be mapped to the\nsolution of a partial differential equation (PDE) at a future time-step using a\nneural operator. Despite the popularity of neural operators, their use to\npredict solution functions over a domain given only data over the boundary\n(such as a spatially varying Dirichlet boundary condition) remains unexplored.\nIn this paper, we refer to such problems as boundary-to-domain problems; they\nhave a wide range of applications in areas such as fluid mechanics, solid\nmechanics, heat transfer etc. We present a novel FNO-based architecture, named\nLifting Product FNO (or LP-FNO) which can map arbitrary boundary functions\ndefined on the lower-dimensional boundary to a solution in the entire domain.\nSpecifically, two FNOs defined on the lower-dimensional boundary are lifted\ninto the higher dimensional domain using our proposed lifting product layer. We\ndemonstrate the efficacy and resolution independence of the proposed LP-FNO for\nthe 2D Poisson equation.\n","authors":["Aditya Kashi","Arka Daw","Muralikrishnan Gopalakrishnan Meena","Hao Lu"],"pdf_url":"https://arxiv.org/pdf/2406.16740v2.pdf","comment":"Accepted by ICML 2024 AI for Science Workshop"},{"id":"http://arxiv.org/abs/2209.13694v3","updated":"2024-07-01T15:26:27Z","published":"2022-09-27T21:13:32Z","title":"Safe Linear Bandits over Unknown Polytopes","summary":" The safe linear bandit problem (SLB) is an online approach to linear\nprogramming with unknown objective and unknown roundwise constraints, under\nstochastic bandit feedback of rewards and safety risks of actions. 
We study the\ntradeoffs between efficacy and smooth safety costs of SLBs over polytopes, and\nthe role of aggressive doubly-optimistic play in avoiding the strong\nassumptions made by extant pessimistic-optimistic approaches.\n We first elucidate an inherent hardness in SLBs due to the lack of knowledge of\nconstraints: there exist `easy' instances, for which suboptimal extreme points\nhave large `gaps', but on which SLB methods must still incur $\\Omega(\\sqrt{T})$\nregret or safety violations, due to an inability to resolve unknown optima to\narbitrary precision. We then analyse a natural doubly-optimistic strategy for\nthe safe linear bandit problem, DOSS, which uses optimistic estimates of both\nreward and safety risks to select actions, and show that despite the lack of\nknowledge of constraints or feasible points, DOSS simultaneously obtains tight\ninstance-dependent $O(\\log^2 T)$ bounds on efficacy regret, and $\\tilde\nO(\\sqrt{T})$ bounds on safety violations. Further, when safety is demanded to a\nfinite precision, violations improve to $O(\\log^2 T).$ These results rely on a\nnovel dual analysis of linear bandits: we argue that DOSS proceeds by\nactivating noisy versions of at least $d$ constraints in each round, which\nallows us to separately analyse rounds where a `poor' set of constraints is\nactivated, and rounds where `good' sets of constraints are activated. The costs\nin the former are controlled to $O(\\log^2 T)$ by developing new dual notions of\ngaps, based on global sensitivity analyses of linear programs, that quantify\nthe suboptimality of each such set of constraints. The latter costs are\ncontrolled to $O(1)$ by explicitly analysing the solutions of optimistic play.\n","authors":["Aditya Gangrade","Tianrui Chen","Venkatesh Saligrama"],"pdf_url":"https://arxiv.org/pdf/2209.13694v3.pdf","comment":"v3: Presented at COLT 2024"},{"id":"http://arxiv.org/abs/2401.12070v2","updated":"2024-07-01T15:17:10Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. 
Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v2.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2301.13088v3","updated":"2024-07-01T14:48:19Z","published":"2023-01-30T17:27:12Z","title":"Stationary Kernels and Gaussian Processes on Lie Groups and their\n Homogeneous Spaces II: non-compact symmetric spaces","summary":" Gaussian processes are arguably the most important class of spatiotemporal\nmodels within machine learning. They encode prior information about the modeled\nfunction and can be used for exact or approximate Bayesian learning. In many\napplications, particularly in physical sciences and engineering, but also in\nareas such as geostatistics and neuroscience, invariance to symmetries is one\nof the most fundamental forms of prior information one can consider. The\ninvariance of a Gaussian process' covariance to such symmetries gives rise to\nthe most natural generalization of the concept of stationarity to such spaces.\nIn this work, we develop constructive and practical techniques for building\nstationary Gaussian processes on a very large class of non-Euclidean spaces\narising in the context of symmetries. Our techniques make it possible to (i)\ncalculate covariance kernels and (ii) sample from prior and posterior Gaussian\nprocesses defined on such spaces, both in a practical manner. This work is\nsplit into two parts, each involving different technical considerations: part I\nstudies compact spaces, while part II studies non-compact spaces possessing\ncertain structure. Our contributions make the non-Euclidean Gaussian process\nmodels we study compatible with well-understood computational techniques\navailable in standard Gaussian process software packages, thereby making them\naccessible to practitioners.\n","authors":["Iskander Azangulov","Andrei Smolensky","Alexander Terenin","Viacheslav Borovitskiy"],"pdf_url":"https://arxiv.org/pdf/2301.13088v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07344v2","updated":"2024-07-01T14:47:32Z","published":"2023-07-14T13:47:05Z","title":"Inverse Evolution Layers: Physics-informed Regularizers for Deep Neural\n Networks","summary":" Traditional image processing methods employing partial differential equations\n(PDEs) offer a multitude of meaningful regularizers, along with valuable\ntheoretical foundations for a wide range of image-related tasks. This makes\ntheir integration into neural networks a promising avenue. In this paper, we\nintroduce a novel regularization approach inspired by the reverse process of\nPDE-based evolution models. Specifically, we propose inverse evolution layers\n(IELs), which serve as bad property amplifiers to penalize neural networks of\nwhich outputs have undesired characteristics. Using IELs, one can achieve\nspecific regularization objectives and endow neural networks' outputs with\ncorresponding properties of the PDE models. Our experiments, focusing on\nsemantic segmentation tasks using heat-diffusion IELs, demonstrate their\neffectiveness in mitigating noisy label effects. 
Additionally, we develop\ncurve-motion IELs to enforce convex shape regularization in neural\nnetwork-based segmentation models for preventing the generation of concave\noutputs. Theoretical analysis confirms the efficacy of IELs as an effective\nregularization mechanism, particularly in handling training with label issues.\n","authors":["Chaoyu Liu","Zhonghua Qiao","Chao Li","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2307.07344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19440v3","updated":"2024-07-01T14:43:51Z","published":"2024-05-29T18:36:59Z","title":"On the Convergence of Multi-objective Optimization under Generalized\n Smoothness","summary":" Multi-objective optimization (MOO) is receiving more attention in various\nfields such as multi-task learning. Recent works provide some effective\nalgorithms with theoretical analysis but they are limited by the standard\n$L$-smooth or bounded-gradient assumptions, which are typically unsatisfactory\nfor neural networks, such as recurrent neural networks (RNNs) and transformers.\nIn this paper, we study a more general and realistic class of $\\ell$-smooth\nloss functions, where $\\ell$ is a general non-decreasing function of gradient\nnorm. We develop two novel single-loop algorithms for $\\ell$-smooth MOO\nproblems, Generalized Smooth Multi-objective Gradient descent (GSMGrad) and its\nstochastic variant, Stochastic Generalized Smooth Multi-objective Gradient\ndescent (SGSMGrad), which approximate the conflict-avoidant (CA) direction that\nmaximizes the minimum improvement among objectives. We provide a comprehensive\nconvergence analysis of both algorithms and show that they converge to an\n$\\epsilon$-accurate Pareto stationary point with a guaranteed $\\epsilon$-level\naverage CA distance (i.e., the gap between the updating direction and the CA\ndirection) over all iterations, where totally $\\mathcal{O}(\\epsilon^{-2})$ and\n$\\mathcal{O}(\\epsilon^{-4})$ samples are needed for deterministic and\nstochastic settings, respectively. Our algorithms can also guarantee a tighter\n$\\epsilon$-level CA distance in each iteration using more samples. Moreover, we\npropose a practical variant of GSMGrad named GSMGrad-FA using only\nconstant-level time and space, while achieving the same performance guarantee\nas GSMGrad. Our experiments validate our theory and demonstrate the\neffectiveness of the proposed methods.\n","authors":["Qi Zhang","Peiyao Xiao","Kaiyi Ji","Shaofeng Zou"],"pdf_url":"https://arxiv.org/pdf/2405.19440v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15146v2","updated":"2024-07-01T14:43:11Z","published":"2024-04-23T15:49:37Z","title":"Rethinking LLM Memorization through the Lens of Adversarial Compression","summary":" Large language models (LLMs) trained on web-scale datasets raise substantial\nconcerns regarding permissible data usage. One major question is whether these\nmodels \"memorize\" all their training data or they integrate many data sources\nin some way more akin to how a human would learn and synthesize information.\nThe answer hinges, to a large degree, on how we define memorization. In this\nwork, we propose the Adversarial Compression Ratio (ACR) as a metric for\nassessing memorization in LLMs. A given string from the training data is\nconsidered memorized if it can be elicited by a prompt (much) shorter than the\nstring itself -- in other words, if these strings can be \"compressed\" with the\nmodel by computing adversarial prompts of fewer tokens. 
The ACR overcomes the\nlimitations of existing notions of memorization by (i) offering an adversarial\nview of measuring memorization, especially for monitoring unlearning and\ncompliance; and (ii) allowing for the flexibility to measure memorization for\narbitrary strings at a reasonably low compute. Our definition serves as a\npractical tool for determining when model owners may be violating terms around\ndata usage, providing a potential legal tool and a critical lens through which\nto address such scenarios.\n","authors":["Avi Schwarzschild","Zhili Feng","Pratyush Maini","Zachary C. Lipton","J. Zico Kolter"],"pdf_url":"https://arxiv.org/pdf/2404.15146v2.pdf","comment":"https://locuslab.github.io/acr-memorization"},{"id":"http://arxiv.org/abs/2310.11439v3","updated":"2024-07-01T14:39:54Z","published":"2023-10-17T17:50:22Z","title":"From Alexnet to Transformers: Measuring the Non-linearity of Deep Neural\n Networks with Affine Optimal Transport","summary":" In the last decade, we have witnessed the introduction of several novel deep\nneural network (DNN) architectures exhibiting ever-increasing performance\nacross diverse tasks. Explaining the upward trend of their performance,\nhowever, remains difficult as different DNN architectures of comparable depth\nand width -- common factors associated with their expressive power -- may\nexhibit a drastically different performance even when trained on the same\ndataset. In this paper, we introduce the concept of the non-linearity signature\nof DNN, the first theoretically sound solution for approximately measuring the\nnon-linearity of deep neural networks. Built upon a score derived from\nclosed-form optimal transport mappings, this signature provides a better\nunderstanding of the inner workings of a wide range of DNN architectures and\nlearning paradigms, with a particular emphasis on the computer vision task. We\nprovide extensive experimental results that highlight the practical usefulness\nof the proposed non-linearity signature and its potential for long-reaching\nimplications. The code for our work is available at\nhttps://github.com/qbouniot/AffScoreDeep\n","authors":["Quentin Bouniot","Ievgen Redko","Anton Mallasto","Charlotte Laclau","Karol Arndt","Oliver Struckmeier","Markus Heinonen","Ville Kyrki","Samuel Kaski"],"pdf_url":"https://arxiv.org/pdf/2310.11439v3.pdf","comment":"Code available at https://github.com/qbouniot/AffScoreDeep"},{"id":"http://arxiv.org/abs/2405.11464v2","updated":"2024-07-01T14:27:51Z","published":"2024-05-19T06:43:12Z","title":"Efficient Prompt Tuning by Multi-Space Projection and Prompt Fusion","summary":" Prompt tuning is a promising method to fine-tune a pre-trained language model\nwithout retraining its large-scale parameters. Instead, it attaches a soft\nprompt to the input text, whereby downstream tasks can be well adapted by\nmerely learning the embeddings of prompt tokens. Nevertheless, existing methods\nstill suffer from two challenges: (i) they are hard to balance accuracy and\nefficiency. A longer (shorter) soft prompt generally leads to a better(worse)\naccuracy but at the cost of more (less) training time. (ii)The performance may\nnot be consistent when adapting to different downstream tasks. We attribute it\nto the same embedding space but responsible for different requirements of\ndownstream tasks. To address these issues, we propose an Efficient Prompt\nTuning method (EPT) by multi-space projection and prompt fusion. 
Specifically,\nit decomposes a given soft prompt into a shorter prompt and two low-rank\nmatrices, significantly reducing the training time. Accuracy is also enhanced\nby leveraging low-rank matrices and the short prompt as additional knowledge\nsources to enrich the semantics of the original short prompt. In addition, we\nproject the soft prompt into multiple subspaces to improve the performance\nconsistency, and then adaptively learn the combination weights of different\nspaces through a gating network. Experiments on 13 natural language processing\ndownstream tasks show that our method significantly and consistently\noutperforms 11 comparison methods with the relative percentage of improvements\nup to 12.9%, and training time decreased by 14%.\n","authors":["Pengxiang Lan","Enneng Yang","Yuting Liu","Guibing Guo","Linying Jiang","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.11464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00541v2","updated":"2024-07-01T14:26:49Z","published":"2023-06-01T10:51:12Z","title":"Decomposing Global Feature Effects Based on Feature Interactions","summary":" Global feature effect methods, such as partial dependence plots, provide an\nintelligible visualization of the expected marginal feature effect. However,\nsuch global feature effect methods can be misleading, as they do not represent\nlocal feature effects of single observations well when feature interactions are\npresent. We formally introduce generalized additive decomposition of global\neffects (GADGET), which is a new framework based on recursive partitioning to\nfind interpretable regions in the feature space such that the\ninteraction-related heterogeneity of local feature effects is minimized. We\nprovide a mathematical foundation of the framework and show that it is\napplicable to the most popular methods to visualize marginal feature effects,\nnamely partial dependence, accumulated local effects, and Shapley additive\nexplanations (SHAP) dependence. Furthermore, we introduce and validate a new\npermutation-based interaction test to detect significant feature interactions\nthat is applicable to any feature effect method that fits into our proposed\nframework. We empirically evaluate the theoretical characteristics of the\nproposed methods based on various feature effect methods in different\nexperimental settings. Moreover, we apply our introduced methodology to three\nreal-world examples to showcase their usefulness.\n","authors":["Julia Herbinger","Marvin N. Wright","Thomas Nagler","Bernd Bischl","Giuseppe Casalicchio"],"pdf_url":"https://arxiv.org/pdf/2306.00541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02212v2","updated":"2024-07-01T14:07:58Z","published":"2023-02-04T17:53:55Z","title":"Federated Temporal Difference Learning with Linear Function\n Approximation under Environmental Heterogeneity","summary":" We initiate the study of federated reinforcement learning under environmental\nheterogeneity by considering a policy evaluation problem. Our setup involves\n$N$ agents interacting with environments that share the same state and action\nspace but differ in their reward functions and state transition kernels.\nAssuming agents can communicate via a central server, we ask: Does exchanging\ninformation expedite the process of evaluating a common policy? 
To answer this\nquestion, we provide the first comprehensive finite-time analysis of a\nfederated temporal difference (TD) learning algorithm with linear function\napproximation, while accounting for Markovian sampling, heterogeneity in the\nagents' environments, and multiple local updates to save communication. Our\nanalysis crucially relies on several novel ingredients: (i) deriving\nperturbation bounds on TD fixed points as a function of the heterogeneity in\nthe agents' underlying Markov decision processes (MDPs); (ii) introducing a\nvirtual MDP to closely approximate the dynamics of the federated TD algorithm;\nand (iii) using the virtual MDP to make explicit connections to federated\noptimization. Putting these pieces together, we rigorously prove that in a\nlow-heterogeneity regime, exchanging model estimates leads to linear\nconvergence speedups in the number of agents.\n","authors":["Han Wang","Aritra Mitra","Hamed Hassani","George J. Pappas","James Anderson"],"pdf_url":"https://arxiv.org/pdf/2302.02212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19963v2","updated":"2024-07-01T14:05:22Z","published":"2024-06-28T14:51:01Z","title":"Text2Robot: Evolutionary Robot Design from Text Descriptions","summary":" Robot design has traditionally been costly and labor-intensive. Despite\nadvancements in automated processes, it remains challenging to navigate a vast\ndesign space while producing physically manufacturable robots. We introduce\nText2Robot, a framework that converts user text specifications and performance\npreferences into physical quadrupedal robots. Within minutes, Text2Robot can\nuse text-to-3D models to provide strong initializations of diverse\nmorphologies. Within a day, our geometric processing algorithms and\nbody-control co-optimization produce a walking robot by explicitly considering\nreal-world electronics and manufacturability. Text2Robot enables rapid\nprototyping and opens new opportunities for robot design with generative\nmodels.\n","authors":["Ryan P. Ringel","Zachary S. Charlick","Jiaxun Liu","Boxi Xia","Boyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2406.19963v2.pdf","comment":"Our project website is at: http://generalroboticslab.com/Text2Robot"},{"id":"http://arxiv.org/abs/2310.12806v2","updated":"2024-07-01T14:04:12Z","published":"2023-10-19T15:01:57Z","title":"DCSI -- An improved measure of cluster separability based on separation\n and connectedness","summary":" Whether class labels in a given data set correspond to meaningful clusters is\ncrucial for the evaluation of clustering algorithms using real-world data sets.\nThis property can be quantified by separability measures. The central aspects\nof separability for density-based clustering are between-class separation and\nwithin-class connectedness, and neither classification-based complexity\nmeasures nor cluster validity indices (CVIs) adequately incorporate them. A\nnewly developed measure (density cluster separability index, DCSI) aims to\nquantify these two characteristics and can also be used as a CVI. Extensive\nexperiments on synthetic data indicate that DCSI correlates strongly with the\nperformance of DBSCAN measured via the adjusted Rand index (ARI) but lacks\nrobustness when it comes to multi-class data sets with overlapping classes that\nare ill-suited for density-based hard clustering. 
Detailed evaluation on\nfrequently used real-world data sets shows that DCSI can correctly identify\ntouching or overlapping classes that do not correspond to meaningful\ndensity-based clusters.\n","authors":["Jana Gauss","Fabian Scheipl","Moritz Herrmann"],"pdf_url":"https://arxiv.org/pdf/2310.12806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05097v3","updated":"2024-07-01T13:46:06Z","published":"2024-05-08T14:49:27Z","title":"Biology-inspired joint distribution neurons based on Hierarchical\n Correlation Reconstruction allowing for multidirectional neural networks","summary":" Biological neural networks seem qualitatively superior (e.g. in learning,\nflexibility, robustness) from current artificial like Multi-Layer Perceptron\n(MLP) or Kolmogorov-Arnold Network (KAN). Simultaneously, in contrast to them:\nhave fundamentally multidirectional signal propagation~\\cite{axon}, also of\nprobability distributions e.g. for uncertainty estimation, and are believed not\nbeing able to use standard backpropagation training~\\cite{backprop}. There are\nproposed novel artificial neurons based on HCR (Hierarchical Correlation\nReconstruction) removing the above low level differences: with neurons\ncontaining local joint distribution model (of its connections), representing\njoint density on normalized variables as just linear combination among\n$(f_\\mathbf{j})$ orthonormal polynomials: $\\rho(\\mathbf{x})=\\sum_{\\mathbf{j}\\in\nB} a_\\mathbf{j} f_\\mathbf{j}(\\mathbf{x})$ for $\\mathbf{x} \\in [0,1]^d$ and $B$\nsome chosen basis, with basis growth approaching complete description of joint\ndistribution. By various index summations of such $(a_\\mathbf{j})$ tensor as\nneuron parameters, we get simple formulas for e.g. conditional expected values\nfor propagation in any direction, like $E[x|y,z]$, $E[y|x]$, which degenerate\nto KAN-like parametrization if restricting to pairwise dependencies. Such HCR\nnetwork can also propagate probability distributions (also joint) like\n$\\rho(y,z|x)$. It also allows for additional training approaches, like direct\n$(a_\\mathbf{j})$ estimation, through tensor decomposition, or more biologically\nplausible information bottleneck training: layers directly influencing only\nneighbors, optimizing content to maximize information about the next layer, and\nminimizing about the previous to minimize the noise.\n","authors":["Jarek Duda"],"pdf_url":"https://arxiv.org/pdf/2405.05097v3.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.09976v2","updated":"2024-07-01T13:35:44Z","published":"2024-06-14T12:37:08Z","title":"Robust Model-Based Reinforcement Learning with an Adversarial Auxiliary\n Model","summary":" Reinforcement learning has demonstrated impressive performance in various\nchallenging problems such as robotics, board games, and classical arcade games.\nHowever, its real-world applications can be hindered by the absence of\nrobustness and safety in the learned policies. More specifically, an RL agent\nthat trains in a certain Markov decision process (MDP) often struggles to\nperform well in nearly identical MDPs. To address this issue, we employ the\nframework of Robust MDPs (RMDPs) in a model-based setting and introduce a novel\nlearned transition model. Our method specifically incorporates an auxiliary\npessimistic model, updated adversarially, to estimate the worst-case MDP within\na Kullback-Leibler uncertainty set. 
In comparison to several existing works,\nour work does not impose any additional conditions on the training environment,\nsuch as the need for a parametric simulator. To test the effectiveness of the\nproposed pessimistic model in enhancing policy robustness, we integrate it into\na practical RL algorithm, called Robust Model-Based Policy Optimization\n(RMBPO). Our experimental results indicate a notable improvement in policy\nrobustness on high-dimensional MuJoCo control tasks, with the auxiliary model\nenhancing the performance of the learned policy in distorted MDPs. We further\nexplore the learned deviation between the proposed auxiliary world model and\nthe nominal model, to examine how pessimism is achieved. By learning a\npessimistic world model and demonstrating its role in improving policy\nrobustness, our research contributes towards making (model-based) RL more\nrobust.\n","authors":["Siemen Herremans","Ali Anwar","Siegfried Mercelis"],"pdf_url":"https://arxiv.org/pdf/2406.09976v2.pdf","comment":"Will be presented at the RL Safety Workshop at RLC 2024"},{"id":"http://arxiv.org/abs/2312.08489v3","updated":"2024-07-01T13:24:51Z","published":"2023-12-13T20:08:41Z","title":"Connectivity Oracles for Predictable Vertex Failures","summary":" The problem of designing connectivity oracles supporting vertex failures is\none of the basic data structures problems for undirected graphs. It is already\nwell understood: previous works [Duan--Pettie STOC'10; Long--Saranurak FOCS'22]\nachieve query time linear in the number of failed vertices, and it is\nconditionally optimal as long as we require preprocessing time polynomial in\nthe size of the graph and update time polynomial in the number of failed\nvertices.\n We revisit this problem in the paradigm of algorithms with predictions: we\nask if the query time can be improved if the set of failed vertices can be\npredicted beforehand up to a small number of errors. More specifically, we\ndesign a data structure that, given a graph $G=(V,E)$ and a set of vertices\npredicted to fail $\\widehat{D} \\subseteq V$ of size $d=|\\widehat{D}|$,\npreprocesses it in time $\\tilde{O}(d|E|)$ and then can receive an update given\nas the symmetric difference between the predicted and the actual set of failed\nvertices $\\widehat{D} \\triangle D = (\\widehat{D} \\setminus D) \\cup (D \\setminus\n\\widehat{D})$ of size $\\eta = |\\widehat{D} \\triangle D|$, process it in time\n$\\tilde{O}(\\eta^4)$, and after that answer connectivity queries in $G \\setminus\nD$ in time $O(\\eta)$. Viewed from another perspective, our data structure\nprovides an improvement over the state of the art for the \\emph{fully dynamic\nsubgraph connectivity problem} in the \\emph{sensitivity setting}\n[Henzinger--Neumann ESA'16].\n We argue that the preprocessing time and query time of our data structure are\nconditionally optimal under standard fine-grained complexity assumptions.\n","authors":["Bingbing Hu","Evangelos Kosinas","Adam Polak"],"pdf_url":"https://arxiv.org/pdf/2312.08489v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07866v5","updated":"2024-07-01T13:17:32Z","published":"2022-11-15T03:17:11Z","title":"Efficient Estimation for Longitudinal Networks via Adaptive Merging","summary":" Longitudinal network consists of a sequence of temporal edges among multiple\nnodes, where the temporal edges are observed in real time. It has become\nubiquitous with the rise of online social platform and e-commerce, but largely\nunder-investigated in literature. 
In this paper, we propose an efficient\nestimation framework for longitudinal network, leveraging strengths of adaptive\nnetwork merging, tensor decomposition and point process. It merges neighboring\nsparse networks so as to enlarge the number of observed edges and reduce\nestimation variance, whereas the estimation bias introduced by network merging\nis controlled by exploiting local temporal structures for adaptive network\nneighborhood. A projected gradient descent algorithm is proposed to facilitate\nestimation, where the upper bound of the estimation error in each iteration is\nestablished. A thorough analysis is conducted to quantify the asymptotic\nbehavior of the proposed method, which shows that it can significantly reduce\nthe estimation error and also provides guideline for network merging under\nvarious scenarios. We further demonstrate the advantage of the proposed method\nthrough extensive numerical experiments on synthetic datasets and a militarized\ninterstate dispute dataset.\n","authors":["Haoran Zhang","Junhui Wang"],"pdf_url":"https://arxiv.org/pdf/2211.07866v5.pdf","comment":"30 pages and 4 figures; appendix including technical proof will be\n uploaded later"},{"id":"http://arxiv.org/abs/2404.06371v2","updated":"2024-07-01T13:16:49Z","published":"2024-04-09T15:07:25Z","title":"Model Generation with LLMs: From Requirements to UML Sequence Diagrams","summary":" Complementing natural language (NL) requirements with graphical models can\nimprove stakeholders' communication and provide directions for system design.\nHowever, creating models from requirements involves manual effort. The advent\nof generative large language models (LLMs), ChatGPT being a notable example,\noffers promising avenues for automated assistance in model generation. This\npaper investigates the capability of ChatGPT to generate a specific type of\nmodel, i.e., UML sequence diagrams, from NL requirements. We conduct a\nqualitative study in which we examine the sequence diagrams generated by\nChatGPT for 28 requirements documents of various types and from different\ndomains. Observations from the analysis of the generated diagrams have\nsystematically been captured through evaluation logs, and categorized through\nthematic analysis. Our results indicate that, although the models generally\nconform to the standard and exhibit a reasonable level of understandability,\ntheir completeness and correctness with respect to the specified requirements\noften present challenges. This issue is particularly pronounced in the presence\nof requirements smells, such as ambiguity and inconsistency. The insights\nderived from this study can influence the practical utilization of LLMs in the\nRE process, and open the door to novel RE-specific prompting strategies\ntargeting effective model generation.\n","authors":["Alessio Ferrari","Sallam Abualhaija","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2404.06371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10208v2","updated":"2024-07-01T12:48:51Z","published":"2024-02-15T18:59:02Z","title":"Recovering the Pre-Fine-Tuning Weights of Generative Models","summary":" The dominant paradigm in generative modeling consists of two steps: i)\npre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained\nmodel with human values via fine-tuning. This practice is considered safe, as\nno current method can recover the unsafe, pre-fine-tuning model weights. In\nthis paper, we demonstrate that this assumption is often false. 
Concretely, we\npresent Spectral DeTuning, a method that can recover the weights of the\npre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In\ncontrast to previous attacks that attempt to recover pre-fine-tuning\ncapabilities, our method aims to recover the exact pre-fine-tuning weights. Our\napproach exploits this new vulnerability against large-scale models such as a\npersonalized Stable Diffusion and an aligned Mistral.\n","authors":["Eliahu Horwitz","Jonathan Kahana","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2402.10208v2.pdf","comment":"ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/"},{"id":"http://arxiv.org/abs/2307.04033v3","updated":"2024-07-01T12:46:35Z","published":"2023-07-08T18:58:08Z","title":"Probabilistic Test-Time Generalization by Variational Neighbor-Labeling","summary":" This paper strives for domain generalization, where models are trained\nexclusively on source domains before being deployed on unseen target domains.\nWe follow the strict separation of source training and target testing, but\nexploit the value of the unlabeled target data itself during inference. We make\nthree contributions. First, we propose probabilistic pseudo-labeling of target\nsamples to generalize the source-trained model to the target domain at test\ntime. We formulate the generalization at test time as a variational inference\nproblem, by modeling pseudo labels as distributions, to consider the\nuncertainty during generalization and alleviate the misleading signal of\ninaccurate pseudo labels. Second, we learn variational neighbor labels that\nincorporate the information of neighboring target samples to generate more\nrobust pseudo labels. Third, to learn the ability to incorporate more\nrepresentative target information and generate more precise and robust\nvariational neighbor labels, we introduce a meta-generalization stage during\ntraining to simulate the generalization procedure. Experiments on seven\nwidely-used datasets demonstrate the benefits, abilities, and effectiveness of\nour proposal.\n","authors":["Sameer Ambekar","Zehao Xiao","Jiayi Shen","Xiantong Zhen","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2307.04033v3.pdf","comment":"Accepted by CoLLAs 2024"},{"id":"http://arxiv.org/abs/2406.13663v2","updated":"2024-07-01T12:39:26Z","published":"2024-06-19T16:10:26Z","title":"Model Internals-based Answer Attribution for Trustworthy\n Retrieval-Augmented Generation","summary":" Ensuring the verifiability of model answers is a fundamental challenge for\nretrieval-augmented generation (RAG) in the question answering (QA) domain.\nRecently, self-citation prompting was proposed to make large language models\n(LLMs) generate citations to supporting documents along with their answers.\nHowever, self-citing LLMs often struggle to match the required format, refer to\nnon-existent sources, and fail to faithfully reflect LLMs' context usage\nthroughout the generation. In this work, we present MIRAGE --Model\nInternals-based RAG Explanations -- a plug-and-play approach using model\ninternals for faithful answer attribution in RAG applications. MIRAGE detects\ncontext-sensitive answer tokens and pairs them with retrieved documents\ncontributing to their prediction via saliency methods. We evaluate our proposed\napproach on a multilingual extractive QA dataset, finding high agreement with\nhuman answer attribution. 
On open-ended QA, MIRAGE achieves citation quality\nand efficiency comparable to self-citation while also allowing for a\nfiner-grained control of attribution parameters. Our qualitative evaluation\nhighlights the faithfulness of MIRAGE's attributions and underscores the\npromising application of model internals for RAG answer attribution.\n","authors":["Jirui Qi","Gabriele Sarti","Raquel Fernández","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2406.13663v2.pdf","comment":"Under review. Code and data released at\n https://github.com/Betswish/MIRAGE"},{"id":"http://arxiv.org/abs/2310.14992v3","updated":"2024-07-01T12:36:03Z","published":"2023-10-23T14:45:51Z","title":"Bayesian Regression Markets","summary":" Although machine learning tasks are highly sensitive to the quality of input\ndata, relevant datasets can often be challenging for firms to acquire,\nespecially when held privately by a variety of owners. For instance, if these\nowners are competitors in a downstream market, they may be reluctant to share\ninformation. Focusing on supervised learning for regression tasks, we develop a\nregression market to provide a monetary incentive for data sharing. Our\nmechanism adopts a Bayesian framework, allowing us to consider a more general\nclass of regression tasks. We present a thorough exploration of the market\nproperties, and show that similar proposals in literature expose the market\nagents to sizeable financial risks, which can be mitigated in our setup.\n","authors":["Thomas Falconer","Jalal Kazempour","Pierre Pinson"],"pdf_url":"https://arxiv.org/pdf/2310.14992v3.pdf","comment":"35 pages, 11 figures, 3 tables. Published in Journal of Machine\n Learning Research (2024)"},{"id":"http://arxiv.org/abs/2312.13327v6","updated":"2024-07-01T12:29:58Z","published":"2023-12-20T16:58:55Z","title":"In-Context Reinforcement Learning for Variable Action Spaces","summary":" Recently, it has been shown that transformers pre-trained on diverse datasets\nwith multi-episode contexts can generalize to new reinforcement learning tasks\nin-context. A key limitation of previously proposed models is their reliance on\na predefined action space size and structure. The introduction of a new action\nspace often requires data re-collection and model re-training, which can be\ncostly for some applications. In our work, we show that it is possible to\nmitigate this issue by proposing the Headless-AD model that, despite being\ntrained only once, is capable of generalizing to discrete action spaces of\nvariable size, semantic content and order. By experimenting with Bernoulli and\ncontextual bandits, as well as a gridworld environment, we show that\nHeadless-AD exhibits significant capability to generalize to action spaces it\nhas never encountered, even outperforming specialized models trained for a\nspecific set of actions on several environment configurations. Implementation\nis available at: https://github.com/corl-team/headless-ad.\n","authors":["Viacheslav Sinii","Alexander Nikulin","Vladislav Kurenkov","Ilya Zisman","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.13327v6.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2401.17780v3","updated":"2024-07-01T12:08:25Z","published":"2024-01-31T12:23:24Z","title":"A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with\n Uniform PAC Guarantees","summary":" We study a primal-dual (PD) reinforcement learning (RL) algorithm for online\nconstrained Markov decision processes (CMDPs). 
Despite its widespread practical\nuse, the existing theoretical literature on PD-RL algorithms for this problem\nonly provides sublinear regret guarantees and fails to ensure convergence to\noptimal policies. In this paper, we introduce a novel policy gradient PD\nalgorithm with uniform probably approximate correctness (Uniform-PAC)\nguarantees, simultaneously ensuring convergence to optimal policies, sublinear\nregret, and polynomial sample complexity for any target accuracy. Notably, this\nrepresents the first Uniform-PAC algorithm for the online CMDP problem. In\naddition to the theoretical guarantees, we empirically demonstrate in a simple\nCMDP that our algorithm converges to optimal policies, while baseline\nalgorithms exhibit oscillatory performance and constraint violation.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Masahiro Kato","Yuki Ichihara","Soichiro Nishimori","Akiyoshi Sannai","Sho Sonoda","Wataru Kumagai","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2401.17780v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18624v2","updated":"2024-07-01T12:07:16Z","published":"2024-06-26T12:50:55Z","title":"Robust Low-Cost Drone Detection and Classification in Low SNR\n Environments","summary":" The proliferation of drones, or unmanned aerial vehicles (UAVs), has raised\nsignificant safety concerns due to their potential misuse in activities such as\nespionage, smuggling, and infrastructure disruption. This paper addresses the\ncritical need for effective drone detection and classification systems that\noperate independently of UAV cooperation. We evaluate various convolutional\nneural networks (CNNs) for their ability to detect and classify drones using\nspectrogram data derived from consecutive Fourier transforms of signal\ncomponents. The focus is on model robustness in low signal-to-noise ratio (SNR)\nenvironments, which is critical for real-world applications. A comprehensive\ndataset is provided to support future model development. In addition, we\ndemonstrate a low-cost drone detection system using a standard computer,\nsoftware-defined radio (SDR) and antenna, validated through real-world field\ntesting. On our development dataset, all models consistently achieved an\naverage balanced classification accuracy of >= 85% at SNR > -12dB. In the field\ntest, these models achieved an average balance accuracy of > 80%, depending on\ntransmitter distance and antenna direction. Our contributions include: a\npublicly available dataset for model development, a comparative analysis of CNN\nfor drone detection under low SNR conditions, and the deployment and field\nevaluation of a practical, low-cost detection system.\n","authors":["Stefan Glüge","Matthias Nyfeler","Ahmad Aghaebrahimian","Nicola Ramagnano","Christof Schüpbach"],"pdf_url":"https://arxiv.org/pdf/2406.18624v2.pdf","comment":"10 pages, submitted to IEEE Journal of Radio Frequency Identification"},{"id":"http://arxiv.org/abs/2303.13113v3","updated":"2024-07-01T11:57:06Z","published":"2023-03-23T09:00:38Z","title":"AdaCL:Adaptive Continual Learning","summary":" Class-Incremental Learning aims to update a deep classifier to learn new\ncategories while maintaining or improving its accuracy on previously observed\nclasses. Common methods to prevent forgetting previously learned classes\ninclude regularizing the neural network updates and storing exemplars in\nmemory, which come with hyperparameters such as the learning rate,\nregularization strength, or the number of exemplars. 
However, these\nhyperparameters are usually only tuned at the start and then kept fixed\nthroughout the learning sessions, ignoring the fact that newly encountered\ntasks may have varying levels of novelty or difficulty. This study investigates\nthe necessity of hyperparameter `adaptivity' in Class-Incremental Learning: the\nability to dynamically adjust hyperparameters such as the learning rate,\nregularization strength, and memory size according to the properties of the new\ntask at hand. We propose AdaCL, a Bayesian Optimization-based approach to\nautomatically and efficiently determine the optimal values for those parameters\nwith each learning task. We show that adapting hyperpararmeters on each new\ntask leads to improvement in accuracy, forgetting and memory. Code is available\nat https://github.com/ElifCerenGokYildirim/AdaCL.\n","authors":["Elif Ceren Gok Yildirim","Murat Onur Yildirim","Mert Kilickaya","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2303.13113v3.pdf","comment":"Published in 1st ContinualAI Unconference"},{"id":"http://arxiv.org/abs/2406.04043v2","updated":"2024-07-01T11:56:17Z","published":"2024-06-06T13:13:29Z","title":"Energy-based Epistemic Uncertainty for Graph Neural Networks","summary":" In domains with interdependent data, such as graphs, quantifying the\nepistemic uncertainty of a Graph Neural Network (GNN) is challenging as\nuncertainty can arise at different structural scales. Existing techniques\nneglect this issue or only distinguish between structure-aware and\nstructure-agnostic uncertainty without combining them into a single measure. We\npropose GEBM, an energy-based model (EBM) that provides high-quality\nuncertainty estimates by aggregating energy at different structural levels that\nnaturally arise from graph diffusion. In contrast to logit-based EBMs, we\nprovably induce an integrable density in the data space by regularizing the\nenergy function. We introduce an evidential interpretation of our EBM that\nsignificantly improves the predictive robustness of the GNN. Our framework is a\nsimple and effective post hoc method applicable to any pre-trained GNN that is\nsensitive to various distribution shifts. It consistently achieves the best\nseparation of in-distribution and out-of-distribution data on 6 out of 7\nanomaly types while having the best average rank over shifts on \\emph{all}\ndatasets.\n","authors":["Dominik Fuchsgruber","Tom Wollschläger","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2406.04043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04679v3","updated":"2024-07-01T11:44:15Z","published":"2023-07-10T16:29:05Z","title":"Minimax Excess Risk of First-Order Methods for Statistical Learning with\n Data-Dependent Oracles","summary":" In this paper, our aim is to analyse the generalization capabilities of\nfirst-order methods for statistical learning in multiple, different yet\nrelated, scenarios including supervised learning, transfer learning, robust\nlearning and federated learning. To do so, we provide sharp upper and lower\nbounds for the minimax excess risk of strongly convex and smooth statistical\nlearning when the gradient is accessed through partial observations given by a\ndata-dependent oracle. This novel class of oracles can query the gradient with\nany given data distribution, and is thus well suited to scenarios in which the\ntraining data distribution does not match the target (or test) distribution. 
In\nparticular, our upper and lower bounds are proportional to the smallest mean\nsquare error achievable by gradient estimators, thus allowing us to easily\nderive multiple sharp bounds in the aforementioned scenarios using the\nextensive literature on parameter estimation.\n","authors":["Kevin Scaman","Mathieu Even","Batiste Le Bars","Laurent Massoulié"],"pdf_url":"https://arxiv.org/pdf/2307.04679v3.pdf","comment":"22 pages, 0 figures"},{"id":"http://arxiv.org/abs/2311.16442v3","updated":"2024-07-01T11:13:54Z","published":"2023-11-28T02:44:59Z","title":"Fast and Efficient 2-bit LLM Inference on GPU: 2/4/16-bit in a Weight\n Matrix with Asynchronous Dequantization","summary":" Large language models (LLMs) have demonstrated impressive abilities in\nvarious domains while the inference cost is expensive. Many previous studies\nexploit quantization methods to reduce LLM inference cost by reducing latency\nand memory consumption. Applying 2-bit single-precision weight quantization\nbrings >3% accuracy loss, so the state-of-the-art methods use mixed-precision\nmethods for LLMs (e.g. Llama2-7b, etc.) to improve the accuracy. However,\nchallenges still exist: (1) Uneven distribution in weight matrix. (2) Large\nspeed degradation by adding sparse outliers. (3) Time-consuming dequantization\noperations on GPUs. To tackle these challenges and enable fast and efficient\nLLM inference on GPUs, we propose the following techniques in this paper. (1)\nIntra-weight mixed-precision quantization. (2) Exclusive 2-bit sparse outlier\nwith minimum speed degradation. (3) Asynchronous dequantization. We conduct\nextensive experiments on different model families (e.g. Llama3, etc.) and model\nsizes. We achieve 2.91-bit for each weight considering all scales/zeros for\ndifferent models with negligible loss. As a result, with our 2/4/16\nmixed-precision quantization for each weight matrix and asynchronous\ndequantization during inference, our design achieves an end-to-end speedup for\nLlama2-7b is 1.74x over the original model, and we reduce both runtime cost and\ntotal cost by up to 2.53x and 2.29x with less GPU requirements.\n","authors":["Jinhao Li","Jiaming Xu","Shiyao Li","Shan Huang","Jun Liu","Yaoxiu Lian","Guohao Dai"],"pdf_url":"https://arxiv.org/pdf/2311.16442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02180v2","updated":"2024-07-01T11:11:29Z","published":"2024-04-02T09:15:32Z","title":"Remote sensing framework for geological mapping via stacked autoencoders\n and clustering","summary":" Supervised machine learning methods for geological mapping via remote sensing\nface limitations due to the scarcity of accurately labelled training data that\ncan be addressed by unsupervised learning, such as dimensionality reduction and\nclustering. Dimensionality reduction methods have the potential to play a\ncrucial role in improving the accuracy of geological maps. Although\nconventional dimensionality reduction methods may struggle with nonlinear data,\nunsupervised deep learning models such as autoencoders can model non-linear\nrelationships. Stacked autoencoders feature multiple interconnected layers to\ncapture hierarchical data representations useful for remote sensing data. This\nstudy presents an unsupervised machine learning-based framework for processing\nremote sensing data using stacked autoencoders for dimensionality reduction and\nk-means clustering for mapping geological units. 
We use Landsat 8, ASTER, and\nSentinel-2 datasets to evaluate the framework for geological mapping of the\nMutawintji region in Western New South Wales, Australia. We also compare\nstacked autoencoders with principal component analysis and canonical\nautoencoders. Our results reveal that the framework produces accurate and\ninterpretable geological maps, efficiently discriminating rock units. We find\nthat the accuracy of stacked autoencoders ranges from 86.6 % to 90 %, depending\non the remote sensing data type, which is superior to their counterparts. We\nalso find that the generated maps align with prior geological knowledge of the\nstudy area while providing novel insights into geological structures.\n","authors":["Sandeep Nagar","Ehsan Farahbakhsh","Joseph Awange","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2404.02180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06090v2","updated":"2024-07-01T11:08:14Z","published":"2023-09-12T09:37:26Z","title":"A General Verification Framework for Dynamical and Control Models via\n Certificate Synthesis","summary":" An emerging branch of control theory specialises in certificate learning,\nconcerning the specification of a desired (possibly complex) system behaviour\nfor an autonomous or control model, which is then analytically verified by\nmeans of a function-based proof. However, the synthesis of controllers abiding\nby these complex requirements is in general a non-trivial task and may elude\nthe most expert control engineers. This results in a need for automatic\ntechniques that are able to design controllers and to analyse a wide range of\nelaborate specifications. In this paper, we provide a general framework to\nencode system specifications and define corresponding certificates, and we\npresent an automated approach to formally synthesise controllers and\ncertificates. Our approach contributes to the broad field of safe learning for\ncontrol, exploiting the flexibility of neural networks to provide candidate\ncontrol and certificate functions, whilst using SMT-solvers to offer a formal\nguarantee of correctness. We test our framework by developing a prototype\nsoftware tool, and assess its efficacy at verification via control and\ncertificate synthesis over a large and varied suite of benchmarks.\n","authors":["Alec Edwards","Andrea Peruffo","Alessandro Abate"],"pdf_url":"https://arxiv.org/pdf/2309.06090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19768v2","updated":"2024-07-01T11:02:45Z","published":"2024-06-28T09:17:51Z","title":"Contextualized Hybrid Ensemble Q-learning: Learning Fast with Control\n Priors","summary":" Combining Reinforcement Learning (RL) with a prior controller can yield the\nbest out of two worlds: RL can solve complex nonlinear problems, while the\ncontrol prior ensures safer exploration and speeds up training. Prior work\nlargely blends both components with a fixed weight, neglecting that the RL\nagent's performance varies with the training progress and across regions in the\nstate space. Therefore, we advocate for an adaptive strategy that dynamically\nadjusts the weighting based on the RL agent's current capabilities. We propose\na new adaptive hybrid RL algorithm, Contextualized Hybrid Ensemble Q-learning\n(CHEQ). 
CHEQ combines three key ingredients: (i) a time-invariant formulation\nof the adaptive hybrid RL problem treating the adaptive weight as a context\nvariable, (ii) a weight adaption mechanism based on the parametric uncertainty\nof a critic ensemble, and (iii) ensemble-based acceleration for data-efficient\nRL. Evaluating CHEQ on a car racing task reveals substantially stronger data\nefficiency, exploration safety, and transferability to unknown scenarios than\nstate-of-the-art adaptive hybrid RL methods.\n","authors":["Emma Cramer","Bernd Frauenknecht","Ramil Sabirov","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2406.19768v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15304v3","updated":"2024-07-01T10:48:40Z","published":"2022-10-27T10:25:51Z","title":"Explaining the Explainers in Graph Neural Networks: a Comparative Study","summary":" Following a fast initial breakthrough in graph based learning, Graph Neural\nNetworks (GNNs) have reached a widespread application in many science and\nengineering fields, prompting the need for methods to understand their decision\nprocess.\n GNN explainers have started to emerge in recent years, with a multitude of\nmethods both novel or adapted from other domains. To sort out this plethora of\nalternative approaches, several studies have benchmarked the performance of\ndifferent explainers in terms of various explainability metrics. However, these\nearlier works make no attempts at providing insights into why different GNN\narchitectures are more or less explainable, or which explainer should be\npreferred in a given setting.\n In this survey, we fill these gaps by devising a systematic experimental\nstudy, which tests ten explainers on eight representative architectures trained\non six carefully designed graph and node classification datasets. With our\nresults we provide key insights on the choice and applicability of GNN\nexplainers, we isolate key components that make them usable and successful and\nprovide recommendations on how to avoid common interpretation pitfalls. We\nconclude by highlighting open questions and directions of possible future\nresearch.\n","authors":["Antonio Longa","Steve Azzolin","Gabriele Santin","Giulia Cencetti","Pietro Liò","Bruno Lepri","Andrea Passerini"],"pdf_url":"https://arxiv.org/pdf/2210.15304v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10288v2","updated":"2024-07-01T10:17:58Z","published":"2024-06-12T18:33:11Z","title":"Mimicking User Data: On Mitigating Fine-Tuning Risks in Closed Large\n Language Models","summary":" Fine-tuning large language models on small, high-quality datasets can enhance\ntheir performance on specific downstream tasks. Recent research shows that\nfine-tuning on benign, instruction-following data can inadvertently undo the\nsafety alignment process and increase a model's propensity to comply with\nharmful queries. Although critical, understanding and mitigating safety risks\nin well-defined tasks remains distinct from the instruction-following context\ndue to structural differences in the data. Our work addresses the gap in our\nunderstanding of these risks across diverse types of data in closed models -\nwhere providers control how user data is utilized in the fine-tuning process.\nWe demonstrate how malicious actors can subtly manipulate the structure of\nalmost any task-specific dataset to foster significantly more dangerous model\nbehaviors, while maintaining an appearance of innocuity and reasonable\ndownstream task performance. 
To address this issue, we propose a novel\nmitigation strategy that mixes in safety data which mimics the task format and\nprompting style of the user data, showing this is more effective than existing\nbaselines at re-establishing safety alignment while maintaining similar task\nperformance.\n","authors":["Francisco Eiras","Aleksandar Petrov","Phillip H. S. Torr","M. Pawan Kumar","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2406.10288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02331v2","updated":"2024-07-01T10:16:38Z","published":"2023-03-04T05:34:25Z","title":"Training-Free Acceleration of ViTs with Delayed Spatial Merging","summary":" Token merging has emerged as a new paradigm that can accelerate the inference\nof Vision Transformers (ViTs) without any retraining or fine-tuning. To push\nthe frontier of training-free acceleration in ViTs, we improve token merging by\nadding the perspectives of 1) activation outliers and 2) hierarchical\nrepresentations. Through a careful analysis of the attention behavior in ViTs,\nwe characterize a delayed onset of the convergent attention phenomenon, which\nmakes token merging undesirable in the bottom blocks of ViTs. Moreover, we\naugment token merging with a hierarchical processing scheme to capture\nmulti-scale redundancy between visual tokens. Combining these two insights, we\nbuild a unified inference framework called DSM: Delayed Spatial Merging. We\nextensively evaluate DSM on various ViT model scales (Tiny to Huge) and tasks\n(ImageNet-1k and transfer learning), achieving up to 1.8$\\times$ FLOP reduction\nand 1.6$\\times$ throughput speedup at a negligible loss while being two orders\nof magnitude faster than existing methods.\n","authors":["Jung Hwan Heo","Seyedarmin Azizi","Arash Fayyazi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2303.02331v2.pdf","comment":"ICML 2024 ES-FoMo Workshop"},{"id":"http://arxiv.org/abs/2402.04858v2","updated":"2024-07-01T10:03:33Z","published":"2024-02-07T13:55:27Z","title":"CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay","summary":" Large language models are increasingly solving tasks that are commonly\nbelieved to require human-level reasoning ability. However, these models still\nperform very poorly on benchmarks of general intelligence such as the\nAbstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a\nprogramming-by-examples problem, and introduce a novel and scalable method for\nlanguage model self-improvement called Code Iteration (CodeIt). Our method\niterates between 1) program sampling and hindsight relabeling, and 2) learning\nfrom prioritized experience replay. By relabeling the goal of an episode (i.e.,\nthe target program output given input) to the realized output produced by the\nsampled program, our method effectively deals with the extreme sparsity of\nrewards in program synthesis. Applying CodeIt to the ARC dataset, we\ndemonstrate that prioritized hindsight replay, along with pre-training and\ndata-augmentation, leads to successful inter-task generalization. CodeIt is the\nfirst neuro-symbolic approach that scales to the full ARC evaluation dataset.\nOur method solves 15% of ARC evaluation tasks, achieving state-of-the-art\nperformance and outperforming existing neural and symbolic baselines. Our code\nis available at https://github.com/Qualcomm-AI-research/codeit .\n","authors":["Natasha Butt","Blazej Manczak","Auke Wiggers","Corrado Rainone","David W. 
Zhang","Michaël Defferrard","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2402.04858v2.pdf","comment":"ICML'24 camera-ready version"},{"id":"http://arxiv.org/abs/2403.13583v2","updated":"2024-07-01T09:59:47Z","published":"2024-03-20T13:33:55Z","title":"CoCoST: Automatic Complex Code Generation with Online Searching and\n Correctness Testing","summary":" Large Language Models have revolutionized code generation ability by\nconverting natural language descriptions into executable code. However,\ngenerating complex code within real-world scenarios remains challenging due to\nintricate structures, subtle bugs, understanding of advanced data types, and\nlack of supplementary contents. To address these challenges, we introduce the\nCoCoST framework, which enhances complex code generation by online searching\nfor more information with planned queries and correctness testing for code\nrefinement. Moreover, CoCoST serializes the complex inputs and outputs to\nimprove comprehension and generates test cases to ensure the adaptability for\nreal-world applications. CoCoST is validated through rigorous experiments on\nthe DS-1000 and ClassEval datasets. Experimental results show that CoCoST\nsubstantially improves the quality of complex code generation, highlighting its\npotential to enhance the practicality of LLMs in generating complex code.\n","authors":["Xinyi He","Jiaru Zou","Yun Lin","Mengyu Zhou","Shi Han","Zejian Yuan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.13583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11650v2","updated":"2024-07-01T09:57:32Z","published":"2024-06-17T15:31:54Z","title":"Multimodal Learning With Intraoperative CBCT & Variably Aligned\n Preoperative CT Data To Improve Segmentation","summary":" Cone-beam computed tomography (CBCT) is an important tool facilitating\ncomputer aided interventions, despite often suffering from artifacts that pose\nchallenges for accurate interpretation. While the degraded image quality can\naffect downstream segmentation, the availability of high quality, preoperative\nscans represents potential for improvements. Here we consider a setting where\npreoperative CT and intraoperative CBCT scans are available, however, the\nalignment (registration) between the scans is imperfect. We propose a\nmultimodal learning method that fuses roughly aligned CBCT and CT scans and\ninvestigate the effect of CBCT quality and misalignment on the final\nsegmentation performance. For that purpose, we make use of a synthetically\ngenerated data set containing real CT and synthetic CBCT volumes. As an\napplication scenario, we focus on liver and liver tumor segmentation. We show\nthat the fusion of preoperative CT and simulated, intraoperative CBCT mostly\nimproves segmentation performance (compared to using intraoperative CBCT only)\nand that even clearly misaligned preoperative data has the potential to improve\nsegmentation performance.\n","authors":["Maximilian E. Tschuchnig","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2406.11650v2.pdf","comment":"Submitted to SASHIMI2024 (MICCAI workshop)"},{"id":"http://arxiv.org/abs/2311.13580v4","updated":"2024-07-01T09:55:40Z","published":"2023-11-22T18:34:49Z","title":"$σ$-PCA: a building block for neural learning of identifiable\n linear transformations","summary":" Linear principal component analysis (PCA) learns (semi-)orthogonal\ntransformations by orienting the axes to maximize variance. 
Consequently, it\ncan only identify orthogonal axes whose variances are clearly distinct, but it\ncannot identify the subsets of axes whose variances are roughly equal. It\ncannot eliminate the subspace rotational indeterminacy: it fails to disentangle\ncomponents with equal variances (eigenvalues), resulting, in each eigen\nsubspace, in randomly rotated axes. In this paper, we propose $\\sigma$-PCA, a\nmethod that (1) formulates a unified model for linear and nonlinear PCA, the\nlatter being a special case of linear independent component analysis (ICA), and\n(2) introduces a missing piece into nonlinear PCA that allows it to eliminate,\nfrom the canonical linear PCA solution, the subspace rotational indeterminacy\n-- without whitening the inputs. Whitening, a preprocessing step which converts\nthe inputs into unit-variance inputs, has generally been a prerequisite step\nfor linear ICA methods, which meant that conventional nonlinear PCA could not\nnecessarily preserve the orthogonality of the overall transformation, could not\ndirectly reduce dimensionality, and could not intrinsically order by variances.\nWe offer insights on the relationship between linear PCA, nonlinear PCA, and\nlinear ICA -- three methods with autoencoder formulations for learning special\nlinear transformations from data, transformations that are (semi-)orthogonal\nfor PCA, and arbitrary unit-variance for ICA. As part of our formulation,\nnonlinear PCA can be seen as a method that maximizes both variance and\nstatistical independence, lying in the middle between linear PCA and linear\nICA, serving as a building block for learning linear transformations that are\nidentifiable.\n","authors":["Fahdi Kanavati","Lucy Katsnith","Masayuki Tsuneki"],"pdf_url":"https://arxiv.org/pdf/2311.13580v4.pdf","comment":"Update with published version"},{"id":"http://arxiv.org/abs/2402.07025v3","updated":"2024-07-01T09:27:34Z","published":"2024-02-10T19:12:31Z","title":"Generalization Error of Graph Neural Networks in the Mean-field Regime","summary":" This work provides a theoretical framework for assessing the generalization\nerror of graph neural networks in the over-parameterized regime, where the\nnumber of parameters surpasses the quantity of data points. We explore two\nwidely utilized types of graph neural networks: graph convolutional neural\nnetworks and message passing graph neural networks. Prior to this study,\nexisting bounds on the generalization error in the over-parametrized regime\nwere uninformative, limiting our understanding of over-parameterized network\nperformance. Our novel approach involves deriving upper bounds within the\nmean-field regime for evaluating the generalization error of these graph neural\nnetworks. We establish upper bounds with a convergence rate of $O(1/n)$, where\n$n$ is the number of graph samples. These upper bounds offer a theoretical\nassurance of the networks' performance on unseen data in the challenging\nover-parameterized regime and overall contribute to our understanding of their\nperformance.\n","authors":["Gholamali Aminian","Yixuan He","Gesine Reinert","Łukasz Szpruch","Samuel N. 
Cohen"],"pdf_url":"https://arxiv.org/pdf/2402.07025v3.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2406.14969v2","updated":"2024-07-01T09:08:44Z","published":"2024-06-21T08:28:54Z","title":"Uni-Mol2: Exploring Molecular Pretraining Model at Scale","summary":" In recent years, pretraining models have made significant advancements in the\nfields of natural language processing (NLP), computer vision (CV), and life\nsciences. The significant advancements in NLP and CV are predominantly driven\nby the expansion of model parameters and data size, a phenomenon now recognized\nas the scaling laws. However, research exploring scaling law in molecular\npretraining models remains unexplored. In this work, we present Uni-Mol2 , an\ninnovative molecular pretraining model that leverages a two-track transformer\nto effectively integrate features at the atomic level, graph level, and\ngeometry structure level. Along with this, we systematically investigate the\nscaling law within molecular pretraining models, characterizing the power-law\ncorrelations between validation loss and model size, dataset size, and\ncomputational resources. Consequently, we successfully scale Uni-Mol2 to 1.1\nbillion parameters through pretraining on 800 million conformations, making it\nthe largest molecular pretraining model to date. Extensive experiments show\nconsistent improvement in the downstream tasks as the model size grows. The\nUni-Mol2 with 1.1B parameters also outperforms existing methods, achieving an\naverage 27% improvement on the QM9 and 14% on COMPAS-1D dataset.\n","authors":["Xiaohong Ji","Zhen Wang","Zhifeng Gao","Hang Zheng","Linfeng Zhang","Guolin Ke","Weinan E"],"pdf_url":"https://arxiv.org/pdf/2406.14969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06212v5","updated":"2024-07-01T08:58:07Z","published":"2023-09-12T13:28:06Z","title":"Long-term drought prediction using deep neural networks based on\n geospatial weather data","summary":" The problem of high-quality drought forecasting up to a year in advance is\ncritical for agriculture planning and insurance. Yet, it is still unsolved with\nreasonable accuracy due to data complexity and aridity stochasticity. We tackle\ndrought data by introducing an end-to-end approach that adopts a\nspatio-temporal neural network model with accessible open monthly climate data\nas the input.\n Our systematic research employs diverse proposed models and five distinct\nenvironmental regions as a testbed to evaluate the efficacy of the Palmer\nDrought Severity Index (PDSI) prediction. Key aggregated findings are the\nexceptional performance of a Transformer model, EarthFormer, in making accurate\nshort-term (up to six months) forecasts. At the same time, the Convolutional\nLSTM excels in longer-term forecasting. Both models achieved high ROC AUC\nscores: 0.948 for one month ahead and 0.617 for twelve months ahead forecasts,\nbecoming closer to perfect ROC-AUC by $54\\%$ and $16\\%$, respectively, c.t.\nclassic approaches.\n","authors":["Alexander Marusov","Vsevolod Grabar","Yury Maximov","Nazar Sotiriadi","Alexander Bulkin","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2309.06212v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09134v2","updated":"2024-07-01T08:24:41Z","published":"2023-09-17T02:12:36Z","title":"Total Variation Distance Meets Probabilistic Inference","summary":" In this paper, we establish a novel connection between total variation (TV)\ndistance estimation and probabilistic inference. 
In particular, we present an\nefficient, structure-preserving reduction from relative approximation of TV\ndistance to probabilistic inference over directed graphical models. This\nreduction leads to a fully polynomial randomized approximation scheme (FPRAS)\nfor estimating TV distances between same-structure distributions over any class\nof Bayes nets for which there is an efficient probabilistic inference\nalgorithm. In particular, it leads to an FPRAS for estimating TV distances\nbetween distributions that are defined over a common Bayes net of small\ntreewidth. Prior to this work, such approximation schemes only existed for\nestimating TV distances between product distributions. Our approach employs a\nnew notion of $partial$ couplings of high-dimensional distributions, which\nmight be of independent interest.\n","authors":["Arnab Bhattacharyya","Sutanu Gayen","Kuldeep S. Meel","Dimitrios Myrisiotis","A. Pavan","N. V. Vinodchandran"],"pdf_url":"https://arxiv.org/pdf/2309.09134v2.pdf","comment":"25 pages. This work has been accepted for presentation at the\n International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2307.11465v5","updated":"2024-07-01T08:01:56Z","published":"2023-07-21T10:01:55Z","title":"A Deep Learning Approach for Overall Survival Prediction in Lung Cancer\n with Missing Values","summary":" In the field of lung cancer research, particularly in the analysis of overall\nsurvival (OS), artificial intelligence (AI) serves crucial roles with specific\naims. Given the prevalent issue of missing data in the medical domain, our\nprimary objective is to develop an AI model capable of dynamically handling\nthis missing data. Additionally, we aim to leverage all accessible data,\neffectively analyzing both uncensored patients who have experienced the event\nof interest and censored patients who have not, by embedding a specialized\ntechnique within our AI model, not commonly utilized in other AI tasks. Through\nthe realization of these objectives, our model aims to provide precise OS\npredictions for non-small cell lung cancer (NSCLC) patients, thus overcoming\nthese significant challenges. We present a novel approach to survival analysis\nwith missing values in the context of NSCLC, which exploits the strengths of\nthe transformer architecture to account only for available features without\nrequiring any imputation strategy. More specifically, this model tailors the\ntransformer architecture to tabular data by adapting its feature embedding and\nmasked self-attention to mask missing data and fully exploit the available\nones. By making use of ad-hoc designed losses for OS, it is able to account for\nboth censored and uncensored patients, as well as changes in risks over time.\nWe compared our method with state-of-the-art models for survival analysis\ncoupled with different imputation strategies. 
We evaluated the results obtained\nover a period of 6 years using different time granularities obtaining a\nCt-index, a time-dependent variant of the C-index, of 71.97, 77.58 and 80.72\nfor time units of 1 month, 1 year and 2 years, respectively, outperforming all\nstate-of-the-art methods regardless of the imputation method used.\n","authors":["Camillo Maria Caruso","Valerio Guarrasi","Sara Ramella","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2307.11465v5.pdf","comment":"24 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.15993v3","updated":"2024-07-01T07:55:40Z","published":"2024-02-25T05:22:45Z","title":"Model Compression Method for S4 with Diagonal State Space Layers using\n Balanced Truncation","summary":" To implement deep learning models on edge devices, model compression methods\nhave been widely recognized as useful. However, it remains unclear which model\ncompression methods are effective for Structured State Space Sequence (S4)\nmodels incorporating Diagonal State Space (DSS) layers, tailored for processing\nlong-sequence data. In this paper, we propose to use the balanced truncation, a\nprevalent model reduction technique in control theory, applied specifically to\nDSS layers in pre-trained S4 model as a novel model compression method.\nMoreover, we propose using the reduced model parameters obtained by the\nbalanced truncation as initial parameters of S4 models with DSS layers during\nthe main training process. Numerical experiments demonstrate that our trained\nmodels combined with the balanced truncation surpass conventionally trained\nmodels with Skew-HiPPO initialization in accuracy, even with fewer parameters.\nFurthermore, our observations reveal a positive correlation: higher accuracy in\nthe original model consistently leads to increased accuracy in models trained\nusing our model compression method, suggesting that our approach effectively\nleverages the strengths of the original model.\n","authors":["Haruka Ezoe","Kazuhiro Sato"],"pdf_url":"https://arxiv.org/pdf/2402.15993v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12975v2","updated":"2024-07-01T07:40:03Z","published":"2024-02-05T12:11:15Z","title":"Training morphological neural networks with gradient descent: some\n theoretical insights","summary":" Morphological neural networks, or layers, can be a powerful tool to boost the\nprogress in mathematical morphology, either on theoretical aspects such as the\nrepresentation of complete lattice operators, or in the development of image\nprocessing pipelines. However, these architectures turn out to be difficult to\ntrain when they count more than a few morphological layers, at least within\npopular machine learning frameworks which use gradient descent based\noptimization algorithms. In this paper we investigate the potential and\nlimitations of differentiation based approaches and back-propagation applied to\nmorphological networks, in light of the non-smooth optimization concept of\nBouligand derivative. 
We provide insights and first theoretical guidelines, in\nparticular regarding initialization and learning rates.\n","authors":["Samy Blusseau"],"pdf_url":"https://arxiv.org/pdf/2403.12975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12694v4","updated":"2024-07-01T07:39:50Z","published":"2024-02-20T03:45:59Z","title":"Revitalizing Multivariate Time Series Forecasting: Learnable\n Decomposition with Inter-Series Dependencies and Intra-Series Variations\n Modeling","summary":" Predicting multivariate time series is crucial, demanding precise modeling of\nintricate patterns, including inter-series dependencies and intra-series\nvariations. Distinctive trend characteristics in each time series pose\nchallenges, and existing methods, relying on basic moving average kernels, may\nstruggle with the non-linear structure and complex trends in real-world data.\nGiven that, we introduce a learnable decomposition strategy to capture dynamic\ntrend information more reasonably. Additionally, we propose a dual attention\nmodule tailored to capture inter-series dependencies and intra-series\nvariations simultaneously for better time series forecasting, which is\nimplemented by channel-wise self-attention and autoregressive self-attention.\nTo evaluate the effectiveness of our method, we conducted experiments across\neight open-source datasets and compared it with the state-of-the-art methods.\nThrough the comparison results, our Leddam (LEarnable Decomposition and Dual\nAttention Module) not only demonstrates significant advancements in predictive\nperformance, but also the proposed decomposition strategy can be plugged into\nother methods with a large performance-boosting, from 11.87% to 48.56% MSE\nerror degradation.\n","authors":["Guoqi Yu","Jing Zou","Xiaowei Hu","Angelica I. Aviles-Rivero","Jing Qin","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2402.12694v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18380v2","updated":"2024-07-01T07:13:08Z","published":"2024-06-26T14:21:21Z","title":"KAGNNs: Kolmogorov-Arnold Networks meet Graph Learning","summary":" In recent years, Graph Neural Networks (GNNs) have become the de facto tool\nfor learning node and graph representations. Most GNNs typically consist of a\nsequence of neighborhood aggregation (a.k.a., message passing) layers. Within\neach of these layers, the representation of each node is updated from an\naggregation and transformation of its neighbours representations at the\nprevious layer. The upper bound for the expressive power of message passing\nGNNs was reached through the use of MLPs as a transformation, due to their\nuniversal approximation capabilities. However, MLPs suffer from well-known\nlimitations, which recently motivated the introduction of Kolmogorov-Arnold\nNetworks (KANs). KANs rely on the Kolmogorov-Arnold representation theorem,\nrendering them a promising alternative to MLPs. In this work, we compare the\nperformance of KANs against that of MLPs in graph learning tasks. We perform\nextensive experiments on node classification, graph classification and graph\nregression datasets. Our preliminary results indicate that while KANs are\non-par with MLPs in classification tasks, they seem to have a clear advantage\nin the graph regression tasks. 
Code is available at https:\n//github.com/RomanBresson/KAGNN.\n","authors":["Roman Bresson","Giannis Nikolentzos","George Panagopoulos","Michail Chatzianastasis","Jun Pang","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2406.18380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18664v2","updated":"2024-07-01T07:12:45Z","published":"2024-06-26T18:09:46Z","title":"Evaluating Copyright Takedown Methods for Language Models","summary":" Language models (LMs) derive their capabilities from extensive training on\ndiverse data, including potentially copyrighted material. These models can\nmemorize and generate content similar to their training data, posing potential\nconcerns. Therefore, model creators are motivated to develop mitigation methods\nthat prevent generating protected content. We term this procedure as copyright\ntakedowns for LMs, noting the conceptual similarity to (but legal distinction\nfrom) the DMCA takedown This paper introduces the first evaluation of the\nfeasibility and side effects of copyright takedowns for LMs. We propose\nCoTaEval, an evaluation framework to assess the effectiveness of copyright\ntakedown methods, the impact on the model's ability to retain uncopyrightable\nfactual knowledge from the training data whose recitation is embargoed, and how\nwell the model maintains its general utility and efficiency. We examine several\nstrategies, including adding system prompts, decoding-time filtering\ninterventions, and unlearning approaches. Our findings indicate that no tested\nmethod excels across all metrics, showing significant room for research in this\nunique problem setting and indicating potential unresolved challenges for live\npolicy proposals.\n","authors":["Boyi Wei","Weijia Shi","Yangsibo Huang","Noah A. Smith","Chiyuan Zhang","Luke Zettlemoyer","Kai Li","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2406.18664v2.pdf","comment":"31 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2402.05162v3","updated":"2024-07-01T07:11:17Z","published":"2024-02-07T18:34:38Z","title":"Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank\n Modifications","summary":" Large language models (LLMs) show inherent brittleness in their safety\nmechanisms, as evidenced by their susceptibility to jailbreaking and even\nnon-malicious fine-tuning. This study explores this brittleness of safety\nalignment by leveraging pruning and low-rank modifications. We develop methods\nto identify critical regions that are vital for safety guardrails, and that are\ndisentangled from utility-relevant regions at both the neuron and rank levels.\nSurprisingly, the isolated regions we find are sparse, comprising about $3\\%$\nat the parameter level and $2.5\\%$ at the rank level. Removing these regions\ncompromises safety without significantly impacting utility, corroborating the\ninherent brittleness of the model's safety mechanisms. Moreover, we show that\nLLMs remain vulnerable to low-cost fine-tuning attacks even when modifications\nto the safety-critical regions are restricted. These findings underscore the\nurgent need for more robust safety strategies in LLMs.\n","authors":["Boyi Wei","Kaixuan Huang","Yangsibo Huang","Tinghao Xie","Xiangyu Qi","Mengzhou Xia","Prateek Mittal","Mengdi Wang","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2402.05162v3.pdf","comment":"22 pages, 9 figures. 
Project page is available at\n https://boyiwei.com/alignment-attribution/"},{"id":"http://arxiv.org/abs/2404.09562v2","updated":"2024-07-01T06:46:36Z","published":"2024-04-15T08:22:47Z","title":"σ-GPTs: A New Approach to Autoregressive Models","summary":" Autoregressive models, such as the GPT family, use a fixed order, usually\nleft-to-right, to generate sequences. However, this is not a necessity. In this\npaper, we challenge this assumption and show that by simply adding a positional\nencoding for the output, this order can be modulated on-the-fly per-sample\nwhich offers key advantageous properties. It allows for the sampling of and\nconditioning on arbitrary subsets of tokens, and it also allows sampling in one\nshot multiple tokens dynamically according to a rejection strategy, leading to\na sub-linear number of model evaluations. We evaluate our method across various\ndomains, including language modeling, path-solving, and aircraft vertical rate\nprediction, decreasing the number of steps required for generation by an order\nof magnitude.\n","authors":["Arnaud Pannatier","Evann Courdier","François Fleuret"],"pdf_url":"https://arxiv.org/pdf/2404.09562v2.pdf","comment":"23 pages, 7 figures, accepted at ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2402.05330v2","updated":"2024-07-01T05:51:25Z","published":"2024-02-08T00:12:18Z","title":"Classification under Nuisance Parameters and Generalized Label Shift in\n Likelihood-Free Inference","summary":" An open scientific challenge is how to classify events with reliable measures\nof uncertainty, when we have a mechanistic model of the data-generating process\nbut the distribution over both labels and latent nuisance parameters is\ndifferent between train and target data. We refer to this type of\ndistributional shift as generalized label shift (GLS). Direct classification\nusing observed data $\\mathbf{X}$ as covariates leads to biased predictions and\ninvalid uncertainty estimates of labels $Y$. We overcome these biases by\nproposing a new method for robust uncertainty quantification that casts\nclassification as a hypothesis testing problem under nuisance parameters. The\nkey idea is to estimate the classifier's receiver operating characteristic\n(ROC) across the entire nuisance parameter space, which allows us to devise\ncutoffs that are invariant under GLS. Our method effectively endows a\npre-trained classifier with domain adaptation capabilities and returns valid\nprediction sets while maintaining high power. We demonstrate its performance on\ntwo challenging scientific problems in biology and astroparticle physics with\ndata from realistic mechanistic models.\n","authors":["Luca Masserano","Alex Shen","Michele Doro","Tommaso Dorigo","Rafael Izbicki","Ann B. Lee"],"pdf_url":"https://arxiv.org/pdf/2402.05330v2.pdf","comment":"26 pages, 19 figures, code available at\n https://github.com/lee-group-cmu/lf2i"},{"id":"http://arxiv.org/abs/2406.18665v2","updated":"2024-07-01T05:38:08Z","published":"2024-06-26T18:10:22Z","title":"RouteLLM: Learning to Route LLMs with Preference Data","summary":" Large language models (LLMs) exhibit impressive capabilities across a wide\nrange of tasks, yet the choice of which model to use often involves a trade-off\nbetween performance and cost. More powerful models, though effective, come with\nhigher expenses, while less capable models are more cost-effective. 
To address\nthis dilemma, we propose several efficient router models that dynamically\nselect between a stronger and a weaker LLM during inference, aiming to optimize\nthe balance between cost and response quality. We develop a training framework\nfor these routers leveraging human preference data and data augmentation\ntechniques to enhance performance. Our evaluation on widely-recognized\nbenchmarks shows that our approach significantly reduces costs-by over 2 times\nin certain cases-without compromising the quality of responses. Interestingly,\nour router models also demonstrate significant transfer learning capabilities,\nmaintaining their performance even when the strong and weak models are changed\nat test time. This highlights the potential of these routers to provide a\ncost-effective yet high-performance solution for deploying LLMs.\n","authors":["Isaac Ong","Amjad Almahairi","Vincent Wu","Wei-Lin Chiang","Tianhao Wu","Joseph E. Gonzalez","M Waleed Kadous","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2406.18665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.13836v3","updated":"2024-07-01T05:06:56Z","published":"2022-09-28T04:54:37Z","title":"Mutual Information Assisted Ensemble Recommender System for Identifying\n Critical Risk Factors in Healthcare Prognosis","summary":" Purpose: Health recommenders act as important decision support systems,\naiding patients and medical professionals in taking actions that lead to\npatients' well-being. These systems extract the information which may be of\nparticular relevance to the end-user, helping them in making appropriate\ndecisions. The present study proposes a feature recommender, as a part of a\ndisease management system, that identifies and recommends the most important\nrisk factors for an illness.\n Methods: A novel mutual information and ensemble-based feature ranking\napproach for identifying critical risk factors in healthcare prognosis is\nproposed.\n Results: To establish the effectiveness of the proposed method, experiments\nhave been conducted on four benchmark datasets of diverse diseases (clear cell\nrenal cell carcinoma (ccRCC), chronic kidney disease, Indian liver patient, and\ncervical cancer risk factors). The performance of the proposed recommender is\ncompared with four state-of-the-art methods using recommender systems'\nperformance metrics like average precision@K, precision@K, recall@K, F1@K,\nreciprocal rank@K. The method is able to recommend all relevant critical risk\nfactors for ccRCC. It also attains a higher accuracy (96.6% and 98.6% using\nsupport vector machine and neural network, respectively) for ccRCC staging with\na reduced feature set as compared to existing methods. Moreover, the top two\nfeatures recommended using the proposed method with ccRCC, viz. size of tumor\nand metastasis status, are medically validated from the existing TNM system.\nResults are also found to be superior for the other three datasets.\n Conclusion: The proposed recommender can identify and recommend risk factors\nthat have the most discriminating power for detecting diseases.\n","authors":["Abhishek Dey","Debayan Goswami","Rahul Roy","Susmita Ghosh","Yu Shrike Zhang","Jonathan H. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2209.13836v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01041v2","updated":"2024-07-01T05:06:32Z","published":"2024-05-02T06:53:40Z","title":"Efficient and Flexible Method for Reducing Moderate-size Deep Neural\n Networks with Condensation","summary":" Neural networks have been extensively applied to a variety of tasks,\nachieving astounding results. Applying neural networks in the scientific field\nis an important research direction that is gaining increasing attention. In\nscientific applications, the scale of neural networks is generally\nmoderate-size, mainly to ensure the speed of inference during application.\nAdditionally, comparing neural networks to traditional algorithms in scientific\napplications is inevitable. These applications often require rapid\ncomputations, making the reduction of neural network sizes increasingly\nimportant. Existing work has found that the powerful capabilities of neural\nnetworks are primarily due to their non-linearity. Theoretical work has\ndiscovered that under strong non-linearity, neurons in the same layer tend to\nbehave similarly, a phenomenon known as condensation. Condensation offers an\nopportunity to reduce the scale of neural networks to a smaller subnetwork with\nsimilar performance. In this article, we propose a condensation reduction\nalgorithm to verify the feasibility of this idea in practical problems. Our\nreduction method can currently be applied to both fully connected networks and\nconvolutional networks, achieving positive results. In complex combustion\nacceleration tasks, we reduced the size of the neural network to 41.7% of its\noriginal scale while maintaining prediction accuracy. In the CIFAR10 image\nclassification task, we reduced the network size to 11.5% of the original\nscale, still maintaining a satisfactory validation accuracy. Our method can be\napplied to most trained neural networks, reducing computational pressure and\nimproving inference speed.\n","authors":["Tianyi Chen","Zhi-Qin John Xu"],"pdf_url":"https://arxiv.org/pdf/2405.01041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13314v2","updated":"2024-07-01T05:05:59Z","published":"2022-11-23T21:33:34Z","title":"CoMadOut -- A Robust Outlier Detection Algorithm based on CoMAD","summary":" Unsupervised learning methods are well established in the area of anomaly\ndetection and achieve state of the art performances on outlier datasets.\nOutliers play a significant role, since they bear the potential to distort the\npredictions of a machine learning algorithm on a given dataset. Especially\namong PCA-based methods, outliers have an additional destructive potential\nregarding the result: they may not only distort the orientation and translation\nof the principal components, they also make it more complicated to detect\noutliers. To address this problem, we propose the robust outlier detection\nalgorithm CoMadOut, which satisfies two required properties: (1) being robust\ntowards outliers and (2) detecting them. Our CoMadOut outlier detection\nvariants using comedian PCA define, dependent on its variant, an inlier region\nwith a robust noise margin by measures of in-distribution (variant CMO) and\noptimized scores by measures of out-of-distribution (variants CMO*), e.g.\nkurtosis-weighting by CMO+k. These measures allow distribution based outlier\nscoring for each principal component, and thus, an appropriate alignment of the\ndegree of outlierness between normal and abnormal instances. 
Experiments\ncomparing CoMadOut with traditional, deep and other comparable robust outlier\ndetection methods showed that the performance of the introduced CoMadOut\napproach is competitive to well established methods related to average\nprecision (AP), area under the precision recall curve (AUPRC) and area under\nthe receiver operating characteristic (AUROC) curve. In summary our approach\ncan be seen as a robust alternative for outlier detection tasks.\n","authors":["Andreas Lohrer","Daniyal Kazempour","Maximilian Hünemörder","Peer Kröger"],"pdf_url":"https://arxiv.org/pdf/2211.13314v2.pdf","comment":"published in Springer Machine Learning Journal (MLJ)"},{"id":"http://arxiv.org/abs/2406.19549v2","updated":"2024-07-01T04:52:56Z","published":"2024-06-27T22:01:00Z","title":"ASCENT: Amplifying Power Side-Channel Resilience via Learning &\n Monte-Carlo Tree Search","summary":" Power side-channel (PSC) analysis is pivotal for securing cryptographic\nhardware. Prior art focused on securing gate-level netlists obtained as-is from\nchip design automation, neglecting all the complexities and potential\nside-effects for security arising from the design automation process. That is,\nautomation traditionally prioritizes power, performance, and area (PPA),\nsidelining security. We propose a \"security-first\" approach, refining the logic\nsynthesis stage to enhance the overall resilience of PSC countermeasures. We\nintroduce ASCENT, a learning-and-search-based framework that (i) drastically\nreduces the time for post-design PSC evaluation and (ii) explores the\nsecurity-vs-PPA design space. Thus, ASCENT enables an efficient exploration of\na large number of candidate netlists, leading to an improvement in PSC\nresilience compared to regular PPA-optimized netlists. ASCENT is up to 120x\nfaster than traditional PSC analysis and yields a 3.11x improvement for PSC\nresilience of state-of-the-art PSC countermeasures\n","authors":["Jitendra Bhandari","Animesh Basak Chowdhury","Mohammed Nabeel","Ozgur Sinanoglu","Siddharth Garg","Ramesh Karri","Johann Knechtel"],"pdf_url":"https://arxiv.org/pdf/2406.19549v2.pdf","comment":"Accepted at 2024 ACM/IEEE International Conference on Computer-Aided\n Design"},{"id":"http://arxiv.org/abs/2406.04824v2","updated":"2024-07-01T04:48:24Z","published":"2024-06-07T10:49:59Z","title":"FunBO: Discovering Acquisition Functions for Bayesian Optimization with\n FunSearch","summary":" The sample efficiency of Bayesian optimization algorithms depends on\ncarefully crafted acquisition functions (AFs) guiding the sequential collection\nof function evaluations. The best-performing AF can vary significantly across\noptimization problems, often requiring ad-hoc and problem-specific choices.\nThis work tackles the challenge of designing novel AFs that perform well across\na variety of experimental settings. Based on FunSearch, a recent work using\nLarge Language Models (LLMs) for discovery in mathematical sciences, we propose\nFunBO, an LLM-based method that can be used to learn new AFs written in\ncomputer code by leveraging access to a limited number of evaluations for a set\nof objective functions. We provide the analytic expression of all discovered\nAFs and evaluate them on various global optimization benchmarks and\nhyperparameter optimization tasks. 
We show how FunBO identifies AFs that\ngeneralize well in and out of the training distribution of functions, thus\noutperforming established general-purpose AFs and achieving competitive\nperformance against AFs that are customized to specific function types and are\nlearned via transfer-learning algorithms.\n","authors":["Virginia Aglietti","Ira Ktena","Jessica Schrouff","Eleni Sgouritsa","Francisco J. R. Ruiz","Alan Malek","Alexis Bellot","Silvia Chiappa"],"pdf_url":"https://arxiv.org/pdf/2406.04824v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17480v2","updated":"2024-07-01T04:35:14Z","published":"2024-03-26T08:22:09Z","title":"Capacity Provisioning Motivated Online Non-Convex Optimization Problem\n with Memory and Switching Cost","summary":" An online non-convex optimization problem is considered where the goal is to\nminimize the flow time (total delay) of a set of jobs by modulating the number\nof active servers, but with a switching cost associated with changing the\nnumber of active servers over time. Each job can be processed by at most one\nfixed speed server at any time. Compared to the usual online convex\noptimization (OCO) problem with switching cost, the objective function\nconsidered is non-convex and more importantly, at each time, it depends on all\npast decisions and not just the present one. Both worst-case and stochastic\ninputs are considered; for both cases, competitive algorithms are derived.\n","authors":["Rahul Vaze","Jayakrishnan Nair"],"pdf_url":"https://arxiv.org/pdf/2403.17480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13300v3","updated":"2024-07-01T04:01:11Z","published":"2024-05-22T02:37:02Z","title":"FAITH: Frequency-domain Attention In Two Horizons for Time Series\n Forecasting","summary":" Time Series Forecasting plays a crucial role in various fields such as\nindustrial equipment maintenance, meteorology, energy consumption, traffic flow\nand financial investment. However, despite their considerable advantages over\ntraditional statistical approaches, current deep learning-based predictive\nmodels often exhibit a significant deviation between their forecasting outcomes\nand the ground truth. This discrepancy is largely due to an insufficient\nemphasis on extracting the sequence's latent information, particularly its\nglobal information within the frequency domain and the relationship between\ndifferent variables. To address this issue, we propose a novel model\nFrequency-domain Attention In Two Horizons, which decomposes time series into\ntrend and seasonal components using a multi-scale sequence adaptive\ndecomposition and fusion architecture, and processes them separately. FAITH\nutilizes Frequency Channel feature Extraction Module and Frequency Temporal\nfeature Extraction Module to capture inter-channel relationships and temporal\nglobal information in the sequence, significantly improving its ability to\nhandle long-term dependencies and complex patterns. Furthermore, FAITH achieves\ntheoretically linear complexity by modifying the time-frequency domain\ntransformation method, effectively reducing computational costs. Extensive\nexperiments on 6 benchmarks for long-term forecasting and 3 benchmarks for\nshort-term forecasting demonstrate that FAITH outperforms existing models in\nmany fields, such as electricity, weather and traffic, proving its\neffectiveness and superiority both in long-term and short-term time series\nforecasting tasks. 
Our codes and data are available at\nhttps://github.com/LRQ577/FAITH.\n","authors":["Ruiqi Li","Maowei Jiang","Kai Wang","Kaiduo Feng","Quangao Liu","Yue Sun","Xiufang Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.13300v3.pdf","comment":"We think there are some errors in the experiment result, it may lead\n to a wrong conclusion. So we think it will be responsible to withdraw it"},{"id":"http://arxiv.org/abs/2303.01504v3","updated":"2024-07-01T03:33:55Z","published":"2023-03-01T12:31:07Z","title":"Backdoor for Debias: Mitigating Model Bias with Backdoor Attack-based\n Artificial Bias","summary":" With the swift advancement of deep learning, state-of-the-art algorithms have\nbeen utilized in various social situations. Nonetheless, some algorithms have\nbeen discovered to exhibit biases and provide unequal results. The current\ndebiasing methods face challenges such as poor utilization of data or intricate\ntraining requirements. In this work, we found that the backdoor attack can\nconstruct an artificial bias similar to the model bias derived in standard\ntraining. Considering the strong adjustability of backdoor triggers, we are\nmotivated to mitigate the model bias by carefully designing reverse artificial\nbias created from backdoor attack. Based on this, we propose a backdoor\ndebiasing framework based on knowledge distillation, which effectively reduces\nthe model bias from original data and minimizes security risks from the\nbackdoor attack. The proposed solution is validated on both image and\nstructured datasets, showing promising results. This work advances the\nunderstanding of backdoor attacks and highlights its potential for beneficial\napplications. The code for the study can be found at\n\\url{https://anonymous.4open.science/r/DwB-BC07/}.\n","authors":["Shangxi Wu","Qiuyang He","Jian Yu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2303.01504v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16522v3","updated":"2024-07-01T03:21:38Z","published":"2024-05-26T11:17:49Z","title":"Multi-State TD Target for Model-Free Reinforcement Learning","summary":" Temporal difference (TD) learning is a fundamental technique in reinforcement\nlearning that updates value estimates for states or state-action pairs using a\nTD target. This target represents an improved estimate of the true value by\nincorporating both immediate rewards and the estimated value of subsequent\nstates. Traditionally, TD learning relies on the value of a single subsequent\nstate. We propose an enhanced multi-state TD (MSTD) target that utilizes the\nestimated values of multiple subsequent states. Building on this new MSTD\nconcept, we develop complete actor-critic algorithms that include management of\nreplay buffers in two modes, and integrate with deep deterministic policy\noptimization (DDPG) and soft actor-critic (SAC). Experimental results\ndemonstrate that algorithms employing the MSTD target significantly improve\nlearning performance compared to traditional methods.The code is provided on\nGitHub.\n","authors":["Wuhao Wang","Zhiyong Chen","Lepeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.16522v3.pdf","comment":"8 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.10529v2","updated":"2024-07-01T03:10:34Z","published":"2023-07-20T02:07:20Z","title":"Fast Unsupervised Deep Outlier Model Selection with Hypernetworks","summary":" Outlier detection (OD) finds many applications with a rich literature of\nnumerous techniques. 
Deep neural network based OD (DOD) has seen a recent surge\nof attention thanks to the many advances in deep learning. In this paper, we\nconsider a critical-yet-understudied challenge with unsupervised DOD, that is,\neffective hyperparameter (HP) tuning/model selection. While several prior work\nreport the sensitivity of OD models to HPs, it becomes ever so critical for the\nmodern DOD models that exhibit a long list of HPs. We introduce HYPER for\ntuning DOD models, tackling two fundamental challenges: (1) validation without\nsupervision (due to lack of labeled anomalies), and (2) efficient search of the\nHP/model space (due to exponential growth in the number of HPs). A key idea is\nto design and train a novel hypernetwork (HN) that maps HPs onto optimal\nweights of the main DOD model. In turn, HYPER capitalizes on a single HN that\ncan dynamically generate weights for many DOD models (corresponding to varying\nHPs), which offers significant speed-up. In addition, it employs meta-learning\non historical OD tasks with labels to train a proxy validation function,\nlikewise trained with our proposed HN efficiently. Extensive experiments on 35\nOD tasks show that HYPER achieves high performance against 8 baselines with\nsignificant efficiency gains.\n","authors":["Xueying Ding","Yue Zhao","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2307.10529v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.11838v5","updated":"2024-07-01T02:51:58Z","published":"2024-02-19T05:04:11Z","title":"UniST: A Prompt-Empowered Universal Model for Urban Spatio-Temporal\n Prediction","summary":" Urban spatio-temporal prediction is crucial for informed decision-making,\nsuch as traffic management, resource optimization, and emergence response.\nDespite remarkable breakthroughs in pretrained natural language models that\nenable one model to handle diverse tasks, a universal solution for\nspatio-temporal prediction remains challenging Existing prediction approaches\nare typically tailored for specific spatio-temporal scenarios, requiring\ntask-specific model designs and extensive domain-specific training data. In\nthis study, we introduce UniST, a universal model designed for general urban\nspatio-temporal prediction across a wide range of scenarios. Inspired by large\nlanguage models, UniST achieves success through: (i) utilizing diverse\nspatio-temporal data from different scenarios, (ii) effective pre-training to\ncapture complex spatio-temporal dynamics, (iii) knowledge-guided prompts to\nenhance generalization capabilities. These designs together unlock the\npotential of building a universal model for various scenarios Extensive\nexperiments on more than 20 spatio-temporal scenarios demonstrate UniST's\nefficacy in advancing state-of-the-art performance, especially in few-shot and\nzero-shot prediction. 
The datasets and code implementation are released on\nhttps://github.com/tsinghua-fib-lab/UniST.\n","authors":["Yuan Yuan","Jingtao Ding","Jie Feng","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2402.11838v5.pdf","comment":"2024 ACM SIGKDD International Conference on Knowledge Discovery and\n Data Mining, KDD 2024"},{"id":"http://arxiv.org/abs/2402.16710v2","updated":"2024-07-01T02:35:19Z","published":"2024-02-26T16:27:08Z","title":"Cost Aware Best Arm Identification","summary":" In this paper, we study a best arm identification problem with dual objects.\nIn addition to the classic reward, each arm is associated with a cost\ndistribution and the goal is to identify the largest reward arm using the\nminimum expected cost. We call it \\emph{Cost Aware Best Arm Identification}\n(CABAI), which captures the separation of testing and implementation phases in\nproduct development pipelines and models the objective shift between phases,\ni.e., cost for testing and reward for implementation. We first derive a\ntheoretical lower bound for CABAI and propose an algorithm called\n$\\mathsf{CTAS}$ to match it asymptotically. To reduce the computation of\n$\\mathsf{CTAS}$, we further propose a simple algorithm called \\emph{Chernoff\nOverlap} (CO), based on a square-root rule, which we prove is optimal in\nsimplified two-armed models and generalizes well in numerical experiments. Our\nresults show that (i) ignoring the heterogeneous action cost results in\nsub-optimality in practice, and (ii) simple algorithms can deliver near-optimal\nperformance over a wide range of problems.\n","authors":["Kellen Kanarios","Qining Zhang","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2402.16710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02798v2","updated":"2024-07-01T02:19:36Z","published":"2023-11-05T23:47:52Z","title":"From molecules to scaffolds to functional groups: building\n context-dependent molecular representation via multi-channel learning","summary":" Reliable molecular property prediction is essential for various scientific\nendeavors and industrial applications, such as drug discovery. However, the\ndata scarcity, combined with the highly non-linear causal relationships between\nphysicochemical and biological properties and conventional molecular\nfeaturization schemes, complicates the development of robust molecular machine\nlearning models. Self-supervised learning (SSL) has emerged as a popular\nsolution, utilizing large-scale, unannotated molecular data to learn a\nfoundational representation of chemical space that might be advantageous for\ndownstream tasks. Yet, existing molecular SSL methods largely overlook chemical\nknowledge, including molecular structure similarity, scaffold composition, and\nthe context-dependent aspects of molecular properties when operating over the\nchemical space. They also struggle to learn the subtle variations in\nstructure-activity relationship. This paper introduces a novel pre-training\nframework that learns robust and generalizable chemical knowledge. It leverages\nthe structural hierarchy within the molecule, embeds them through distinct\npre-training tasks across channels, and aggregates channel information in a\ntask-specific manner during fine-tuning. 
Our approach demonstrates competitive\nperformance across various molecular property benchmarks and offers strong\nadvantages in particularly challenging yet ubiquitous scenarios like activity\ncliffs.\n","authors":["Yue Wan","Jialu Wu","Tingjun Hou","Chang-Yu Hsieh","Xiaowei Jia"],"pdf_url":"https://arxiv.org/pdf/2311.02798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18334v3","updated":"2024-07-01T02:10:50Z","published":"2024-05-28T16:28:51Z","title":"SketchQL Demonstration: Zero-shot Video Moment Querying with Sketches","summary":" In this paper, we will present SketchQL, a video database management system\n(VDBMS) for retrieving video moments with a sketch-based query interface. This\nnovel interface allows users to specify object trajectory events with simple\nmouse drag-and-drop operations. Users can use trajectories of single objects as\nbuilding blocks to compose complex events. Using a pre-trained model that\nencodes trajectory similarity, SketchQL achieves zero-shot video moments\nretrieval by performing similarity searches over the video to identify clips\nthat are the most similar to the visual query. In this demonstration, we\nintroduce the graphic user interface of SketchQL and detail its functionalities\nand interaction mechanisms. We also demonstrate the end-to-end usage of\nSketchQL from query composition to video moments retrieval using real-world\nscenarios.\n","authors":["Renzhi Wu","Pramod Chunduri","Dristi J Shah","Ashmitha Julius Aravind","Ali Payani","Xu Chu","Joy Arulraj","Kexin Rong"],"pdf_url":"https://arxiv.org/pdf/2405.18334v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19602v2","updated":"2024-07-01T02:10:16Z","published":"2024-06-28T02:18:16Z","title":"A Survey on Deep Clustering: From the Prior Perspective","summary":" Facilitated by the powerful feature extraction ability of neural networks,\ndeep clustering has achieved great success in analyzing high-dimensional and\ncomplex real-world data. The performance of deep clustering methods is affected\nby various factors such as network structures and learning objectives. However,\nas pointed out in this survey, the essence of deep clustering lies in the\nincorporation and utilization of prior knowledge, which is largely ignored by\nexisting works. From pioneering deep clustering methods based on data structure\nassumptions to recent contrastive clustering methods based on data augmentation\ninvariances, the development of deep clustering intrinsically corresponds to\nthe evolution of prior knowledge. In this survey, we provide a comprehensive\nreview of deep clustering methods by categorizing them into six types of prior\nknowledge. We find that in general the prior innovation follows two trends,\nnamely, i) from mining to constructing, and ii) from internal to external.\nBesides, we provide a benchmark on five widely-used datasets and analyze the\nperformance of methods with diverse priors. 
By providing a novel prior\nknowledge perspective, we hope this survey could provide some novel insights\nand inspire future research in the deep clustering community.\n","authors":["Yiding Lu","Haobin Li","Yunfan Li","Yijie Lin","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2406.19602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11829v3","updated":"2024-07-01T02:06:42Z","published":"2023-10-18T09:31:21Z","title":"Towards Graph Foundation Models: A Survey and Beyond","summary":" Foundation models have emerged as critical components in a variety of\nartificial intelligence applications, and showcase significant success in\nnatural language processing and several other domains. Meanwhile, the field of\ngraph machine learning is witnessing a paradigm transition from shallow methods\nto more sophisticated deep learning approaches. The capabilities of foundation\nmodels to generalize and adapt motivate graph machine learning researchers to\ndiscuss the potential of developing a new graph learning paradigm. This\nparadigm envisions models that are pre-trained on extensive graph data and can\nbe adapted for various graph tasks. Despite this burgeoning interest, there is\na noticeable lack of clear definitions and systematic analyses pertaining to\nthis new domain. To this end, this article introduces the concept of Graph\nFoundation Models (GFMs), and offers an exhaustive explanation of their key\ncharacteristics and underlying technologies. We proceed to classify the\nexisting work related to GFMs into three distinct categories, based on their\ndependence on graph neural networks and large language models. In addition to\nproviding a thorough review of the current state of GFMs, this article also\noutlooks potential avenues for future research in this rapidly evolving domain.\n","authors":["Jiawei Liu","Cheng Yang","Zhiyuan Lu","Junze Chen","Yibo Li","Mengmei Zhang","Ting Bai","Yuan Fang","Lichao Sun","Philip S. Yu","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2310.11829v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01856v1","updated":"2024-07-01T23:56:56Z","published":"2024-07-01T23:56:56Z","title":"Adaptive RKHS Fourier Features for Compositional Gaussian Process Models","summary":" Deep Gaussian Processes (DGPs) leverage a compositional structure to model\nnon-stationary processes. DGPs typically rely on local inducing point\napproximations across intermediate GP layers. Recent advances in DGP inference\nhave shown that incorporating global Fourier features from Reproducing Kernel\nHilbert Space (RKHS) can enhance the DGPs' capability to capture complex\nnon-stationary patterns. This paper extends the use of these features to\ncompositional GPs involving linear transformations. In particular, we introduce\nOrdinary Differential Equation (ODE) -based RKHS Fourier features that allow\nfor adaptive amplitude and phase modulation through convolution operations.\nThis convolutional formulation relates our work to recently proposed deep\nlatent force models, a multi-layer structure designed for modelling nonlinear\ndynamical systems. By embedding these adjustable RKHS Fourier features within a\ndoubly stochastic variational inference framework, our model exhibits improved\npredictive performance across various regression tasks.\n","authors":["Xinxing Shi","Thomas Baldwin-McDonald","Mauricio A. 
Álvarez"],"pdf_url":"https://arxiv.org/pdf/2407.01856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01853v1","updated":"2024-07-01T23:47:09Z","published":"2024-07-01T23:47:09Z","title":"Improving Multilingual Instruction Finetuning via Linguistically Natural\n and Diverse Datasets","summary":" Advancements in Large Language Models (LLMs) have significantly enhanced\ninstruction-following capabilities. However, most Instruction Fine-Tuning (IFT)\ndatasets are predominantly in English, limiting model performance in other\nlanguages. Traditional methods for creating multilingual IFT datasets such as\ntranslating existing English IFT datasets or converting existing NLP datasets\ninto IFT datasets by templating, struggle to capture linguistic nuances and\nensure prompt (instruction) diversity. To address this issue, we propose a\nnovel method for collecting multilingual IFT datasets that preserves linguistic\nnaturalness and ensures prompt diversity. This approach leverages\nEnglish-focused LLMs, monolingual corpora, and a scoring function to create\nhigh-quality, diversified IFT datasets in multiple languages. Experiments\ndemonstrate that LLMs finetuned using these IFT datasets show notable\nimprovements in both generative and discriminative tasks, indicating enhanced\nlanguage comprehension by LLMs in non-English contexts. Specifically, on the\nmultilingual summarization task, LLMs using our IFT dataset achieved 17.57% and\n15.23% improvements over LLMs fine-tuned with translation-based and\ntemplate-based datasets, respectively.\n","authors":["Sathish Reddy Indurthi","Wenxuan Zhou","Shamil Chollampatt","Ravi Agrawal","Kaiqiang Song","Lingxiao Zhao","Chenguang Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.01853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01851v1","updated":"2024-07-01T23:32:25Z","published":"2024-07-01T23:32:25Z","title":"Meerkat: Audio-Visual Large Language Model for Grounding in Space and\n Time","summary":" Leveraging Large Language Models' remarkable proficiency in text-based tasks,\nrecent works on Multi-modal LLMs (MLLMs) extend them to other modalities like\nvision and audio. However, the progress in these directions has been mostly\nfocused on tasks that only require a coarse-grained understanding of the\naudio-visual semantics. We present Meerkat, an audio-visual LLM equipped with a\nfine-grained understanding of image and audio both spatially and temporally.\nWith a new modality alignment module based on optimal transport and a\ncross-attention module that enforces audio-visual consistency, Meerkat can\ntackle challenging tasks such as audio referred image grounding, image guided\naudio temporal localization, and audio-visual fact-checking. Moreover, we\ncarefully curate a large dataset AVFIT that comprises 3M instruction tuning\nsamples collected from open-source datasets, and introduce MeerkatBench that\nunifies five challenging audio-visual tasks. 
We achieve state-of-the-art\nperformance on all these downstream tasks with a relative improvement of up to\n37.12%.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Subhrajyoti Dasgupta","Jun Chen","Mohamed Elhoseiny","Ruohan Gao","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2407.01851v1.pdf","comment":"Accepted at ECCV 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.14485v3","updated":"2024-07-01T16:49:09Z","published":"2024-06-20T16:48:14Z","title":"Proceedings of The second international workshop on eXplainable AI for\n the Arts (XAIxArts)","summary":" This second international workshop on explainable AI for the Arts (XAIxArts)\nbrought together a community of researchers in HCI, Interaction Design, AI,\nexplainable AI (XAI), and digital arts to explore the role of XAI for the Arts.\nWorkshop held at the 16th ACM Conference on Creativity and Cognition (C&C\n2024), Chicago, USA.\n","authors":["Nick Bryan-Kinns","Corey Ford","Shuoyang Zheng","Helen Kennedy","Alan Chamberlain","Makayla Lewis","Drew Hemment","Zijin Li","Qiong Wu","Lanxi Xiao","Gus Xia","Jeba Rezwana","Michael Clemens","Gabriel Vigliensoni"],"pdf_url":"https://arxiv.org/pdf/2406.14485v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00925v1","updated":"2024-07-01T03:16:51Z","published":"2024-07-01T03:16:51Z","title":"SIDQL: An Efficient Keyframe Extraction and Motion Reconstruction\n Framework in Motion Capture","summary":" Metaverse, which integrates the virtual and physical worlds, has emerged as\nan innovative paradigm for changing people's lifestyles. Motion capture has\nbecome a reliable approach to achieve seamless synchronization of the movements\nbetween avatars and human beings, which plays an important role in diverse\nMetaverse applications. However, due to the continuous growth of data, current\ncommunication systems face a significant challenge of meeting the demand of\nultra-low latency during application. In addition, current methods also have\nshortcomings when selecting keyframes, e.g., relying on recognizing motion\ntypes and artificially selected keyframes. Therefore, the utilization of\nkeyframe extraction and motion reconstruction techniques could be considered a\nfeasible and promising solution. In this work, a new motion reconstruction\nalgorithm is designed in a spherical coordinate system involving location and\nvelocity information. Then, we formalize the keyframe extraction problem into\nan optimization problem to reduce the reconstruction error. Using Deep\nQ-Learning (DQL), the Spherical Interpolation based Deep Q-Learning (SIDQL)\nframework is proposed to generate proper keyframes for reconstructing the\nmotion sequences. We use the CMU database to train and evaluate the framework.\nOur scheme can significantly reduce the data volume and transmission latency\ncompared to various baselines while maintaining a reconstruction error of less\nthan 0.09 when extracting five keyframes.\n","authors":["Xuling Zhang","Ziru Zhang","Yuyang Wang","Lik-hang Lee","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2407.00925v1.pdf","comment":null}]},"2024-06-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2403.19708v3","updated":"2024-06-30T23:50:38Z","published":"2024-03-23T10:42:49Z","title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations\n with CachedAttention","summary":" Interacting with humans through multi-turn conversations is a fundamental\nfeature of large language models (LLMs). 
However, existing LLM serving engines\nexecuting multi-turn conversations are inefficient due to the need to\nrepeatedly compute the key-value (KV) caches of historical tokens, incurring\nhigh serving costs. To address the problem, this paper proposes\nCachedAttention, a new attention mechanism that enables reuse of KV caches\nacross multi-turn conversations, significantly reducing the repetitive\ncomputation overheads. CachedAttention maintains a hierarchical KV caching\nsystem that leverages cost-effective memory/storage mediums to save KV caches\nfor all requests. To reduce KV cache access overheads from slow mediums,\nCachedAttention employs layer-wise pre-loading and asynchronous saving schemes\nto overlap the KV cache access with the GPU computation. To ensure that the KV\ncaches to be accessed are placed in the fastest hierarchy, CachedAttention\nemploys scheduler-aware fetching and eviction schemes to consciously place the\nKV caches in different layers based on the hints from the inference job\nscheduler. To avoid the invalidation of the saved KV caches incurred by context\nwindow overflow, CachedAttention enables the saved KV caches to remain valid\nvia decoupling the positional encoding and effectively truncating the KV\ncaches. Extensive experimental results demonstrate that CachedAttention\nsignificantly decreases the time to the first token (TTFT) by up to 87%,\nimproves the prompt prefilling throughput by up to 7.8$\\times$ for multi-turn\nconversations, and reduces the end-to-end inference cost by up to 70%.\n","authors":["Bin Gao","Zhuomin He","Puru Sharma","Qingxuan Kang","Djordje Jevdjic","Junbo Deng","Xingkun Yang","Zhou Yu","Pengfei Zuo"],"pdf_url":"https://arxiv.org/pdf/2403.19708v3.pdf","comment":"Accepted to USENIX Annual Technical Conference (ATC) 2024"},{"id":"http://arxiv.org/abs/2311.12023v3","updated":"2024-06-30T22:43:35Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. 
Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17911v2","updated":"2024-06-30T21:53:56Z","published":"2024-06-25T19:52:01Z","title":"X-ray Made Simple: Radiology Report Generation and Evaluation with\n Layman's Terms","summary":" Radiology Report Generation (RRG) has achieved significant progress with the\nadvancements of multimodal generative models. However, the evaluation in the\ndomain suffers from a lack of fair and robust metrics. We reveal that, high\nperformance on RRG with existing lexical-based metrics (e.g. BLEU) might be\nmore of a mirage - a model can get a high BLEU only by learning the template of\nreports. This has become an urgent problem for RRG due to the highly\npatternized nature of these reports. In this work, we un-intuitively approach\nthis problem by proposing the Layman's RRG framework, a layman's terms-based\ndataset, evaluation and training framework that systematically improves RRG\nwith day-to-day language. We first contribute the translated Layman's terms\ndataset. Building upon the dataset, we then propose a semantics-based\nevaluation method, which is proved to mitigate the inflated numbers of BLEU and\nprovides fairer evaluation. Last, we show that training on the layman's terms\ndataset encourages models to focus on the semantics of the reports, as opposed\nto overfitting to learning the report templates. We reveal a promising scaling\nlaw between the number of training examples and semantics gain provided by our\ndataset, compared to the inverse pattern brought by the original formats. Our\ncode is available at \\url{https://github.com/hegehongcha/LaymanRRG}.\n","authors":["Kun Zhao","Chenghao Xiao","Chen Tang","Bohao Yang","Kai Ye","Noura Al Moubayed","Liang Zhan","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2406.17911v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17962v2","updated":"2024-06-30T21:15:47Z","published":"2024-06-25T22:44:17Z","title":"SimsChat: A Customisable Persona-Driven Role-Playing Agent","summary":" Large Language Models (LLMs) possess the remarkable capability to understand\nhuman instructions and generate high-quality text, enabling them to act as\nagents that simulate human behaviours. This capability allows LLMs to emulate\nhuman beings in a more advanced manner, beyond merely replicating simple human\nbehaviours. However, there is a lack of exploring into leveraging LLMs to craft\ncharacters from several aspects. In this work, we introduce the Customisable\nConversation Agent Framework, which employs LLMs to simulate real-world\ncharacters that can be freely customised according to different user\npreferences. The customisable framework is helpful for designing customisable\ncharacters and role-playing agents according to human's preferences. We first\npropose the SimsConv dataset, which comprises 68 different customised\ncharacters, 1,360 multi-turn role-playing dialogues, and encompasses 13,971\ninteraction dialogues in total. The characters are created from several\nreal-world elements, such as career, aspiration, trait, and skill. Building on\nthese foundations, we present SimsChat, a freely customisable role-playing\nagent. It incorporates different real-world scenes and topic-specific character\ninteraction dialogues, simulating characters' life experiences in various\nscenarios and topic-specific interactions with specific emotions. 
Experimental\nresults show that our proposed framework achieves desirable performance and\nprovides helpful guideline for building better simulacra of human beings in the\nfuture. Our data and code are available at\nhttps://github.com/Bernard-Yang/SimsChat.\n","authors":["Bohao Yang","Dong Liu","Chen Tang","Chenghao Xiao","Kun Zhao","Chao Li","Lin Yuan","Guang Yang","Lanxiao Huang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2406.17962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19721v2","updated":"2024-06-30T20:54:57Z","published":"2024-04-30T17:11:54Z","title":"PANGeA: Procedural Artificial Narrative using Generative AI for\n Turn-Based Video Games","summary":" This research introduces Procedural Artificial Narrative using Generative AI\n(PANGeA), a structured approach for leveraging large language models (LLMs),\nguided by a game designer's high-level criteria, to generate narrative content\nfor turn-based role-playing video games (RPGs). Distinct from prior\napplications of LLMs used for video game design, PANGeA innovates by not only\ngenerating game level data (which includes, but is not limited to, setting, key\nitems, and non-playable characters (NPCs)), but by also fostering dynamic,\nfree-form interactions between the player and the environment that align with\nthe procedural game narrative. The NPCs generated by PANGeA are\npersonality-biased and express traits from the Big 5 Personality Model in their\ngenerated responses. PANGeA addresses challenges behind ingesting free-form\ntext input, which can prompt LLM responses beyond the scope of the game\nnarrative. A novel validation system that uses the LLM's intelligence evaluates\ntext input and aligns generated responses with the unfolding narrative. Making\nthese interactions possible, PANGeA is supported by a server that hosts a\ncustom memory system that supplies context for augmenting generated responses\nthus aligning them with the procedural narrative. For its broad application,\nthe server has a REST interface enabling any game engine to integrate directly\nwith PANGeA, as well as an LLM interface adaptable with local or private LLMs.\nPANGeA's ability to foster dynamic narrative generation by aligning responses\nwith the procedural narrative is demonstrated through an empirical study and\nablation test of two versions of a demo game. These are, a custom,\nbrowser-based GPT and a Unity demo. As the results show, PANGeA holds potential\nto assist game designers in using LLMs to generate narrative-consistent content\neven when provided varied and unpredictable, free-form text input.\n","authors":["Steph Buongiorno","Lawrence Jake Klinkert","Tanishq Chawla","Zixin Zhuang","Corey Clark"],"pdf_url":"https://arxiv.org/pdf/2404.19721v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04886v2","updated":"2024-06-30T18:53:22Z","published":"2023-11-08T18:46:32Z","title":"SEMQA: Semi-Extractive Multi-Source Question Answering","summary":" Recently proposed long-form question answering (QA) systems, supported by\nlarge language models (LLMs), have shown promising capabilities. 
Yet,\nattributing and verifying their generated abstractive answers can be difficult,\nand automatically evaluating their accuracy remains an ongoing challenge.\n In this work, we introduce a new QA task for answering multi-answer questions\nby summarizing multiple diverse sources in a semi-extractive fashion.\nSpecifically, Semi-extractive Multi-source QA (SEMQA) requires models to output\na comprehensive answer, while mixing factual quoted spans -- copied verbatim\nfrom given input sources -- and non-factual free-text connectors that glue\nthese spans together into a single cohesive passage. This setting bridges the\ngap between the outputs of well-grounded but constrained extractive QA systems\nand more fluent but harder to attribute fully abstractive answers.\nParticularly, it enables a new mode for language models that leverages their\nadvanced language generation capabilities, while also producing fine in-line\nattributions by-design that are easy to verify, interpret, and evaluate.\n To study this task, we create the first dataset of this kind, QuoteSum, with\nhuman-written semi-extractive answers to natural and generated questions, and\ndefine text-based evaluation metrics. Experimenting with several LLMs in\nvarious settings, we find this task to be surprisingly challenging,\ndemonstrating the importance of QuoteSum for developing and studying such\nconsolidation capabilities.\n","authors":["Tal Schuster","Adam D. Lelkes","Haitian Sun","Jai Gupta","Jonathan Berant","William W. Cohen","Donald Metzler"],"pdf_url":"https://arxiv.org/pdf/2311.04886v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2406.07588v2","updated":"2024-06-30T18:19:25Z","published":"2024-06-11T08:12:43Z","title":"AIM: Let Any Multi-modal Large Language Models Embrace Efficient\n In-Context Learning","summary":" In-context learning (ICL) facilitates Large Language Models (LLMs) exhibiting\nemergent ability on downstream tasks without updating billions of parameters.\nHowever, in the area of multi-modal Large Language Models (MLLMs), two problems\nhinder the application of multi-modal ICL: (1) Most primary MLLMs are only\ntrained on single-image datasets, making them unable to read multi-modal\ndemonstrations. (2) With the demonstrations increasing, thousands of visual\ntokens highly challenge hardware and degrade ICL performance. During\npreliminary explorations, we discovered that the inner LLM tends to focus more\non the linguistic modality within multi-modal demonstrations to generate\nresponses. Therefore, we propose a general and light-weighted framework\n\\textbf{AIM} to tackle the mentioned problems through \\textbf{A}ggregating\n\\textbf{I}mage information of \\textbf{M}ultimodal demonstrations to the dense\nlatent space of the corresponding linguistic part. Specifically, AIM first uses\nthe frozen backbone MLLM to read each image-text demonstration and extracts the\nvector representations on top of the text. These vectors naturally fuse the\ninformation of the image-text pair, and AIM transforms them into fused virtual\ntokens acceptable for the inner LLM via a trainable projection layer.\nUltimately, these fused tokens function as variants of multi-modal\ndemonstrations, fed into the MLLM to direct its response to the current query\nas usual. Because these fused tokens stem from the textual component of the\nimage-text pair, a multi-modal demonstration is nearly reduced to a pure\ntextual demonstration, thus seamlessly applying to any MLLMs. 
With its de facto\nMLLM frozen, AIM is parameter-efficient and we train it on public multi-modal\nweb corpora which have nothing to do with downstream test tasks.\n","authors":["Jun Gao","Qian Qiao","Ziqiang Cao","Zili Wang","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2406.07588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12036v4","updated":"2024-06-30T15:12:10Z","published":"2024-06-17T19:07:21Z","title":"MedCalc-Bench: Evaluating Large Language Models for Medical Calculations","summary":" As opposed to evaluating computation and logic-based reasoning, current\nbenchmarks for evaluating large language models (LLMs) in medicine are\nprimarily focused on question-answering involving domain knowledge and\ndescriptive reasoning. While such qualitative capabilities are vital to medical\ndiagnosis, in real-world scenarios, doctors frequently use clinical calculators\nthat follow quantitative equations and rule-based reasoning paradigms for\nevidence-based decision support. To this end, we propose MedCalc-Bench, a\nfirst-of-its-kind dataset focused on evaluating the medical calculation\ncapability of LLMs. MedCalc-Bench contains an evaluation set of over 1000\nmanually reviewed instances from 55 different medical calculation tasks. Each\ninstance in MedCalc-Bench consists of a patient note, a question requesting to\ncompute a specific medical value, a ground truth answer, and a step-by-step\nexplanation showing how the answer is obtained. While our evaluation results\nshow the potential of LLMs in this area, none of them are effective enough for\nclinical settings. Common issues include extracting the incorrect entities, not\nusing the correct equation or rules for a calculation task, or incorrectly\nperforming the arithmetic for the computation. We hope our study highlights the\nquantitative knowledge and reasoning gaps in LLMs within medical settings,\nencouraging future improvements of LLMs for various clinical calculation tasks.\n","authors":["Nikhil Khandekar","Qiao Jin","Guangzhi Xiong","Soren Dunn","Serina S Applebaum","Zain Anwar","Maame Sarfo-Gyamfi","Conrad W Safranek","Abid A Anwar","Andrew Zhang","Aidan Gilson","Maxwell B Singer","Amisha Dave","Andrew Taylor","Aidong Zhang","Qingyu Chen","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2406.12036v4.pdf","comment":"Github link: https://github.com/ncbi-nlp/MedCalc-Bench HuggingFace\n link: https://huggingface.co/datasets/nsk7153/MedCalc-Bench"},{"id":"http://arxiv.org/abs/2406.16858v2","updated":"2024-06-30T15:03:25Z","published":"2024-06-24T17:59:11Z","title":"EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees","summary":" Inference with modern Large Language Models (LLMs) is expensive and\ntime-consuming, and speculative sampling has proven to be an effective\nsolution. Most speculative sampling methods such as EAGLE use a static draft\ntree, implicitly assuming that the acceptance rate of draft tokens depends only\non their position. Interestingly, we found that the acceptance rate of draft\ntokens is also context-dependent. In this paper, building upon EAGLE, we\npropose EAGLE-2, which introduces a new technique of context-aware dynamic\ndraft tree into drafting modeling. This improvement leverages the fact that the\ndraft model of EAGLE is well-calibrated: the confidence scores from the draft\nmodel approximate acceptance rates with small errors. 
We conducted extensive\nevaluations on three series of LLMs and six tasks, with EAGLE-2 achieving\nspeedup ratios 3.05x-4.26x, which is 20%-40% faster than EAGLE-1. EAGLE-2 also\nensures that the distribution of the generated text remains unchanged, making\nit a lossless acceleration algorithm.\n","authors":["Yuhui Li","Fangyun Wei","Chao Zhang","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15319v2","updated":"2024-06-30T15:01:36Z","published":"2024-06-21T17:23:21Z","title":"LongRAG: Enhancing Retrieval-Augmented Generation with Long-context LLMs","summary":" In traditional RAG framework, the basic retrieval units are normally short.\nThe common retrievers like DPR normally work with 100-word Wikipedia\nparagraphs. Such a design forces the retriever to search over a large corpus to\nfind the `needle' unit. In contrast, the readers only need to extract answers\nfrom the short retrieved units. Such an imbalanced `heavy' retriever and\n`light' reader design can lead to sub-optimal performance. In order to\nalleviate the imbalance, we propose a new framework LongRAG, consisting of a\n`long retriever' and a `long reader'. LongRAG processes the entire Wikipedia\ninto 4K-token units, which is 30x longer than before. By increasing the unit\nsize, we significantly reduce the total units from 22M to 700K. This\nsignificantly lowers the burden of retriever, which leads to a remarkable\nretrieval score: answer recall@1=71% on NQ (previously 52%) and answer\nrecall@2=72% (previously 47%) on HotpotQA (full-wiki). Then we feed the top-k\nretrieved units ($\\approx$ 30K tokens) to an existing long-context LLM to\nperform zero-shot answer extraction. Without requiring any training, LongRAG\nachieves an EM of 62.7% on NQ, which is the best known result. LongRAG also\nachieves 64.3% on HotpotQA (full-wiki), which is on par of the SoTA model. Our\nstudy offers insights into the future roadmap for combining RAG with\nlong-context LLMs.\n","authors":["Ziyan Jiang","Xueguang Ma","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2406.15319v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2406.11201v2","updated":"2024-06-30T14:42:52Z","published":"2024-06-17T04:35:17Z","title":"Fine-Tuning or Fine-Failing? Debunking Performance Myths in Large\n Language Models","summary":" Large Language Models (LLMs) have the unique capability to understand and\ngenerate human-like text from input queries. When fine-tuned, these models show\nenhanced performance on domain-specific queries. OpenAI highlights the process\nof fine-tuning, stating: \"To fine-tune a model, you are required to provide at\nleast 10 examples. We typically see clear improvements from fine-tuning on 50\nto 100 training examples, but the right number varies greatly based on the\nexact use case.\" This study extends this concept to the integration of LLMs\nwithin Retrieval-Augmented Generation (RAG) pipelines, which aim to improve\naccuracy and relevance by leveraging external corpus data for information\nretrieval. However, RAG's promise of delivering optimal responses often falls\nshort in complex query scenarios. This study aims to specifically examine the\neffects of fine-tuning LLMs on their ability to extract and integrate\ncontextual data to enhance the performance of RAG systems across multiple\ndomains. 
We evaluate the impact of fine-tuning on the LLMs' capacity for data\nextraction and contextual understanding by comparing the accuracy and\ncompleteness of fine-tuned models against baseline performances across datasets\nfrom multiple domains. Our findings indicate that fine-tuning resulted in a\ndecline in performance compared to the baseline models, contrary to the\nimprovements observed in standalone LLM applications as suggested by OpenAI.\nThis study highlights the need for vigorous investigation and validation of\nfine-tuned models for domain-specific tasks.\n","authors":["Scott Barnett","Zac Brannelly","Stefanus Kurniawan","Sheng Wong"],"pdf_url":"https://arxiv.org/pdf/2406.11201v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.14024v2","updated":"2024-06-30T13:44:04Z","published":"2024-06-20T06:42:27Z","title":"LLM Critics Help Catch Bugs in Mathematics: Towards a Better\n Mathematical Verifier with Natural Language Feedback","summary":" Mathematical verfier achieves success in mathematical reasoning tasks by\nvalidating the correctness of solutions. However, existing verifiers are\ntrained with binary classification labels, which are not informative enough for\nthe model to accurately assess the solutions. To mitigate the aforementioned\ninsufficiency of binary labels, we introduce step-wise natural language\nfeedbacks as rationale labels (i.e., the correctness of the current step and\nthe explanations). In this paper, we propose \\textbf{Math-Minos}, a natural\nlanguage feedback enhanced verifier by constructing automatically-generated\ntraining data and a two-stage training paradigm for effective training and\nefficient inference. Our experiments reveal that a small set (30k) of natural\nlanguage feedbacks can significantly boost the performance of the verifier by\nthe accuracy of 1.6\\% (86.6\\% $\\rightarrow$ 88.2\\%) on GSM8K and 0.8\\% (37.8\\%\n$\\rightarrow$ 38.6\\%) on MATH. We have released our code and data for further\nexploration.\n","authors":["Bofei Gao","Zefan Cai","Runxin Xu","Peiyi Wang","Ce Zheng","Runji Lin","Keming Lu","Junyang Lin","Chang Zhou","Wen Xiao","Junjie Hu","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2406.14024v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2406.11629v3","updated":"2024-06-30T13:31:24Z","published":"2024-06-17T15:11:58Z","title":"Can Many-Shot In-Context Learning Help Long-Context LLM Judges? See\n More, Judge Better!","summary":" Leveraging Large Language Models (LLMs) as judges for judging the performance\nof LLMs has recently garnered attention. However, this type of approach is\naffected by the potential biases in LLMs, raising concerns about the\nreliability of the evaluation results. To mitigate this issue, we propose and\nstudy two versions of many-shot in-context prompts, which rely on two existing\nsettings of many-shot ICL for helping GPT-4o-as-a-Judge in single answer\ngrading to mitigate the potential biases in LLMs, Reinforced ICL and\nUnsupervised ICL. Concretely, the former utilizes in-context examples with\nmodel-generated rationales, and the latter without. Based on the designed\nprompts, we investigate the impact of scaling the number of in-context examples\non the consistency and quality of the judgment results. Furthermore, we reveal\nthe symbol bias hidden in the pairwise comparison of GPT-4o-as-a-Judge and\npropose a simple yet effective approach to mitigate it. 
Experimental results\nshow that advanced long-context LLMs, such as GPT-4o, perform better in the\nmany-shot regime than in the zero-shot regime. Meanwhile, the experimental\nresults further verify the effectiveness of the symbol bias mitigation\napproach.\n","authors":["Mingyang Song","Mao Zheng","Xuan Luo"],"pdf_url":"https://arxiv.org/pdf/2406.11629v3.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2406.14314v2","updated":"2024-06-30T12:33:48Z","published":"2024-06-20T13:46:10Z","title":"Identifying User Goals from UI Trajectories","summary":" Autonomous agents that interact with graphical user interfaces (GUIs) hold\nsignificant potential for enhancing user experiences. To further improve these\nexperiences, agents need to be personalized and proactive. By effectively\ncomprehending user intentions through their actions and interactions with GUIs,\nagents will be better positioned to achieve these goals. This paper introduces\nthe task of goal identification from observed UI trajectories, aiming to infer\nthe user's intended task based on their GUI interactions. We propose a novel\nevaluation metric to assess whether two task descriptions are paraphrases\nwithin a specific UI environment. By Leveraging the inverse relation with the\nUI automation task, we utilized the Android-In-The-Wild and Mind2Web datasets\nfor our experiments. Using our metric and these datasets, we conducted several\nexperiments comparing the performance of humans and state-of-the-art models,\nspecifically GPT-4 and Gemini-1.5 Pro. Our results show that Gemini performs\nbetter than GPT but still underperforms compared to humans, indicating\nsignificant room for improvement.\n","authors":["Omri Berkovitch","Sapir Caduri","Noam Kahlon","Anatoly Efros","Avi Caciularu","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2406.14314v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17524v3","updated":"2024-06-30T10:58:28Z","published":"2024-04-26T16:41:00Z","title":"On the Use of Large Language Models to Generate Capability Ontologies","summary":" Capability ontologies are increasingly used to model functionalities of\nsystems or machines. The creation of such ontological models with all\nproperties and constraints of capabilities is very complex and can only be done\nby ontology experts. However, Large Language Models (LLMs) have shown that they\ncan generate machine-interpretable models from natural language text input and\nthus support engineers / ontology experts. Therefore, this paper investigates\nhow LLMs can be used to create capability ontologies. We present a study with a\nseries of experiments in which capabilities with varying complexities are\ngenerated using different prompting techniques and with different LLMs. Errors\nin the generated ontologies are recorded and compared. To analyze the quality\nof the generated ontologies, a semi-automated approach based on RDF syntax\nchecking, OWL reasoning, and SHACL constraints is used. 
The results of this\nstudy are very promising because even for complex capabilities, the generated\nontologies are almost free of errors.\n","authors":["Luis Miguel Vieira da Silva","Aljosha Köcher","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2404.17524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00376v2","updated":"2024-06-30T09:24:12Z","published":"2024-03-30T14:09:00Z","title":"Small Language Models Learn Enhanced Reasoning Skills from Medical\n Textbooks","summary":" While recent advancements in commercial large language models (LM) have shown\npromising results in medical tasks, their closed-source nature poses\nsignificant privacy and security concerns, hindering their widespread use in\nthe medical field. Despite efforts to create open-source models, their limited\nparameters often result in insufficient multi-step reasoning capabilities\nrequired for solving complex medical problems. To address this, we introduce\nMeerkat, a new family of medical AI systems ranging from 7 to 70 billion\nparameters. The models were trained using our new synthetic dataset consisting\nof high-quality chain-of-thought reasoning paths sourced from 18 medical\ntextbooks, along with diverse instruction-following datasets. Our systems\nachieved remarkable accuracy across six medical benchmarks, surpassing the\nprevious best models such as MediTron and BioMistral, and GPT-3.5 by a large\nmargin. Notably, Meerkat-7B surpassed the passing threshold of the United\nStates Medical Licensing Examination (USMLE) for the first time for a\n7B-parameter model, while Meerkat-70B outperformed GPT-4 by an average of 1.3%.\nAdditionally, Meerkat-70B correctly diagnosed 21 out of 38 complex clinical\ncases, outperforming humans' 13.8 and closely matching GPT-4's 21.8. Our\nsystems offered more detailed free-form responses to clinical queries compared\nto existing small models, approaching the performance level of large commercial\nmodels. This significantly narrows the performance gap with large LMs,\nshowcasing its effectiveness in addressing complex medical challenges.\n","authors":["Hyunjae Kim","Hyeon Hwang","Jiwoo Lee","Sihyeon Park","Dain Kim","Taewhoo Lee","Chanwoong Yoon","Jiwoong Sohn","Donghee Choi","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2404.00376v2.pdf","comment":"Added new LLaMA-3-based models and experiments on NEJM case\n challenges"},{"id":"http://arxiv.org/abs/2406.10621v2","updated":"2024-06-30T09:02:34Z","published":"2024-06-15T12:48:00Z","title":"StrucText-Eval: An Autogenerated Benchmark for Evaluating Large Language\n Model's Ability in Structure-Rich Text Understanding","summary":" Given the substantial volumes of structured data held by many companies,\nenabling Large Language Models (LLMs) to directly understand structured text in\nnon-structured forms could significantly enhance their capabilities across\nvarious business scenarios. To this end, we propose evaluation data generation\nmethod for assessing LLM's ability in understanding the structure-rich text,\nwhich generates structured data of controllable complexity based on manually\ncrafted question templates and generation rules. Building on this generation\nmethod, we introduce StrucText-Eval, a benchmark comprising 6,032 questions\nacross 8 different structured languages and 29 specific tasks. Furthermore,\nconsidering human proficiency in rule-based tasks, we also present\nStrucText-Eval-Hard, which includes 3,016 questions designed to further examine\nthe gap between LLMs and human performance. 
Results indicate that the\nbest-performing LLM currently achieves an accuracy of 65.0\% on\nStrucText-Eval-Hard, while human accuracy reaches up to 95.7\%. Moreover, while\nfine-tuning using StrucText-Eval can enhance existing LLMs' understanding of\nall structured languages, it does not necessarily improve performance across\nall task types. The benchmark and generation code are open-sourced at\nhttps://github.com/MikeGu721/StrucText-Eval\n","authors":["Zhouhong Gu","Haoning Ye","Zeyang Zhou","Hongwei Feng","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.10621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14825v3","updated":"2024-06-30T08:54:37Z","published":"2024-06-21T01:52:37Z","title":"TemPrompt: Multi-Task Prompt Learning for Temporal Relation Extraction\n in RAG-based Crowdsourcing Systems","summary":" Temporal relation extraction (TRE) aims to grasp the evolution of events or\nactions, and thus shape the workflow of associated tasks, so it holds promise\nin helping understand task requests initiated by requesters in crowdsourcing\nsystems. However, existing methods still struggle with limited and unevenly\ndistributed annotated data. Therefore, inspired by the abundant global\nknowledge stored within pre-trained language models (PLMs), we propose a\nmulti-task prompt learning framework for TRE (TemPrompt), incorporating prompt\ntuning and contrastive learning to tackle these issues. To elicit more\neffective prompts for PLMs, we introduce a task-oriented prompt construction\napproach that thoroughly takes the myriad factors of TRE into consideration for\nautomatic prompt generation. In addition, we present temporal event reasoning\nas a supplement to bolster the model's focus on events and temporal cues. The\nexperimental results demonstrate that TemPrompt outperforms all compared\nbaselines across the majority of metrics under both standard and few-shot\nsettings. A case study is provided to validate its effectiveness in\ncrowdsourcing scenarios.\n","authors":["Jing Yang","Yu Zhao","Yang Linyao","Xiao Wang","Long Chen","Fei-Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2406.14825v3.pdf","comment":"I submitted the manuscript without obtaining consent from all\n co-authors"},{"id":"http://arxiv.org/abs/2405.13144v2","updated":"2024-06-30T05:42:24Z","published":"2024-05-21T18:29:54Z","title":"Mamo: a Mathematical Modeling Benchmark with Solvers","summary":" Mathematical modeling involves representing real-world phenomena, systems, or\nproblems using mathematical expressions and equations to analyze, understand,\nand predict their behavior. Given that this process typically requires\nexperienced experts, there is an interest in exploring whether Large Language\nModels (LLMs) can undertake mathematical modeling to potentially decrease human\nlabor. To evaluate LLMs in mathematical modeling, we introduce a new\nbenchmark, Mamo, that transcends traditional result-oriented assessments.\nUnlike conventional methods that primarily assess LLMs based on the accuracy of\nsolutions to mathematical problems, our approach offers deeper insight into the\nmodeling process itself. By focusing on the processes LLMs undertake rather\nthan the correctness of their final solutions, Mamo pioneers a novel evaluation\nparadigm. This shift underscores the importance of understanding the inherent\nmodeling capabilities of LLMs, paving the way for a more nuanced and\ncomprehensive analysis of their problem-solving strategies. 
Our work marks a\nsignificant advancement in the field, suggesting a new direction for future\nresearch by emphasizing the evaluation of LLMs' modeling processes over the\nmere correctness of answers. This benchmark not only facilitates a better\nunderstanding of LLMs' mathematical modeling capabilities but also sets a new\nstandard for evaluating their performance in complex problem-solving scenarios.\n","authors":["Xuhan Huang","Qingning Shen","Yan Hu","Anningzhe Gao","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2405.13144v2.pdf","comment":"Project: https://github.com/FreedomIntelligence/Mamo Updates: 1.\n include more models 2. minor modification of the metric with new results 3.\n fix some typos 4. add error analysis with examples"},{"id":"http://arxiv.org/abs/2406.00040v2","updated":"2024-06-30T04:37:19Z","published":"2024-05-27T16:26:50Z","title":"Unveiling Themes in Judicial Proceedings: A Cross-Country Study Using\n Topic Modeling on Legal Documents from India and the UK","summary":" Legal documents are indispensable in every country for legal practices and\nserve as the primary source of information regarding previous cases and\nemployed statutes. In today's world, with an increasing number of judicial\ncases, it is crucial to systematically categorize past cases into subgroups,\nwhich can then be utilized for upcoming cases and practices. Our primary focus\nin this endeavor was to annotate cases using topic modeling algorithms such as\nLatent Dirichlet Allocation, Non-Negative Matrix Factorization, and Bertopic\nfor a collection of lengthy legal documents from India and the UK. This step is\ncrucial for distinguishing the generated labels between the two countries,\nhighlighting the differences in the types of cases that arise in each\njurisdiction. Furthermore, an analysis of the timeline of cases from India was\nconducted to discern the evolution of dominant topics over the years.\n","authors":["Krish Didwania","Dr. Durga Toshniwal","Amit Agarwal"],"pdf_url":"https://arxiv.org/pdf/2406.00040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16789v2","updated":"2024-06-30T02:19:00Z","published":"2024-04-25T17:38:57Z","title":"Continual Learning of Large Language Models: A Comprehensive Survey","summary":" The recent success of large language models (LLMs) trained on static,\npre-collected, general datasets has sparked numerous research directions and\napplications. One such direction addresses the non-trivial challenge of\nintegrating pre-trained LLMs into dynamic data distributions, task structures,\nand user preferences. Pre-trained LLMs, when tailored for specific needs, often\nexperience significant performance degradation in previous knowledge domains --\na phenomenon known as \"catastrophic forgetting\". While extensively studied in\nthe continual learning (CL) community, it presents new manifestations in the\nrealm of LLMs. In this survey, we provide a comprehensive overview of the\ncurrent research progress on LLMs within the context of CL. This survey is\nstructured into four main sections: we first describe an overview of\ncontinually learning LLMs, consisting of two directions of continuity: vertical\ncontinuity (or vertical continual learning), i.e., continual adaptation from\ngeneral to specific capabilities, and horizontal continuity (or horizontal\ncontinual learning), i.e., continual adaptation across time and domains\n(Section 3). 
We then summarize three stages of learning LLMs in the context of\nmodern CL: Continual Pre-Training (CPT), Domain-Adaptive Pre-training (DAP),\nand Continual Fine-Tuning (CFT) (Section 4). Then we provide an overview of\nevaluation protocols for continual learning with LLMs, along with the current\navailable data sources (Section 5). Finally, we discuss intriguing questions\npertaining to continual learning for LLMs (Section 6). The full list of papers\nexamined in this survey is available at\nhttps://github.com/Wang-ML-Lab/llm-continual-learning-survey.\n","authors":["Haizhou Shi","Zihao Xu","Hengyi Wang","Weiyi Qin","Wenyuan Wang","Yibin Wang","Zifeng Wang","Sayna Ebrahimi","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16789v2.pdf","comment":"47 pages, 2 figures, 4 tables. Work in progress"},{"id":"http://arxiv.org/abs/2401.13170v4","updated":"2024-06-30T02:09:46Z","published":"2024-01-24T01:30:25Z","title":"CFMatch: Aligning Automated Answer Equivalence Evaluation with Expert\n Judgments For Open-Domain Question Answering","summary":" Question answering (QA) can only make progress if we know if an answer is\ncorrect, but for many of the most challenging and interesting QA examples,\ncurrent evaluation metrics to determine answer equivalence (AE) often do not\nalign with human judgments, particularly more verbose, free-form answers from\nlarge language models (LLM). There are two challenges: a lack of data and that\nmodels are too big: LLM-based scorers can correlate better with human judges,\nbut this task has only been tested on limited QA datasets, and even when\navailable, update of the model is limited because LLMs are large and often\nexpensive. We rectify both of these issues by providing clear and consistent\nguidelines for evaluating AE in machine QA adopted from professional human QA\ncontests. We also introduce a combination of standard evaluation and a more\nefficient, robust, and lightweight discriminate AE classifier-based matching\nmethod (CFMatch, smaller than 1 MB), trained and validated to more accurately\nevaluate answer correctness in accordance with adopted expert AE rules that are\nmore aligned with human judgments.\n","authors":["Zongxia Li","Ishani Mondal","Yijun Liang","Huy Nghiem","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2401.13170v4.pdf","comment":"A duplicate and polished version is in arXiv:2402.11161"},{"id":"http://arxiv.org/abs/2406.13173v2","updated":"2024-06-30T01:22:09Z","published":"2024-06-19T03:07:33Z","title":"Biomedical Visual Instruction Tuning with Clinician Preference Alignment","summary":" Recent advancements in multimodal foundation models have showcased impressive\ncapabilities in understanding and reasoning with visual and textual\ninformation. Adapting these foundation models trained for general usage to\nspecialized domains like biomedicine requires large-scale domain-specific\ninstruction datasets. While existing works have explored curating such datasets\nautomatically, the resultant datasets are not explicitly aligned with domain\nexpertise. In this work, we propose a data-centric framework, Biomedical Visual\nInstruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that\nincorporates clinician preferences into both stages of generating and selecting\ninstruction data for tuning biomedical multimodal foundation models. First,\nduring the generation stage, we prompt the GPT-4V generator with a diverse set\nof clinician-selected demonstrations for preference-aligned data candidate\ngeneration. 
Then, during the selection phase, we train a separate selection\nmodel, which explicitly distills clinician and policy-guided model preferences\ninto a rating function to select high-quality data for medical instruction\ntuning. Results show that the model tuned with the instruction-following data\nfrom our method demonstrates a significant improvement in open visual chat\n(18.5% relatively) and medical VQA (win rate up to 81.73%). Our\ninstruction-following data and models are available at BioMed-VITAL.github.io.\n","authors":["Hejie Cui","Lingjun Mao","Xin Liang","Jieyu Zhang","Hui Ren","Quanzheng Li","Xiang Li","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2406.13173v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2208.00287v4","updated":"2024-06-30T22:46:54Z","published":"2022-07-30T18:29:11Z","title":"Simplex Clustering via sBeta with Applications to Online Adjustment of\n Black-Box Predictions","summary":" We explore clustering the softmax predictions of deep neural networks and\nintroduce a novel probabilistic clustering method, referred to as k-sBetas. In\nthe general context of clustering discrete distributions, the existing methods\nfocused on exploring distortion measures tailored to simplex data, such as the\nKL divergence, as alternatives to the standard Euclidean distance. We provide a\ngeneral maximum a posteriori (MAP) perspective of clustering distributions,\nemphasizing that the statistical models underlying the existing\ndistortion-based methods may not be descriptive enough. Instead, we optimize a\nmixed-variable objective measuring data conformity within each cluster to the\nintroduced sBeta density function, whose parameters are constrained and\nestimated jointly with binary assignment variables. Our versatile formulation\napproximates various parametric densities for modeling simplex data and enables\nthe control of the cluster-balance bias. This yields highly competitive\nperformances for the unsupervised adjustment of black-box model predictions in\nvarious scenarios. Our code and comparisons with the existing\nsimplex-clustering approaches and our introduced softmax-prediction benchmarks\nare publicly available:\nhttps://github.com/fchiaroni/Clustering_Softmax_Predictions.\n","authors":["Florent Chiaroni","Malik Boudiaf","Amar Mitiche","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2208.00287v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01568v4","updated":"2024-06-30T20:40:24Z","published":"2024-04-02T02:01:21Z","title":"A Linear Time and Space Local Point Cloud Geometry Encoder via\n Vectorized Kernel Mixture (VecKM)","summary":" We propose VecKM, a local point cloud geometry encoder that is descriptive\nand efficient to compute. VecKM leverages a unique approach by vectorizing a\nkernel mixture to represent the local point cloud. Such representation's\ndescriptiveness is supported by two theorems that validate its ability to\nreconstruct and preserve the similarity of the local shape. Unlike existing\nencoders downsampling the local point cloud, VecKM constructs the local\ngeometry encoding using all neighboring points, producing a more descriptive\nencoding. 
Moreover, VecKM is efficient to compute and scalable to large point\ncloud inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and\nreduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$\nis the size of the point cloud, $K$ is the neighborhood size, $d$ is the\nencoding dimension, and $p$ is a marginal factor. The efficiency is due to\nVecKM's unique factorizable property that eliminates the need of explicitly\ngrouping points into neighbors. In the normal estimation task, VecKM\ndemonstrates not only 100x faster inference speed but also highest accuracy and\nstrongest robustness. In classification and segmentation tasks, integrating\nVecKM as a preprocessing module achieves consistently better performance than\nthe PointNet, PointNet++, and point transformer baselines, and runs\nconsistently faster by up to 10 times.\n","authors":["Dehao Yuan","Cornelia Fermüller","Tahseen Rabbani","Furong Huang","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2404.01568v4.pdf","comment":"ICML2024 Conference Paper"},{"id":"http://arxiv.org/abs/2403.20309v2","updated":"2024-06-30T19:47:58Z","published":"2024-03-29T17:29:58Z","title":"InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40\n Seconds","summary":" While novel view synthesis (NVS) from a sparse set of images has advanced\nsignificantly in 3D computer vision, it relies on precise initial estimation of\ncamera parameters using Structure-from-Motion (SfM). For instance, the recently\ndeveloped Gaussian Splatting depends heavily on the accuracy of SfM-derived\npoints and poses. However, SfM processes are time-consuming and often prove\nunreliable in sparse-view scenarios, where matched features are scarce, leading\nto accumulated errors and limited generalization capability across datasets. In\nthis study, we introduce a novel and efficient framework to enhance robust NVS\nfrom sparse-view images. Our framework, InstantSplat, integrates multi-view\nstereo(MVS) predictions with point-based representations to construct 3D\nGaussians of large-scale scenes from sparse-view data within seconds,\naddressing the aforementioned performance and efficiency issues by SfM.\nSpecifically, InstantSplat generates densely populated surface points across\nall training views and determines the initial camera parameters using\npixel-alignment. Nonetheless, the MVS points are not globally accurate, and the\npixel-wise prediction from all views results in an excessive Gaussian number,\nyielding a overparameterized scene representation that compromises both\ntraining speed and accuracy. To address this issue, we employ a grid-based,\nconfidence-aware Farthest Point Sampling to strategically position point\nprimitives at representative locations in parallel. Next, we enhance pose\naccuracy and tune scene parameters through a gradient-based joint optimization\nframework from self-supervision. 
By employing this simplified framework,\nInstantSplat achieves a substantial reduction in training time, from hours to\nmere seconds, and demonstrates robust performance across various numbers of\nviews in diverse datasets.\n","authors":["Zhiwen Fan","Wenyan Cong","Kairun Wen","Kevin Wang","Jian Zhang","Xinghao Ding","Danfei Xu","Boris Ivanovic","Marco Pavone","Georgios Pavlakos","Zhangyang Wang","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2403.20309v2.pdf","comment":"Project Page: https://instantsplat.github.io/"},{"id":"http://arxiv.org/abs/2405.12853v2","updated":"2024-06-30T19:31:14Z","published":"2024-05-21T15:11:35Z","title":"Inconsistency-Aware Cross-Attention for Audio-Visual Fusion in\n Dimensional Emotion Recognition","summary":" Leveraging complementary relationships across modalities has recently drawn a\nlot of attention in multimodal emotion recognition. Most of the existing\napproaches explored cross-attention to capture the complementary relationships\nacross the modalities. However, the modalities may also exhibit weak\ncomplementary relationships, which may deteriorate the cross-attended features,\nresulting in poor multimodal feature representations. To address this problem,\nwe propose Inconsistency-Aware Cross-Attention (IACA), which can adaptively\nselect the most relevant features on-the-fly based on the strong or weak\ncomplementary relationships across audio and visual modalities. Specifically,\nwe design a two-stage gating mechanism that can adaptively select the\nappropriate relevant features to deal with weak complementary relationships.\nExtensive experiments are conducted on the challenging Aff-Wild2 dataset to\nshow the robustness of the proposed model.\n","authors":["G Rajasekhar","Jahangir Alam"],"pdf_url":"https://arxiv.org/pdf/2405.12853v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.19554"},{"id":"http://arxiv.org/abs/2406.18610v2","updated":"2024-06-30T17:40:41Z","published":"2024-06-25T00:16:57Z","title":"Vox-UDA: Voxel-wise Unsupervised Domain Adaptation for Cryo-Electron\n Subtomogram Segmentation with Denoised Pseudo Labeling","summary":" Cryo-Electron Tomography (cryo-ET) is a 3D imaging technology facilitating\nthe study of macromolecular structures at near-atomic resolution. Recent\nvolumetric segmentation approaches on cryo-ET images have drawn widespread\ninterest in biological sector. However, existing methods heavily rely on\nmanually labeled data, which requires highly professional skills, thereby\nhindering the adoption of fully-supervised approaches for cryo-ET images. Some\nunsupervised domain adaptation (UDA) approaches have been designed to enhance\nthe segmentation network performance using unlabeled data. However, applying\nthese methods directly to cryo-ET images segmentation tasks remains challenging\ndue to two main issues: 1) the source data, usually obtained through\nsimulation, contain a certain level of noise, while the target data, directly\ncollected from raw-data from real-world scenario, have unpredictable noise\nlevels. 2) the source data used for training typically consists of known\nmacromoleculars, while the target domain data are often unknown, causing the\nmodel's segmenter to be biased towards these known macromolecules, leading to a\ndomain shift problem. To address these challenges, in this work, we introduce\nthe first voxel-wise unsupervised domain adaptation approach, termed Vox-UDA,\nspecifically for cryo-ET subtomogram segmentation. 
Vox-UDA incorporates a noise\ngeneration module to simulate target-like noises in the source dataset for\ncross-noise level adaptation. Additionally, we propose a denoised\npseudo-labeling strategy based on improved Bilateral Filter to alleviate the\ndomain shift problem. Experimental results on both simulated and real cryo-ET\nsubtomogram datasets demonstrate the superiority of our proposed approach\ncompared to state-of-the-art UDA methods.\n","authors":["Haoran Li","Xingjian Li","Jiahua Shi","Huaming Chen","Bo Du","Daisuke Kihara","Johan Barthelemy","Jun Shen","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2406.18610v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2406.09317v2","updated":"2024-06-30T17:32:15Z","published":"2024-06-13T16:53:57Z","title":"Common and Rare Fundus Diseases Identification Using Vision-Language\n Foundation Model with Knowledge of Over 400 Diseases","summary":" Previous foundation models for retinal images were pre-trained with limited\ndisease categories and knowledge base. Here we introduce RetiZero, a\nvision-language foundation model that leverages knowledge from over 400 fundus\ndiseases. To RetiZero's pre-training, we compiled 341,896 fundus images paired\nwith text descriptions, sourced from public datasets, ophthalmic literature,\nand online resources, encompassing a diverse range of diseases across multiple\nethnicities and countries. RetiZero exhibits superior performance in several\ndownstream tasks, including zero-shot disease recognition, image-to-image\nretrieval, and internal- and cross-domain disease identification. In zero-shot\nscenarios, RetiZero achieves Top5 accuracy scores of 0.8430 for 15 fundus\ndiseases and 0.7561 for 52 fundus diseases. For image retrieval, it achieves\nTop5 scores of 0.9500 and 0.8860 for the same disease sets, respectively.\nClinical evaluations show that RetiZero's Top3 zero-shot performance surpasses\nthe average of 19 ophthalmologists from Singapore, China and the United States.\nFurthermore, RetiZero significantly enhances clinicians' accuracy in diagnosing\nfundus disease. These findings underscore the value of integrating the RetiZero\nfoundation model into clinical settings, where a variety of fundus diseases are\nencountered.\n","authors":["Meng Wang","Tian Lin","Aidi Lin","Kai Yu","Yuanyuan Peng","Lianyu Wang","Cheng Chen","Ke Zou","Huiyu Liang","Man Chen","Xue Yao","Meiqin Zhang","Binwei Huang","Chaoxin Zheng","Peixin Zhang","Wei Chen","Yilong Luo","Yifan Chen","Honghe Xia","Tingkun Shi","Qi Zhang","Jinming Guo","Xiaolin Chen","Jingcheng Wang","Yih Chung Tham","Dianbo Liu","Wendy Wong","Sahil Thakur","Beau Fenner","Danqi Fang","Siying Liu","Qingyun Liu","Yuqiang Huang","Hongqiang Zeng","Yanda Meng","Yukun Zhou","Zehua Jiang","Minghui Qiu","Changqing Zhang","Xinjian Chen","Sophia Y Wang","Cecilia S Lee","Lucia Sobrin","Carol Y Cheung","Chi Pui Pang","Pearse A Keane","Ching-Yu Cheng","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2406.09317v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01324v2","updated":"2024-06-30T16:25:48Z","published":"2022-04-04T08:57:34Z","title":"Outlier-Robust Geometric Perception: A Novel Thresholding-Based\n Estimator with Intra-Class Variance Maximization","summary":" Geometric perception problems are fundamental tasks in robotics and computer\nvision. 
In real-world applications, they often encounter the inevitable issue\nof outliers, preventing traditional algorithms from making correct estimates.\nIn this paper, we present a novel general-purpose robust estimator TIVM\n(Thresholding with Intra-class Variance Maximization) that can collaborate with\nstandard non-minimal solvers to efficiently reject outliers for geometric\nperception problems. First, we introduce the technique of intra-class variance\nmaximization to design a dynamic 2-group thresholding method on the measurement\nresiduals, aiming to distinctively separate inliers from outliers. Then, we\ndevelop an iterative framework that robustly optimizes the model by approaching\nthe pure-inlier group using a multi-layered dynamic thresholding strategy as\nsubroutine, in which a self-adaptive mechanism for layer-number tuning is\nfurther employed to minimize the user-defined parameters. We validate the\nproposed estimator on 3 classic geometric perception problems: rotation\naveraging, point cloud registration and category-level perception, and\nexperiments show that it is robust against 70--90\\% of outliers and can\nconverge typically in only 3--15 iterations, much faster than state-of-the-art\nrobust solvers such as RANSAC, GNC and ADAPT. Furthermore, another highlight is\nthat: our estimator can retain approximately the same level of robustness even\nwhen the inlier-noise statistics of the problem are fully unknown.\n","authors":["Lei Sun"],"pdf_url":"https://arxiv.org/pdf/2204.01324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12120v2","updated":"2024-06-30T15:39:53Z","published":"2024-04-18T12:13:09Z","title":"Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors","summary":" This paper presents RADAR-Robust Adversarial Detection via Adversarial\nRetraining-an approach designed to enhance the robustness of adversarial\ndetectors against adaptive attacks, while maintaining classifier performance.\nAn adaptive attack is one where the attacker is aware of the defenses and\nadapts their strategy accordingly. Our proposed method leverages adversarial\ntraining to reinforce the ability to detect attacks, without compromising clean\naccuracy. During the training phase, we integrate into the dataset adversarial\nexamples, which were optimized to fool both the classifier and the adversarial\ndetector, enabling the adversarial detector to learn and adapt to potential\nattack scenarios. Experimental evaluations on the CIFAR-10 and SVHN datasets\ndemonstrate that our proposed algorithm significantly improves a detector's\nability to accurately identify adaptive adversarial attacks -- without\nsacrificing clean accuracy.\n","authors":["Raz Lapid","Almog Dubin","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2404.12120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v5","updated":"2024-06-30T15:03:42Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. 
By\nintroducing the concept of a spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v5.pdf","comment":"draftcls option"},{"id":"http://arxiv.org/abs/2406.02540v2","updated":"2024-06-30T14:41:22Z","published":"2024-06-04T17:57:10Z","title":"ViDiT-Q: Efficient and Accurate Quantization of Diffusion Transformers\n for Image and Video Generation","summary":" Diffusion transformers (DiTs) have exhibited remarkable performance in visual\ngeneration tasks, such as generating realistic images or videos based on\ntextual instructions. However, larger model sizes and multi-frame processing\nfor video generation lead to increased computational and memory costs, posing\nchallenges for practical deployment on edge devices. Post-Training Quantization\n(PTQ) is an effective method for reducing memory costs and computational\ncomplexity. When quantizing diffusion transformers, we find that applying\nexisting diffusion quantization methods designed for U-Net faces challenges in\npreserving quality. After analyzing the major challenges for quantizing\ndiffusion transformers, we design an improved quantization scheme, \"ViDiT-Q\"\n(Video and Image Diffusion Transformer Quantization), to address these issues.\nFurthermore, we identify that highly sensitive layers and timesteps hinder\nquantization at lower bit-widths. To tackle this, we improve ViDiT-Q with a\nnovel metric-decoupled mixed-precision quantization method (ViDiT-Q-MP). We\nvalidate the effectiveness of ViDiT-Q across a variety of text-to-image and\nvideo models. While baseline quantization methods fail at W8A8 and produce\nunreadable content at W4A8, ViDiT-Q achieves lossless W8A8 quantization.\nViDiT-Q-MP achieves W4A8 with negligible visual quality degradation, resulting\nin a 2.5x memory optimization and a 1.5x latency speedup.\n","authors":["Tianchen Zhao","Tongcheng Fang","Enshu Liu","Rui Wan","Widyadewi Soedarmadji","Shiyao Li","Zinan Lin","Guohao Dai","Shengen Yan","Huazhong Yang","Xuefei Ning","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02540v2.pdf","comment":"Project Page: https://a-suozhang.xyz/viditq.github.io/"},{"id":"http://arxiv.org/abs/2406.10508v2","updated":"2024-06-30T14:28:34Z","published":"2024-06-15T05:21:33Z","title":"Learning to Adapt Foundation Model DINOv2 for Capsule Endoscopy\n Diagnosis","summary":" Foundation models have become prominent in computer vision, achieving notable\nsuccess in various tasks. However, their effectiveness largely depends on\npre-training with extensive datasets. Applying foundation models directly to\nsmall datasets of capsule endoscopy images from scratch is challenging.\nPre-training on broad, general vision datasets is crucial for successfully\nfine-tuning our model for specific tasks. In this work, we introduce a\nsimplified approach that adapts foundation models with a low-rank adaptation\n(LoRA) technique for easier customization. Our method, inspired by the DINOv2\nfoundation model, applies low-rank adaptation learning to tailor foundation\nmodels for capsule endoscopy diagnosis effectively. Unlike traditional\nfine-tuning methods, our strategy includes LoRA layers designed to absorb\nspecific surgical domain knowledge. 
During the training process, we keep the\nmain model (the backbone encoder) fixed and focus on optimizing the LoRA layers\nand the disease classification component. We tested our method on two publicly\navailable datasets for capsule endoscopy disease classification. The results\nwere impressive, with our model achieving 97.75% accuracy on the Kvasir-Capsule\ndataset and 98.81% on the Kvasirv2 dataset. Our solution demonstrates that\nfoundation models can be adeptly adapted for capsule endoscopy diagnosis,\nhighlighting that mere reliance on straightforward fine-tuning or pre-trained\nmodels from general computer vision tasks is inadequate for such specific\napplications.\n","authors":["Bowen Zhang","Ying Chen","Long Bai","Yan Zhao","Yuxiang Sun","Yixuan Yuan","Jianhua Zhang","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2406.10508v2.pdf","comment":"To appear in ICBIR 2024"},{"id":"http://arxiv.org/abs/2406.15921v2","updated":"2024-06-30T12:04:43Z","published":"2024-06-22T19:21:42Z","title":"PUDD: Towards Robust Multi-modal Prototype-based Deepfake Detection","summary":" Deepfake techniques generate highly realistic data, making it challenging for\nhumans to discern between actual and artificially generated images. Recent\nadvancements in deep learning-based deepfake detection methods, particularly\nwith diffusion models, have shown remarkable progress. However, there is a\ngrowing demand for real-world applications to detect unseen individuals,\ndeepfake techniques, and scenarios. To address this limitation, we propose a\nPrototype-based Unified Framework for Deepfake Detection (PUDD). PUDD offers a\ndetection system based on similarity, comparing input data against known\nprototypes for video classification and identifying potential deepfakes or\npreviously unseen classes by analyzing drops in similarity. Our extensive\nexperiments reveal three key findings: (1) PUDD achieves an accuracy of 95.1%\non Celeb-DF, outperforming state-of-the-art deepfake detection methods; (2)\nPUDD leverages image classification as the upstream task during training,\ndemonstrating promising performance in both image classification and deepfake\ndetection tasks during inference; (3) PUDD requires only 2.7 seconds for\nretraining on new data and emits 10$^{5}$ times less carbon compared to the\nstate-of-the-art model, making it significantly more environmentally friendly.\n","authors":["Alvaro Lopez Pellcier","Yi Li","Plamen Angelov"],"pdf_url":"https://arxiv.org/pdf/2406.15921v2.pdf","comment":"CVPR2024"},{"id":"http://arxiv.org/abs/2402.15852v7","updated":"2024-06-30T11:14:13Z","published":"2024-02-24T16:39:16Z","title":"NaVid: Video-based VLM Plans the Next Step for Vision-and-Language\n Navigation","summary":" Vision-and-language navigation (VLN) stands as a key research problem of\nEmbodied AI, aiming at enabling agents to navigate in unseen environments\nfollowing linguistic instructions. In this field, generalization is a\nlong-standing challenge, either to out-of-distribution scenes or from Sim to\nReal. In this paper, we propose NaVid, a video-based large vision language\nmodel (VLM), to mitigate such a generalization gap. NaVid makes the first\nendeavor to showcase the capability of VLMs to achieve state-of-the-art level\nnavigation performance without any maps, odometers, or depth inputs. Following\nhuman instruction, NaVid only requires an on-the-fly video stream from a\nmonocular RGB camera equipped on the robot to output the next-step action. 
Our\nformulation mimics how humans navigate and naturally gets rid of the problems\nintroduced by odometer noise, and the Sim2Real gaps from map or depth inputs.\nMoreover, our video-based approach can effectively encode the historical\nobservations of robots as spatio-temporal contexts for decision making and\ninstruction following. We train NaVid with 510k navigation samples collected\nfrom continuous environments, including action-planning and\ninstruction-reasoning samples, along with 763k large-scale web data. Extensive\nexperiments show that NaVid achieves state-of-the-art performance in simulation\nenvironments and the real world, demonstrating superior cross-dataset and\nSim2Real transfer. We thus believe our proposed VLM approach plans the next\nstep not only for navigation agents but also for this research field.\n","authors":["Jiazhao Zhang","Kunyu Wang","Rongtao Xu","Gengze Zhou","Yicong Hong","Xiaomeng Fang","Qi Wu","Zhizheng Zhang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2402.15852v7.pdf","comment":"Accepted by Robotics: Science and Systems (RSS 2024)"},{"id":"http://arxiv.org/abs/2211.07955v2","updated":"2024-06-30T09:35:25Z","published":"2022-11-15T07:41:00Z","title":"IntegratedPIFu: Integrated Pixel Aligned Implicit Function for\n Single-view Human Reconstruction","summary":" We propose IntegratedPIFu, a new pixel-aligned implicit model that builds on\nthe foundation set by PIFuHD. IntegratedPIFu shows how depth and human parsing\ninformation can be predicted and capitalised upon in a pixel-aligned implicit\nmodel. In addition, IntegratedPIFu introduces depth-oriented sampling, a novel\ntraining scheme that improves any pixel-aligned implicit model's ability to\nreconstruct important human features without noisy artefacts. Lastly,\nIntegratedPIFu presents a new architecture that, despite using fewer model\nparameters than PIFuHD, is able to improve the structural correctness of\nreconstructed meshes. Our results show that IntegratedPIFu significantly\noutperforms existing state-of-the-art methods on single-view human\nreconstruction. Our code has been made available online.\n","authors":["Kennard Yanting Chan","Guosheng Lin","Haiyu Zhao","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2211.07955v2.pdf","comment":"Accepted to ECCV 2022"},{"id":"http://arxiv.org/abs/2402.02956v4","updated":"2024-06-30T09:08:55Z","published":"2024-02-05T12:34:03Z","title":"AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a\n Single High-Resolution Image","summary":" The process of estimating and counting tree density using only a single\naerial or satellite image is a difficult task in the fields of photogrammetry\nand remote sensing. However, it plays a crucial role in the management of\nforests. The huge variety of trees in varied topography severely hinders tree\ncounting models from performing well. The purpose of this paper is to propose a\nframework that is learnt from the source domain with sufficient labeled trees\nand is adapted to the target domain with only a limited number of labeled\ntrees. Our method, termed AdaTreeFormer, contains one shared encoder with a\nhierarchical feature extraction scheme to extract robust features from the\nsource and target domains. It also consists of three subnets: two for\nextracting self-domain attention maps from source and target domains\nrespectively and one for extracting cross-domain attention maps. 
For the\nlatter, an attention-to-adapt mechanism is introduced to distill relevant\ninformation from different domains while generating tree density maps; a\nhierarchical cross-domain feature alignment scheme is proposed that\nprogressively aligns the features from the source and target domains. We also\nadopt adversarial learning into the framework to further reduce the gap between\nsource and target domains. Our AdaTreeFormer is evaluated on six designed\ndomain adaptation tasks using three tree counting datasets, \\ie Jiangsu,\nYosemite, and London. Experimental results show that AdaTreeFormer\nsignificantly surpasses the state of the art, \\eg in the cross domain from the\nYosemite to Jiangsu dataset, it achieves a reduction of 15.9 points in terms of\nthe absolute counting errors and an increase of 10.8\\% in the accuracy of the\ndetected trees' locations. The codes and datasets are available at\nhttps://github.com/HAAClassic/AdaTreeFormer.\n","authors":["Hamed Amini Amirkolaee","Miaojing Shi","Lianghua He","Mark Mulligan"],"pdf_url":"https://arxiv.org/pdf/2402.02956v4.pdf","comment":"Accepted in ISPRS Journal of Photogrammetry and Remote Sensing"},{"id":"http://arxiv.org/abs/2404.11741v2","updated":"2024-06-30T08:39:12Z","published":"2024-04-17T20:48:19Z","title":"Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT\n Synthesis for Head and Neck Proton Treatment Planning","summary":" In recent advancements in proton therapy, MR-based treatment planning is\ngaining momentum to minimize additional radiation exposure compared to\ntraditional CT-based methods. This transition highlights the critical need for\naccurate MR-to-CT image synthesis, which is essential for precise proton dose\ncalculations. Our research introduces the Diffusion Schr\\\"odinger Bridge Models\n(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns\nthe nonlinear diffusion processes between MR and CT data distributions. This\nmethod improves upon traditional diffusion models by initiating synthesis from\nthe prior distribution rather than the Gaussian distribution, enhancing both\ngeneration quality and efficiency. We validated the effectiveness of DSBM on a\nhead and neck cancer dataset, demonstrating its superiority over traditional\nimage synthesis methods through both image-level and dosimetric-level\nevaluations. The effectiveness of DSBM in MR-based proton treatment planning\nhighlights its potential as a valuable tool in various clinical scenarios.\n","authors":["Muheng Li","Xia Li","Sairos Safai","Damien Weber","Antony Lomax","Ye Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11741v2.pdf","comment":"International Conference on the use of Computers in Radiation therapy\n (ICCR)"},{"id":"http://arxiv.org/abs/2406.19057v2","updated":"2024-06-30T07:54:30Z","published":"2024-06-27T10:08:29Z","title":"Segment Anything Model for automated image data annotation: empirical\n studies using text prompts from Grounding DINO","summary":" Grounding DINO and the Segment Anything Model (SAM) have achieved impressive\nperformance in zero-shot object detection and image segmentation, respectively.\nTogether, they have a great potential to revolutionize applications in\nzero-shot semantic segmentation or data annotation. Yet, in specialized domains\nlike medical image segmentation, objects of interest (e.g., organs, tissues,\nand tumors) may not fall in existing class names. 
To address this problem, the\nreferring expression comprehension (REC) ability of Grounding DINO is leveraged\nto detect arbitrary targets by their language descriptions. However, recent\nstudies have highlighted a severe limitation of the REC framework in this\napplication setting owing to its tendency to make false positive predictions\nwhen the target is absent from the given image. And, while this bottleneck is\ncentral to the prospect of open-set semantic segmentation, it is still largely\nunknown how much improvement can be achieved by studying the prediction errors.\nTo this end, we perform empirical studies on six publicly available datasets\nacross different domains and reveal that these errors consistently follow a\npredictable pattern and can, thus, be mitigated by a simple strategy.\nSpecifically, we show that false positive detections with appreciable\nconfidence scores generally occupy large image areas and can usually be\nfiltered by their relative sizes. More importantly, we expect these\nobservations to inspire future research in improving REC-based detection and\nautomated segmentation. Meanwhile, we evaluate the performance of SAM on\nmultiple datasets from various specialized domains and report significant\nimprovements in segmentation performance and annotation time savings over\nmanual approaches.\n","authors":["Fuseini Mumuni","Alhassan Mumuni"],"pdf_url":"https://arxiv.org/pdf/2406.19057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11748v2","updated":"2024-06-30T07:50:27Z","published":"2023-04-23T20:53:05Z","title":"IDLS: Inverse Depth Line based Visual-Inertial SLAM","summary":" For robust visual-inertial SLAM in perceptually-challenging indoor\nenvironments, recent studies exploit line features to extract descriptive\ninformation about scene structure to deal with the degeneracy of point\nfeatures. But existing point-line-based SLAM methods mainly use the Pl\\\"ucker\nmatrix or orthogonal representation to represent a line, which requires at\nleast four variables to determine a line. Given the numerous line\nfeatures to determine in each frame, the overly flexible line representation\nincreases the computational burden and compromises the accuracy of the results. In\nthis paper, we propose an inverse depth representation for a line, which models\neach extracted line feature using only two variables, i.e., the inverse depths\nof the two ending points. It exploits the fact that the projected line's pixel\ncoordinates on the image plane are rather accurate, which partially restricts\nthe line. Using this compact line representation, Inverse Depth Line SLAM (IDLS)\nis proposed to track the line features in SLAM in an accurate and efficient\nway. A robust line triangulation method and a novel line re-projection error\nmodel are introduced. A two-step optimization method is proposed to first\ndetermine the lines and then estimate the camera poses in each frame. IDLS\nis extensively evaluated on multiple perceptually-challenging datasets. 
The\nresults show it is more accurate, robust, and needs lower computational\noverhead than the current state-of-the-art of point-line-based SLAM methods.\n","authors":["Wanting Li","Shuo Wang","Yongcai Wang","Yu Shao","Xuewei Bai","Deying Li"],"pdf_url":"https://arxiv.org/pdf/2304.11748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07648v2","updated":"2024-06-30T07:36:35Z","published":"2024-05-13T11:13:17Z","title":"CDFormer:When Degradation Prediction Embraces Diffusion Model for Blind\n Image Super-Resolution","summary":" Existing Blind image Super-Resolution (BSR) methods focus on estimating\neither kernel or degradation information, but have long overlooked the\nessential content details. In this paper, we propose a novel BSR approach,\nContent-aware Degradation-driven Transformer (CDFormer), to capture both\ndegradation and content representations. However, low-resolution images cannot\nprovide enough content details, and thus we introduce a diffusion-based module\n$CDFormer_{diff}$ to first learn Content Degradation Prior (CDP) in both low-\nand high-resolution images, and then approximate the real distribution given\nonly low-resolution information. Moreover, we apply an adaptive SR network\n$CDFormer_{SR}$ that effectively utilizes CDP to refine features. Compared to\nprevious diffusion-based SR methods, we treat the diffusion model as an\nestimator that can overcome the limitations of expensive sampling time and\nexcessive diversity. Experiments show that CDFormer can outperform existing\nmethods, establishing a new state-of-the-art performance on various benchmarks\nunder blind settings. Codes and models will be available at\n\\href{https://github.com/I2-Multimedia-Lab/CDFormer}{https://github.com/I2-Multimedia-Lab/CDFormer}.\n","authors":["Qingguo Liu","Chenyi Zhuang","Pan Gao","Jie Qin"],"pdf_url":"https://arxiv.org/pdf/2405.07648v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08085v2","updated":"2024-06-30T05:39:46Z","published":"2024-06-12T11:07:55Z","title":"Flash-VStream: Memory-Based Real-Time Understanding for Long Video\n Streams","summary":" Benefiting from the advancements in large language models and cross-modal\nalignment, existing multi-modal video understanding methods have achieved\nprominent performance in offline scenario. However, online video streams, as\none of the most common media forms in the real world, have seldom received\nattention. Compared to offline videos, the 'dynamic' nature of online video\nstreams poses challenges for the direct application of existing models and\nintroduces new problems, such as the storage of extremely long-term\ninformation, interaction between continuous visual content and 'asynchronous'\nuser questions. Therefore, in this paper we present Flash-VStream, a\nvideo-language model that simulates the memory mechanism of human. Our model is\nable to process extremely long video streams in real-time and respond to user\nqueries simultaneously. Compared to existing models, Flash-VStream achieves\nsignificant reductions in inference latency and VRAM consumption, which is\nintimately related to performing understanding of online streaming video. In\naddition, given that existing video understanding benchmarks predominantly\nconcentrate on offline scenario, we propose VStream-QA, a novel question\nanswering benchmark specifically designed for online video streaming\nunderstanding. 
Comparisons with popular existing methods on the proposed\nbenchmark demonstrate the superiority of our method for such challenging\nsetting. To verify the generalizability of our approach, we further evaluate it\non existing video understanding benchmarks and achieves state-of-the-art\nperformance in offline scenarios as well. All code, models, and datasets are\navailable at the https://invinciblewyq.github.io/vstream-page/\n","authors":["Haoji Zhang","Yiqin Wang","Yansong Tang","Yong Liu","Jiashi Feng","Jifeng Dai","Xiaojie Jin"],"pdf_url":"https://arxiv.org/pdf/2406.08085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14739v2","updated":"2024-06-30T04:00:09Z","published":"2024-04-23T04:45:23Z","title":"BMapEst: Estimation of Brain Tissue Probability Maps using a\n Differentiable MRI Simulator","summary":" Reconstructing digital brain phantoms in the form of voxel-based,\nmulti-channeled tissue probability maps for individual subjects is essential\nfor capturing brain anatomical variability, understanding neurological\ndiseases, as well as for testing image processing methods. We demonstrate the\nfirst framework that estimates brain tissue probability maps (Grey Matter - GM,\nWhite Matter - WM, and Cerebrospinal fluid - CSF) with the help of a\nPhysics-based differentiable MRI simulator that models the magnetization signal\nat each voxel in the volume. Given an observed $T_1$/$T_2$-weighted MRI scan,\nthe corresponding clinical MRI sequence, and the MRI differentiable simulator,\nwe estimate the simulator's input probability maps by back-propagating the L2\nloss between the simulator's output and the $T_1$/$T_2$-weighted scan. This\napproach has the significant advantage of not relying on any training data and\ninstead uses the strong inductive bias of the MRI simulator. We tested the\nmodel on 20 scans from the BrainWeb database and demonstrated a highly accurate\nreconstruction of GM, WM, and CSF. Our source code is available online:\nhttps://github.com/BioMedAI-UCSC/BMapEst.\n","authors":["Utkarsh Gupta","Emmanouil Nikolakakis","Moritz Zaiss","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2404.14739v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13173v2","updated":"2024-06-30T01:22:09Z","published":"2024-06-19T03:07:33Z","title":"Biomedical Visual Instruction Tuning with Clinician Preference Alignment","summary":" Recent advancements in multimodal foundation models have showcased impressive\ncapabilities in understanding and reasoning with visual and textual\ninformation. Adapting these foundation models trained for general usage to\nspecialized domains like biomedicine requires large-scale domain-specific\ninstruction datasets. While existing works have explored curating such datasets\nautomatically, the resultant datasets are not explicitly aligned with domain\nexpertise. In this work, we propose a data-centric framework, Biomedical Visual\nInstruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that\nincorporates clinician preferences into both stages of generating and selecting\ninstruction data for tuning biomedical multimodal foundation models. First,\nduring the generation stage, we prompt the GPT-4V generator with a diverse set\nof clinician-selected demonstrations for preference-aligned data candidate\ngeneration. 
Then, during the selection phase, we train a separate selection\nmodel, which explicitly distills clinician and policy-guided model preferences\ninto a rating function to select high-quality data for medical instruction\ntuning. Results show that the model tuned with the instruction-following data\nfrom our method demonstrates a significant improvement in open visual chat\n(18.5% relatively) and medical VQA (win rate up to 81.73%). Our\ninstruction-following data and models are available at BioMed-VITAL.github.io.\n","authors":["Hejie Cui","Lingjun Mao","Xin Liang","Jieyu Zhang","Hui Ren","Quanzheng Li","Xiang Li","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2406.13173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15683v2","updated":"2024-06-30T01:21:46Z","published":"2024-04-24T06:35:56Z","title":"AnoFPDM: Anomaly Segmentation with Forward Process of Diffusion Models\n for Brain MRI","summary":" Weakly-supervised diffusion models (DMs) in anomaly segmentation, leveraging\nimage-level labels, have attracted significant attention for their superior\nperformance compared to unsupervised methods. It eliminates the need for\npixel-level labels in training, offering a more cost-effective alternative to\nsupervised methods. However, existing methods are not fully weakly-supervised\nbecause they heavily rely on costly pixel-level labels for hyperparameter\ntuning in inference. To tackle this challenge, we introduce Anomaly\nSegmentation with Forward Process of Diffusion Models (AnoFPDM), a fully\nweakly-supervised framework that operates without the need of pixel-level\nlabels. Leveraging the unguided forward process as a reference for the guided\nforward process, we select hyperparameters such as the noise scale, the\nthreshold for segmentation and the guidance strength. We aggregate anomaly maps\nfrom guided forward process, enhancing the signal strength of anomalous\nregions. Remarkably, our proposed method outperforms recent state-of-the-art\nweakly-supervised approaches, even without utilizing pixel-level labels.\n","authors":["Yiming Che","Fazle Rafsani","Jay Shah","Md Mahfuzur Rahman Siddiquee","Teresa Wu"],"pdf_url":"https://arxiv.org/pdf/2404.15683v2.pdf","comment":"v2: updated introduction, experiments and supplementary material"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2403.03536v2","updated":"2024-06-30T04:00:06Z","published":"2024-03-06T08:31:35Z","title":"Towards Efficient and Effective Unlearning of Large Language Models for\n Recommendation","summary":" The significant advancements in large language models (LLMs) give rise to a\npromising research direction, i.e., leveraging LLMs as recommenders (LLMRec).\nThe efficacy of LLMRec arises from the open-world knowledge and reasoning\ncapabilities inherent in LLMs. LLMRec acquires the recommendation capabilities\nthrough instruction tuning based on user interaction data. However, in order to\nprotect user privacy and optimize utility, it is also crucial for LLMRec to\nintentionally forget specific user data, which is generally referred to as\nrecommendation unlearning. In the era of LLMs, recommendation unlearning poses\nnew challenges for LLMRec in terms of \\textit{inefficiency} and\n\\textit{ineffectiveness}. Existing unlearning methods require updating billions\nof parameters in LLMRec, which is costly and time-consuming. Besides, they\nalways impact the model utility during the unlearning process. 
To this end, we\npropose \textbf{E2URec}, the first \underline{E}fficient and\n\underline{E}ffective \underline{U}nlearning method for LLM\underline{Rec}. Our\nproposed E2URec enhances the unlearning efficiency by updating only a few\nadditional LoRA parameters, and improves the unlearning effectiveness by\nemploying a teacher-student framework, where we maintain multiple teacher\nnetworks to guide the unlearning process. Extensive experiments show that\nE2URec outperforms state-of-the-art baselines on two real-world datasets.\nSpecifically, E2URec can efficiently forget specific data without affecting\nrecommendation performance. The source code is at\n\url{https://github.com/justarter/E2URec}.\n","authors":["Hangyu Wang","Jianghao Lin","Bo Chen","Yang Yang","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2403.03536v2.pdf","comment":"Accepted by Frontiers of Computer Science"},{"id":"http://arxiv.org/abs/2406.15859v2","updated":"2024-06-30T02:13:19Z","published":"2024-06-22T14:14:03Z","title":"LLM-Powered Explanations: Unraveling Recommendations Through Subgraph\n Reasoning","summary":" Recommender systems are pivotal in enhancing user experiences across various\nweb applications by analyzing the complicated relationships between users and\nitems. Knowledge graphs (KGs) have been widely used to enhance the performance\nof recommender systems. However, KGs are known to be noisy and incomplete,\nwhich makes it hard to provide reliable explanations for recommendation results. An\nexplainable recommender system is crucial for product development and\nsubsequent decision-making. To address these challenges, we introduce a novel\nrecommender that synergizes Large Language Models (LLMs) and KGs to enhance the\nrecommendation and provide interpretable results. Specifically, we first\nharness the power of LLMs to augment KG reconstruction. LLMs comprehend and\ndecompose user reviews into new triples that are added into the KG. In this way, we\ncan enrich KGs with explainable paths that express user preferences. To enhance\nthe recommendation on augmented KGs, we introduce a novel subgraph reasoning\nmodule that effectively measures the importance of nodes and discovers\nreasoning paths for recommendation. Finally, these reasoning paths are fed into the\nLLMs to generate interpretable explanations of the recommendation results. Our\napproach significantly enhances both the effectiveness and interpretability of\nrecommender systems, especially in cross-selling scenarios where traditional\nmethods falter. The effectiveness of our approach has been rigorously tested on\nfour open real-world datasets, with our methods demonstrating superior\nperformance over contemporary state-of-the-art techniques by an average\nimprovement of 12%. 
The application of our model in a multinational engineering\nand technology company's cross-selling recommendation system further underscores\nits practical utility and potential to redefine recommendation practices\nthrough improved accuracy and user trust.\n","authors":["Guangsi Shi","Xiaofeng Deng","Linhao Luo","Lijuan Xia","Lei Bao","Bei Ye","Fei Du","Shirui Pan","Yuxiao Li"],"pdf_url":"https://arxiv.org/pdf/2406.15859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00834v1","updated":"2024-06-30T21:07:11Z","published":"2024-06-30T21:07:11Z","title":"Prediction of Sentinel-2 multi-band imagery with attention BiLSTM for\n continuous earth surface monitoring","summary":" Continuous monitoring of crops and forecasting crop conditions through time\nseries analysis is crucial for effective agricultural management. This study\nproposes a framework based on an attention Bidirectional Long Short-Term Memory\n(BiLSTM) network for predicting multiband images. Our model can forecast target\nimages on user-defined dates, including future dates and periods characterized\nby persistent cloud cover. By focusing on short sequences within a\nsequence-to-one forecasting framework, the model leverages advanced attention\nmechanisms to enhance prediction accuracy. Our experimental results demonstrate\nthe model's superior performance in predicting NDVI, multiple vegetation\nindices, and all Sentinel-2 bands, highlighting its potential for improving\nremote sensing data continuity and reliability.\n","authors":["Weiying Zhao","Natalia Efremova"],"pdf_url":"https://arxiv.org/pdf/2407.00834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00787v1","updated":"2024-06-30T18:04:16Z","published":"2024-06-30T18:04:16Z","title":"Enhancing Travel Decision-Making: A Contrastive Learning Approach for\n Personalized Review Rankings in Accommodations","summary":" User-generated reviews significantly influence consumer decisions,\nparticularly in the travel domain when selecting accommodations. This paper's\ncontribution comprises two main elements. Firstly, we present a novel dataset\nof authentic guest reviews sourced from a prominent online travel platform,\ntotaling over two million reviews from 50,000 distinct accommodations.\nSecondly, we propose an innovative approach for personalized review ranking.\nOur method employs contrastive learning to intricately capture the relationship\nbetween a review and the contextual information of its respective reviewer.\nThrough a comprehensive experimental study, we demonstrate that our approach\nsurpasses several baselines across all reported metrics. Augmented by a\ncomparative analysis, we showcase the efficacy of our method in elevating\npersonalized review ranking. The implications of our research extend beyond the\ntravel domain, with potential applications in other sectors where personalized\nreview ranking is paramount, such as online e-commerce platforms.\n","authors":["Reda Igebaria","Eran Fainman","Sarai Mizrachi","Moran Beladev","Fengjun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.00787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00635v1","updated":"2024-06-30T09:25:42Z","published":"2024-06-30T09:25:42Z","title":"Dense Retrieval with Continuous Explicit Feedback for Systematic Review\n Screening Prioritisation","summary":" The goal of screening prioritisation in systematic reviews is to identify\nrelevant documents with high recall and rank them in early positions for\nreview. 
This saves reviewing effort if paired with a stopping criterion, and\nspeeds up review completion if performed alongside downstream tasks. Recent\nstudies have shown that neural models have good potential on this task, but\ntheir time-consuming fine-tuning and inference discourage their widespread use\nfor screening prioritisation. In this paper, we propose an alternative approach\nthat still relies on neural models, but leverages dense representations and\nrelevance feedback to enhance screening prioritisation, without the need for\ncostly model fine-tuning and inference. This method exploits continuous\nrelevance feedback from reviewers during document screening to efficiently\nupdate the dense query representation, which is then applied to rank the\nremaining documents to be screened. We evaluate this approach across the CLEF\nTAR datasets for this task. Results suggest that the investigated dense\nquery-driven approach is more efficient than directly using neural models and\nshows promising effectiveness compared to previous methods developed on the\nconsidered datasets. Our code is available at\nhttps://github.com/ielab/dense-screening-feedback.\n","authors":["Xinyu Mao","Shengyao Zhuang","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2407.00635v1.pdf","comment":"Accepted at SIGIR 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2403.19708v3","updated":"2024-06-30T23:50:38Z","published":"2024-03-23T10:42:49Z","title":"Cost-Efficient Large Language Model Serving for Multi-turn Conversations\n with CachedAttention","summary":" Interacting with humans through multi-turn conversations is a fundamental\nfeature of large language models (LLMs). However, existing LLM serving engines\nexecuting multi-turn conversations are inefficient due to the need to\nrepeatedly compute the key-value (KV) caches of historical tokens, incurring\nhigh serving costs. To address the problem, this paper proposes\nCachedAttention, a new attention mechanism that enables reuse of KV caches\nacross multi-turn conversations, significantly reducing the repetitive\ncomputation overheads. CachedAttention maintains a hierarchical KV caching\nsystem that leverages cost-effective memory/storage mediums to save KV caches\nfor all requests. To reduce KV cache access overheads from slow mediums,\nCachedAttention employs layer-wise pre-loading and asynchronous saving schemes\nto overlap the KV cache access with the GPU computation. To ensure that the KV\ncaches to be accessed are placed in the fastest hierarchy, CachedAttention\nemploys scheduler-aware fetching and eviction schemes to consciously place the\nKV caches in different layers based on the hints from the inference job\nscheduler. To avoid the invalidation of the saved KV caches incurred by context\nwindow overflow, CachedAttention enables the saved KV caches to remain valid\nvia decoupling the positional encoding and effectively truncating the KV\ncaches. 
Extensive experimental results demonstrate that CachedAttention\nsignificantly decreases the time to the first token (TTFT) by up to 87%,\nimproves the prompt prefilling throughput by up to 7.8$\\times$ for multi-turn\nconversations, and reduces the end-to-end inference cost by up to 70%.\n","authors":["Bin Gao","Zhuomin He","Puru Sharma","Qingxuan Kang","Djordje Jevdjic","Junbo Deng","Xingkun Yang","Zhou Yu","Pengfei Zuo"],"pdf_url":"https://arxiv.org/pdf/2403.19708v3.pdf","comment":"Accepted to USENIX Annual Technical Conference (ATC) 2024"},{"id":"http://arxiv.org/abs/2208.00287v4","updated":"2024-06-30T22:46:54Z","published":"2022-07-30T18:29:11Z","title":"Simplex Clustering via sBeta with Applications to Online Adjustment of\n Black-Box Predictions","summary":" We explore clustering the softmax predictions of deep neural networks and\nintroduce a novel probabilistic clustering method, referred to as k-sBetas. In\nthe general context of clustering discrete distributions, the existing methods\nfocused on exploring distortion measures tailored to simplex data, such as the\nKL divergence, as alternatives to the standard Euclidean distance. We provide a\ngeneral maximum a posteriori (MAP) perspective of clustering distributions,\nemphasizing that the statistical models underlying the existing\ndistortion-based methods may not be descriptive enough. Instead, we optimize a\nmixed-variable objective measuring data conformity within each cluster to the\nintroduced sBeta density function, whose parameters are constrained and\nestimated jointly with binary assignment variables. Our versatile formulation\napproximates various parametric densities for modeling simplex data and enables\nthe control of the cluster-balance bias. This yields highly competitive\nperformances for the unsupervised adjustment of black-box model predictions in\nvarious scenarios. Our code and comparisons with the existing\nsimplex-clustering approaches and our introduced softmax-prediction benchmarks\nare publicly available:\nhttps://github.com/fchiaroni/Clustering_Softmax_Predictions.\n","authors":["Florent Chiaroni","Malik Boudiaf","Amar Mitiche","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2208.00287v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12023v3","updated":"2024-06-30T22:43:35Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. 
Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04715v2","updated":"2024-06-30T21:37:56Z","published":"2024-05-07T23:37:40Z","title":"Causality Pursuit from Heterogeneous Environments via Neural Adversarial\n Invariance Learning","summary":" Pursuing causality from data is a fundamental problem in scientific\ndiscovery, treatment intervention, and transfer learning. This paper introduces\na novel algorithmic method for addressing nonparametric invariance and\ncausality learning in regression models across multiple environments, where the\njoint distribution of response variables and covariates varies, but the\nconditional expectations of outcome given an unknown set of quasi-causal\nvariables are invariant. The challenge of finding such an unknown set of\nquasi-causal or invariant variables is compounded by the presence of endogenous\nvariables that have heterogeneous effects across different environments;\nincluding even one of them in the regression would make the estimation\ninconsistent. The proposed Focused Adversarial Invariant Regularization (FAIR)\nframework utilizes an innovative minimax optimization approach that breaks down\nthe barriers, driving regression models toward prediction-invariant solutions\nthrough adversarial testing. Leveraging the representation power of neural\nnetworks, FAIR neural networks (FAIR-NN) are introduced for causality pursuit.\nIt is shown that FAIR-NN can find the invariant variables and quasi-causal\nvariables under a minimal identification condition and that the resulting\nprocedure is adaptive to low-dimensional composition structures in a\nnon-asymptotic analysis. Under a structural causal model, variables identified\nby FAIR-NN represent pragmatic causality and provably align with exact causal\nmechanisms under conditions of sufficient heterogeneity. Computationally,\nFAIR-NN employs a novel Gumbel approximation with decreased temperature and a\nstochastic gradient descent-ascent algorithm. The procedures are convincingly\ndemonstrated using simulated and real-data examples.\n","authors":["Yihong Gu","Cong Fang","Peter Bühlmann","Jianqing Fan"],"pdf_url":"https://arxiv.org/pdf/2405.04715v2.pdf","comment":"48 pages, 7 figures with appendix"},{"id":"http://arxiv.org/abs/2305.19476v2","updated":"2024-06-30T19:27:07Z","published":"2023-05-31T01:09:28Z","title":"Accelerating Reinforcement Learning with Value-Conditional State Entropy\n Exploration","summary":" A promising technique for exploration is to maximize the entropy of the visited\nstate distribution, i.e., state entropy, by encouraging uniform coverage of\nthe visited state space. While it has been effective for an unsupervised setup, it\ntends to struggle in a supervised setup with a task reward, where an agent\nprefers to visit high-value states to exploit the task reward. 
Such a\npreference can cause an imbalance between the distributions of high-value\nstates and low-value states, which biases exploration towards low-value state\nregions as a result of the state entropy increasing when the distribution\nbecomes more uniform. This issue is exacerbated when high-value states are\nnarrowly distributed within the state space, making it difficult for the agent\nto complete the tasks. In this paper, we present a novel exploration technique\nthat maximizes the value-conditional state entropy, which separately estimates\nthe state entropies that are conditioned on the value estimates of each state,\nthen maximizes their average. By only considering the visited states with\nsimilar value estimates for computing the intrinsic bonus, our method prevents\nthe distribution of low-value states from affecting exploration around\nhigh-value states, and vice versa. We demonstrate that the proposed alternative\nto the state entropy baseline significantly accelerates various reinforcement\nlearning algorithms across a variety of tasks within MiniGrid, DeepMind Control\nSuite, and Meta-World benchmarks. Source code is available at\nhttps://sites.google.com/view/rl-vcse.\n","authors":["Dongyoung Kim","Jinwoo Shin","Pieter Abbeel","Younggyo Seo"],"pdf_url":"https://arxiv.org/pdf/2305.19476v2.pdf","comment":"NeurIPS 2024. Project webpage: https://sites.google.com/view/rl-vcse"},{"id":"http://arxiv.org/abs/2311.04886v2","updated":"2024-06-30T18:53:22Z","published":"2023-11-08T18:46:32Z","title":"SEMQA: Semi-Extractive Multi-Source Question Answering","summary":" Recently proposed long-form question answering (QA) systems, supported by\nlarge language models (LLMs), have shown promising capabilities. Yet,\nattributing and verifying their generated abstractive answers can be difficult,\nand automatically evaluating their accuracy remains an ongoing challenge.\n In this work, we introduce a new QA task for answering multi-answer questions\nby summarizing multiple diverse sources in a semi-extractive fashion.\nSpecifically, Semi-extractive Multi-source QA (SEMQA) requires models to output\na comprehensive answer, while mixing factual quoted spans -- copied verbatim\nfrom given input sources -- and non-factual free-text connectors that glue\nthese spans together into a single cohesive passage. This setting bridges the\ngap between the outputs of well-grounded but constrained extractive QA systems\nand more fluent but harder to attribute fully abstractive answers.\nParticularly, it enables a new mode for language models that leverages their\nadvanced language generation capabilities, while also producing fine in-line\nattributions by-design that are easy to verify, interpret, and evaluate.\n To study this task, we create the first dataset of this kind, QuoteSum, with\nhuman-written semi-extractive answers to natural and generated questions, and\ndefine text-based evaluation metrics. Experimenting with several LLMs in\nvarious settings, we find this task to be surprisingly challenging,\ndemonstrating the importance of QuoteSum for developing and studying such\nconsolidation capabilities.\n","authors":["Tal Schuster","Adam D. Lelkes","Haitian Sun","Jai Gupta","Jonathan Berant","William W. 
Cohen","Donald Metzler"],"pdf_url":"https://arxiv.org/pdf/2311.04886v2.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2404.01216v2","updated":"2024-06-30T18:40:10Z","published":"2024-04-01T16:16:19Z","title":"Novel Node Category Detection Under Subpopulation Shift","summary":" In real-world graph data, distribution shifts can manifest in various ways,\nsuch as the emergence of new categories and changes in the relative proportions\nof existing categories. It is often important to detect nodes of novel\ncategories under such distribution shifts for safety or insight discovery\npurposes. We introduce a new approach, Recall-Constrained Optimization with\nSelective Link Prediction (RECO-SLIP), to detect nodes belonging to novel\ncategories in attributed graphs under subpopulation shifts. By integrating a\nrecall-constrained learning framework with a sample-efficient link prediction\nmechanism, RECO-SLIP addresses the dual challenges of resilience against\nsubpopulation shifts and the effective exploitation of graph structure. Our\nextensive empirical evaluation across multiple graph datasets demonstrates the\nsuperior performance of RECO-SLIP over existing methods. The experimental code\nis available at https://github.com/hsinghuan/novel-node-category-detection.\n","authors":["Hsing-Huan Chung","Shravan Chaudhari","Yoav Wald","Xing Han","Joydeep Ghosh"],"pdf_url":"https://arxiv.org/pdf/2404.01216v2.pdf","comment":"Accepted to ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2405.18723v2","updated":"2024-06-30T17:01:51Z","published":"2024-05-29T03:08:30Z","title":"Conformal Depression Prediction","summary":" While existing depression prediction methods based on deep learning show\npromise, their practical application is hindered by the lack of\ntrustworthiness, as these deep models are often deployed as \\textit{black box}\nmodels, leaving us uncertain about the confidence of the model predictions. For\nhigh-risk clinical applications like depression prediction, uncertainty\nquantification is essential in decision-making. In this paper, we introduce\nconformal depression prediction (CDP), a depression prediction method with\nuncertainty quantification based on conformal prediction (CP), giving valid\nconfidence intervals with theoretical coverage guarantees for the model\npredictions. CDP is a plug-and-play module that requires neither model\nretraining nor an assumption about the depression data distribution. As CDP\nprovides only an average coverage guarantee across all inputs rather than\nper-input performance guarantee, we further propose CDP-ACC, an improved\nconformal prediction with approximate conditional coverage. CDP-ACC firstly\nestimates the prediction distribution through neighborhood relaxation, and then\nintroduces a conformal score function by constructing nested sequences, so as\nto provide a tighter prediction interval for each specific input. 
We\nempirically demonstrate the application of CDP in uncertainty-aware depression\nprediction, as well as the effectiveness and superiority of CDP-ACC on the AVEC\n2013 and AVEC 2014 datasets.\n","authors":["Yonghong Li","Shan Qu","Xiuzhuang Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.18723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03220v3","updated":"2024-06-30T17:01:45Z","published":"2024-02-05T17:30:42Z","title":"The Benefits of Reusing Batches for Gradient Descent in Two-Layer\n Networks: Breaking the Curse of Information and Leap Exponents","summary":" We investigate the training dynamics of two-layer neural networks when\nlearning multi-index target functions. We focus on multi-pass gradient descent\n(GD) that reuses the batches multiple times and show that it significantly\nchanges the conclusion about which functions are learnable compared to\nsingle-pass gradient descent. In particular, multi-pass GD with finite stepsize\nis found to overcome the limitations of gradient flow and single-pass GD given\nby the information exponent (Ben Arous et al., 2021) and leap exponent (Abbe et\nal., 2023) of the target function. We show that upon re-using batches, the\nnetwork achieves in just two time steps an overlap with the target subspace\neven for functions not satisfying the staircase property (Abbe et al., 2021).\nWe characterize the (broad) class of functions efficiently learned in finite\ntime. The proof of our results is based on the analysis of the Dynamical\nMean-Field Theory (DMFT). We further provide a closed-form description of the\ndynamical process of the low-dimensional projections of the weights, and\nnumerical experiments illustrating the theory.\n","authors":["Yatin Dandi","Emanuele Troiani","Luca Arnaboldi","Luca Pesce","Lenka Zdeborová","Florent Krzakala"],"pdf_url":"https://arxiv.org/pdf/2402.03220v3.pdf","comment":"Accepted at the International Conference on Machine Learning (ICML),\n 2024"},{"id":"http://arxiv.org/abs/2405.03180v2","updated":"2024-06-30T16:18:30Z","published":"2024-05-06T06:05:41Z","title":"Braced Fourier Continuation and Regression for Anomaly Detection","summary":" In this work, the concept of Braced Fourier Continuation and Regression\n(BFCR) is introduced. BFCR is a novel and computationally efficient means of\nfinding nonlinear regressions or trend lines in arbitrary one-dimensional data\nsets. 
The Braced Fourier Continuation (BFC) and BFCR algorithms are first\noutlined, followed by a discussion of the properties of BFCR as well as\ndemonstrations of how BFCR trend lines may be used effectively for anomaly\ndetection both within and at the edges of arbitrary one-dimensional data sets.\nFinally, potential issues which may arise while using BFCR for anomaly\ndetection as well as possible mitigation techniques are outlined and discussed.\nAll source code and example data sets are either referenced or available via\nGitHub, and all associated code is written entirely in Python.\n","authors":["Josef Sabuda"],"pdf_url":"https://arxiv.org/pdf/2405.03180v2.pdf","comment":"16 pages, 9 figures, associated Github link:\n https://github.com/j4sabuda/Braced-Fourier-Continuation-and-Regression\n -6/30/2024 update corrected and reworded erroneous figure references, minor\n typos"},{"id":"http://arxiv.org/abs/2210.13193v3","updated":"2024-06-30T16:07:10Z","published":"2022-10-24T13:10:06Z","title":"Langevin dynamics based algorithm e-TH$\\varepsilon$O POULA for\n stochastic optimization problems with discontinuous stochastic gradient","summary":" We introduce a new Langevin dynamics based algorithm, called\ne-TH$\\varepsilon$O POULA, to solve optimization problems with discontinuous\nstochastic gradients which naturally appear in real-world applications such as\nquantile estimation, vector quantization, CVaR minimization, and regularized\noptimization problems involving ReLU neural networks. We demonstrate both\ntheoretically and numerically the applicability of the e-TH$\\varepsilon$O POULA\nalgorithm. More precisely, under the conditions that the stochastic gradient is\nlocally Lipschitz in average and satisfies a certain convexity at infinity\ncondition, we establish non-asymptotic error bounds for e-TH$\\varepsilon$O\nPOULA in Wasserstein distances and provide a non-asymptotic estimate for the\nexpected excess risk, which can be controlled to be arbitrarily small. Three\nkey applications in finance and insurance are provided, namely, multi-period\nportfolio optimization, transfer learning in multi-period portfolio\noptimization, and insurance claim prediction, which involve neural networks\nwith (Leaky)-ReLU activation functions. Numerical experiments conducted using\nreal-world datasets illustrate the superior empirical performance of\ne-TH$\\varepsilon$O POULA compared to SGLD, TUSLA, ADAM, and AMSGrad in terms of\nmodel accuracy.\n","authors":["Dong-Young Lim","Ariel Neufeld","Sotirios Sabanis","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.13193v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07203v2","updated":"2024-06-30T15:12:45Z","published":"2023-04-14T15:29:20Z","title":"On the convergence of nonlinear averaging dynamics with three-body\n interactions on hypergraphs","summary":" Complex networked systems in fields such as physics, biology, and social\nsciences often involve interactions that extend beyond simple pairwise ones.\nHypergraphs serve as powerful modeling tools for describing and analyzing the\nintricate behaviors of systems with multi-body interactions. Herein, we\ninvestigate a discrete-time nonlinear averaging dynamics with three-body\ninteractions: an underlying hypergraph, comprising triples as hyperedges,\ndelineates the structure of these interactions, while the vertices update their\nstates through a weighted, state-dependent average of neighboring pairs'\nstates. 
This dynamics captures reinforcing group effects, such as peer\npressure, and exhibits higher-order dynamical effects resulting from a complex\ninterplay between initial states, hypergraph topology, and nonlinearity of the\nupdate. Differently from linear averaging dynamics on graphs with two-body\ninteractions, this model does not converge to the average of the initial states\nbut rather induces a shift. By assuming random initial states and by making\nsome regularity and density assumptions on the hypergraph, we prove that the\ndynamics converges to a multiplicatively-shifted average of the initial states,\nwith high probability. We further characterize the shift as a function of two\nparameters describing the initial state and interaction strength, as well as\nthe convergence time as a function of the hypergraph structure.\n","authors":["Emilio Cruciani","Emanuela L. Giacomelli","Jinyeop Lee"],"pdf_url":"https://arxiv.org/pdf/2304.07203v2.pdf","comment":"To appear in SIAM Journal on Applied Dynamical Systems"},{"id":"http://arxiv.org/abs/2406.16858v2","updated":"2024-06-30T15:03:25Z","published":"2024-06-24T17:59:11Z","title":"EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees","summary":" Inference with modern Large Language Models (LLMs) is expensive and\ntime-consuming, and speculative sampling has proven to be an effective\nsolution. Most speculative sampling methods such as EAGLE use a static draft\ntree, implicitly assuming that the acceptance rate of draft tokens depends only\non their position. Interestingly, we found that the acceptance rate of draft\ntokens is also context-dependent. In this paper, building upon EAGLE, we\npropose EAGLE-2, which introduces a new technique of context-aware dynamic\ndraft tree into drafting modeling. This improvement leverages the fact that the\ndraft model of EAGLE is well-calibrated: the confidence scores from the draft\nmodel approximate acceptance rates with small errors. We conducted extensive\nevaluations on three series of LLMs and six tasks, with EAGLE-2 achieving\nspeedup ratios 3.05x-4.26x, which is 20%-40% faster than EAGLE-1. EAGLE-2 also\nensures that the distribution of the generated text remains unchanged, making\nit a lossless acceleration algorithm.\n","authors":["Yuhui Li","Fangyun Wei","Chao Zhang","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15649v2","updated":"2024-06-30T14:28:38Z","published":"2023-11-27T09:20:23Z","title":"RoboGPT: an intelligent agent of making embodied long-term decisions for\n daily instruction tasks","summary":" Robotic agents must master common sense and long-term sequential decisions to\nsolve daily tasks through natural language instruction. The developments in\nLarge Language Models (LLMs) in natural language processing have inspired\nefforts to use LLMs in complex robot planning. Despite LLMs' great\ngeneralization and comprehension of instruction tasks, LLMs-generated task\nplans sometimes lack feasibility and correctness. To address the problem, we\npropose a RoboGPT agent\\footnote{our code and dataset will be released soon}\nfor making embodied long-term decisions for daily tasks, with two modules: 1)\nLLMs-based planning with re-plan to break the task into multiple sub-goals; 2)\nRoboSkill individually designed for sub-goals to learn better navigation and\nmanipulation skills. The LLMs-based planning is enhanced with a new robotic\ndataset and re-plan, called RoboGPT. 
The new robotic dataset of 67k daily\ninstruction tasks is gathered for fine-tuning the Llama model and obtaining\nRoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily\ninstruction tasks. Additionally, a low-computational Re-Plan module is designed\nto allow plans to flexibly adapt to the environment, thereby addressing the\nnomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA\nmethods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA\nLLM-based planners like ChatGPT in task-planning rationality for hundreds of\nunseen daily tasks, and even other domain tasks, while keeping the large\nmodel's original broad application and generality.\n","authors":["Yaran Chen","Wenbo Cui","Yuanwen Chen","Mining Tan","Xinyao Zhang","Dongbin Zhao","He Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15649v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04924v3","updated":"2024-06-30T13:52:18Z","published":"2024-02-07T14:49:10Z","title":"Two Trades is not Baffled: Condensing Graph via Crafting Rational\n Gradient Matching","summary":" Training on large-scale graphs has achieved remarkable results in graph\nrepresentation learning, but its cost and storage have raised growing concerns.\nAs one of the most promising directions, graph condensation methods address\nthese issues by employing gradient matching, aiming to condense the full graph\ninto a more concise yet information-rich synthetic set. Though encouraging,\nthese strategies primarily emphasize matching directions of the gradients,\nwhich leads to deviations in the training trajectories. Such deviations are\nfurther magnified by the differences between the condensation and evaluation\nphases, culminating in accumulated errors, which detrimentally affect the\nperformance of the condensed graphs. In light of this, we propose a novel graph\ncondensation method named \\textbf{C}raf\\textbf{T}ing \\textbf{R}ationa\\textbf{L}\ntrajectory (\\textbf{CTRL}), which offers an optimized starting point closer to\nthe original dataset's feature distribution and a more refined strategy for\ngradient matching. Theoretically, CTRL can effectively neutralize the impact of\naccumulated errors on the performance of condensed graphs. We provide extensive\nexperiments on various graph datasets and downstream tasks to support the\neffectiveness of CTRL. Code is released at\nhttps://github.com/NUS-HPC-AI-Lab/CTRL.\n","authors":["Tianle Zhang","Yuchen Zhang","Kun Wang","Kai Wang","Beining Yang","Kaipeng Zhang","Wenqi Shao","Ping Liu","Joey Tianyi Zhou","Yang You"],"pdf_url":"https://arxiv.org/pdf/2402.04924v3.pdf","comment":"An effective method for graph condensation"},{"id":"http://arxiv.org/abs/2312.12112v3","updated":"2024-06-30T12:48:18Z","published":"2023-12-19T12:34:46Z","title":"Curated LLM: Synergy of LLMs and Data Curation for tabular augmentation\n in low-data regimes","summary":" Machine Learning (ML) in low-data settings remains an underappreciated yet\ncrucial problem. Hence, data augmentation methods to increase the sample size\nof datasets needed for ML are key to unlocking the transformative potential of\nML in data-deprived regions and domains. Unfortunately, the limited training\nset constrains traditional tabular synthetic data generators in their ability\nto generate a large and diverse augmented dataset needed for ML tasks. 
To\naddress this challenge, we introduce CLLM, which leverages the prior knowledge\nof Large Language Models (LLMs) for data augmentation in the low-data regime.\nHowever, not all the data generated by LLMs will improve downstream utility, as\nfor any generative model. Consequently, we introduce a principled curation\nmechanism, leveraging learning dynamics, coupled with confidence and\nuncertainty metrics, to obtain a high-quality dataset. Empirically, on multiple\nreal-world datasets, we demonstrate the superior performance of CLLM in the\nlow-data regime compared to conventional generators. Additionally, we provide\ninsights into the LLM generation and curation mechanism, shedding light on the\nfeatures that enable them to output high-quality augmented datasets.\n","authors":["Nabeel Seedat","Nicolas Huynh","Boris van Breugel","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2312.12112v3.pdf","comment":"Presented at the 41st International Conference on Machine Learning\n (ICML) 2024. *Seedat & Huynh contributed equally"},{"id":"http://arxiv.org/abs/2406.00873v2","updated":"2024-06-30T12:12:23Z","published":"2024-06-02T21:40:13Z","title":"Scaffold Splits Overestimate Virtual Screening Performance","summary":" Virtual Screening (VS) of vast compound libraries guided by Artificial\nIntelligence (AI) models is a highly productive approach to early drug\ndiscovery. Data splitting is crucial for better benchmarking of such AI models.\nTraditional random data splits produce similar molecules between training and\ntest sets, conflicting with the reality of VS libraries which mostly contain\nstructurally distinct compounds. Scaffold split, grouping molecules by shared\ncore structure, is widely considered to reflect this real-world scenario.\nHowever, here we show that the scaffold split also overestimates VS\nperformance. The reason is that molecules with different chemical scaffolds are\noften similar, which hence introduces unrealistically high similarities between\ntraining molecules and test molecules following a scaffold split. Our study\nexamined three representative AI models on 60 NCI-60 datasets, each with\napproximately 30,000 to 50,000 molecules tested on a different cancer cell\nline. Each dataset was split with three methods: scaffold, Butina clustering\nand the more accurate Uniform Manifold Approximation and Projection (UMAP)\nclustering. Regardless of the model, model performance is much worse with UMAP\nsplits from the results of the 2100 models trained and evaluated for each\nalgorithm and split. These robust results demonstrate the need for more\nrealistic data splits to tune, compare, and select models for VS. For the same\nreason, avoiding the scaffold split is also recommended for other molecular\nproperty prediction problems. The code to reproduce these results is available\nat https://github.com/ScaffoldSplitsOverestimateVS\n","authors":["Qianrong Guo","Saiveth Hernandez-Hernandez","Pedro J Ballester"],"pdf_url":"https://arxiv.org/pdf/2406.00873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06330v2","updated":"2024-06-30T11:19:59Z","published":"2024-05-10T09:03:12Z","title":"Interpretable Multi-task Learning with Shared Variable Embeddings","summary":" This paper proposes a general interpretable predictive system with shared\ninformation. 
The system is able to perform predictions in a multi-task setting\nwhere distinct tasks are not bound to have the same input/output structure.\nEmbeddings of input and output variables in a common space are obtained, where\nthe input embeddings are produced through attending to a set of shared\nembeddings, reused across tasks. All the embeddings are treated as model\nparameters and learned. Specific restrictions on the space of shared embeddings\nand the sparsity of the attention mechanism are considered. Experiments show\nthat the introduction of shared embeddings does not deteriorate the results\nobtained from a vanilla variable embeddings method. We run a number of further\nablations. Inducing sparsity in the attention mechanism leads to both an\nincrease in accuracy and a significant decrease in the number of training steps\nrequired. Shared embeddings provide a measure of interpretability in terms of\nboth a qualitative assessment and the ability to map specific shared embeddings\nto pre-defined concepts that are not tailored to the considered model. There\nseems to be a trade-off between accuracy and interpretability. The basic shared\nembeddings method favors interpretability, whereas the sparse attention method\npromotes accuracy. The results lead to the conclusion that variable embedding\nmethods may be extended with shared information to provide increased\ninterpretability and accuracy.\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2405.06330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11566v3","updated":"2024-06-30T10:13:42Z","published":"2023-10-17T20:25:40Z","title":"Partially Observable Stochastic Games with Neural Perception Mechanisms","summary":" Stochastic games are a well-established model for multi-agent sequential\ndecision making under uncertainty. In practical applications, though, agents\noften have only partial observability of their environment. Furthermore, agents\nincreasingly perceive their environment using data-driven approaches such as\nneural networks trained on continuous data. We propose the model of\nneuro-symbolic partially-observable stochastic games (NS-POSGs), a variant of\ncontinuous-space concurrent stochastic games that explicitly incorporates\nneural perception mechanisms. We focus on a one-sided setting with a\npartially-informed agent using discrete, data-driven observations and another,\nfully-informed agent. We present a new method, called one-sided NS-HSVI, for\nthe approximate solution of one-sided NS-POSGs, which exploits the piecewise\nconstant structure of the model. Using neural network pre-image analysis to\nconstruct finite polyhedral representations and particle-based representations\nfor beliefs, we implement our approach and illustrate its practical\napplicability to the analysis of pedestrian-vehicle and pursuit-evasion\nscenarios.\n","authors":["Rui Yan","Gabriel Santos","Gethin Norman","David Parker","Marta Kwiatkowska"],"pdf_url":"https://arxiv.org/pdf/2310.11566v3.pdf","comment":"42 pages, 6 figures. Extended version of paper to be published in FM\n 2024"},{"id":"http://arxiv.org/abs/2403.16986v2","updated":"2024-06-30T10:03:09Z","published":"2024-03-25T17:48:06Z","title":"Dynamic Relative Representations for Goal-Oriented Semantic\n Communications","summary":" In future 6G wireless networks, semantic and effectiveness aspects of\ncommunications will play a fundamental role, incorporating meaning and\nrelevance into transmissions. 
However, obstacles arise when devices employ\ndiverse languages, logic, or internal representations, leading to semantic\nmismatches that might jeopardize understanding. In latent space communication,\nthis challenge manifests as misalignment within high-dimensional\nrepresentations where deep neural networks encode data. This paper presents a\nnovel framework for goal-oriented semantic communication, leveraging relative\nrepresentations to mitigate semantic mismatches via latent space alignment. We\npropose a dynamic optimization strategy that adapts relative representations,\ncommunication parameters, and computation resources for energy-efficient,\nlow-latency, goal-oriented semantic communications. Numerical results\ndemonstrate our methodology's effectiveness in mitigating mismatches among\ndevices, while optimizing energy consumption, delay, and effectiveness.\n","authors":["Simone Fiorellino","Claudio Battiloro","Emilio Calvanese Strinati","Paolo Di Lorenzo"],"pdf_url":"https://arxiv.org/pdf/2403.16986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08364v3","updated":"2024-06-30T09:50:11Z","published":"2023-11-14T18:14:56Z","title":"Plum: Prompt Learning using Metaheuristic","summary":" Since the emergence of large language models, prompt learning has become a\npopular method for optimizing and customizing these models. Special prompts,\nsuch as Chain-of-Thought, have even revealed previously unknown reasoning\ncapabilities within these models. However, the progress of discovering\neffective prompts has been slow, driving a desire for general prompt\noptimization methods. Unfortunately, few existing prompt learning methods\nsatisfy the criteria of being truly \"general\", i.e., automatic, discrete,\nblack-box, gradient-free, and interpretable all at once. In this paper, we\nintroduce metaheuristics, a branch of discrete non-convex optimization methods\nwith over 100 options, as a promising approach to prompt learning. Within our\nparadigm, we test six typical methods: hill climbing, simulated annealing,\ngenetic algorithms with/without crossover, tabu search, and harmony search,\ndemonstrating their effectiveness in white-box and black-box prompt learning.\nFurthermore, we show that these methods can be used to discover more\nhuman-understandable prompts that were previously unknown in both reasoning and\nimage generation tasks, opening the door to a cornucopia of possibilities in\nprompt optimization. We release all the codes in\n\\url{https://github.com/research4pan/Plum}.\n","authors":["Rui Pan","Shuo Xing","Shizhe Diao","Wenhe Sun","Xiang Liu","Kashun Shum","Renjie Pi","Jipeng Zhang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.08364v3.pdf","comment":"Published at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2402.02956v4","updated":"2024-06-30T09:08:55Z","published":"2024-02-05T12:34:03Z","title":"AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a\n Single High-Resolution Image","summary":" The process of estimating and counting tree density using only a single\naerial or satellite image is a difficult task in the fields of photogrammetry\nand remote sensing. However, it plays a crucial role in the management of\nforests. The huge variety of trees in varied topography severely hinders tree\ncounting models to perform well. The purpose of this paper is to propose a\nframework that is learnt from the source domain with sufficient labeled trees\nand is adapted to the target domain with only a limited number of labeled\ntrees. 
Our method, termed AdaTreeFormer, contains one shared encoder with a\nhierarchical feature extraction scheme to extract robust features from the\nsource and target domains. It also consists of three subnets: two for\nextracting self-domain attention maps from the source and target domains,\nrespectively, and one for extracting cross-domain attention maps. For the\nlatter, an attention-to-adapt mechanism is introduced to distill relevant\ninformation from different domains while generating tree density maps; a\nhierarchical cross-domain feature alignment scheme is proposed that\nprogressively aligns the features from the source and target domains. We also\nadopt adversarial learning into the framework to further reduce the gap between\nthe source and target domains. Our AdaTreeFormer is evaluated on six designed\ndomain adaptation tasks using three tree counting datasets, i.e., Jiangsu,\nYosemite, and London. Experimental results show that AdaTreeFormer\nsignificantly surpasses the state of the art, e.g., in the cross-domain setting from the\nYosemite to the Jiangsu dataset, it achieves a reduction of 15.9 points in terms of\nthe absolute counting errors and an increase of 10.8\% in the accuracy of the\ndetected trees' locations. The codes and datasets are available at\nhttps://github.com/HAAClassic/AdaTreeFormer.\n","authors":["Hamed Amini Amirkolaee","Miaojing Shi","Lianghua He","Mark Mulligan"],"pdf_url":"https://arxiv.org/pdf/2402.02956v4.pdf","comment":"Accepted in ISPRS Journal of Photogrammetry and Remote Sensing"},{"id":"http://arxiv.org/abs/2401.09870v2","updated":"2024-06-30T09:02:37Z","published":"2024-01-18T10:33:30Z","title":"Reconciling Spatial and Temporal Abstractions for Goal Representation","summary":" Goal representation affects the performance of Hierarchical Reinforcement\nLearning (HRL) algorithms by decomposing the complex learning problem into\neasier subtasks. Recent studies show that representations that preserve\ntemporally abstract environment dynamics are successful in solving difficult\nproblems and provide theoretical guarantees for optimality. These methods,\nhowever, cannot scale to tasks where environment dynamics increase in complexity,\ni.e., the temporally abstract transition relations depend on a larger number of\nvariables. On the other hand, other efforts have tried to use spatial\nabstraction to mitigate the previous issues. Their limitations include\nscalability to high-dimensional environments and dependency on prior knowledge.\n In this paper, we propose a novel three-layer HRL algorithm that introduces,\nat different levels of the hierarchy, both a spatial and a temporal goal\nabstraction. We provide a theoretical study of the regret bounds of the learned\npolicies. We evaluate the approach on complex continuous control tasks,\ndemonstrating the effectiveness of spatial and temporal abstractions learned by\nthis approach. Find open-source code at https://github.com/cosynus-lix/STAR.\n","authors":["Mehdi Zadem","Sergio Mover","Sao Mai Nguyen"],"pdf_url":"https://arxiv.org/pdf/2401.09870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04062v9","updated":"2024-06-30T08:59:02Z","published":"2023-02-08T13:59:31Z","title":"Machine Learning for Synthetic Data Generation: A Review","summary":" Machine learning heavily relies on data, but real-world applications often\nencounter various data-related issues. 
These include data of poor quality,\ninsufficient data points leading to under-fitting of machine learning models,\nand difficulties in data access due to concerns surrounding privacy, safety,\nand regulations. In light of these challenges, the concept of synthetic data\ngeneration emerges as a promising alternative that allows for data sharing and\nutilization in ways that real-world data cannot facilitate. This paper presents\na comprehensive systematic review of existing studies that employ machine\nlearning models for the purpose of generating synthetic data. The review\nencompasses various perspectives, starting with the applications of synthetic\ndata generation, spanning computer vision, speech, natural language processing,\nhealthcare, and business domains. Additionally, it explores different machine\nlearning methods, with particular emphasis on neural network architectures and\ndeep generative models. The paper also addresses the crucial aspects of privacy\nand fairness concerns related to synthetic data generation. Furthermore, this\nstudy identifies the challenges and opportunities prevalent in this emerging\nfield, shedding light on the potential avenues for future research. By delving\ninto the intricacies of synthetic data generation, this paper aims to\ncontribute to the advancement of knowledge and inspire further exploration in\nsynthetic data generation.\n","authors":["Yingzhou Lu","Minjie Shen","Huazheng Wang","Xiao Wang","Capucine van Rechem","Tianfan Fu","Wenqi Wei"],"pdf_url":"https://arxiv.org/pdf/2302.04062v9.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03228v3","updated":"2024-06-30T08:29:48Z","published":"2023-02-07T03:21:55Z","title":"Heterophily-Aware Graph Attention Network","summary":" Graph Neural Networks (GNNs) have shown remarkable success in graph\nrepresentation learning. Unfortunately, current weight assignment schemes in\nstandard GNNs, such as the calculation based on node degrees or pair-wise\nrepresentations, can hardly be effective in processing the networks with\nheterophily, in which the connected nodes usually possess different labels or\nfeatures. Existing heterophilic GNNs tend to ignore the modeling of heterophily\nof each edge, which is also a vital part in tackling the heterophily problem.\nIn this paper, we firstly propose a heterophily-aware attention scheme and\nreveal the benefits of modeling the edge heterophily, i.e., if a GNN assigns\ndifferent weights to edges according to different heterophilic types, it can\nlearn effective local attention patterns, which enable nodes to acquire\nappropriate information from distinct neighbors. Then, we propose a novel\nHeterophily-Aware Graph Attention Network (HA-GAT) by fully exploring and\nutilizing the local distribution as the underlying heterophily, to handle the\nnetworks with different homophily ratios. To demonstrate the effectiveness of\nthe proposed HA-GAT, we analyze the proposed heterophily-aware attention scheme\nand local distribution exploration, by seeking for an interpretation from their\nmechanism. 
Extensive results demonstrate that our HA-GAT achieves\nstate-of-the-art performances on eight datasets with different homophily ratios\nin both the supervised and semi-supervised node classification tasks.\n","authors":["Junfu Wang","Yuanfang Guo","Liang Yang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2302.03228v3.pdf","comment":"Accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2405.01067v2","updated":"2024-06-30T08:06:33Z","published":"2024-05-02T07:49:28Z","title":"AB-Training: A Communication-Efficient Approach for Distributed Low-Rank\n Learning","summary":" Communication bottlenecks severely hinder the scalability of distributed\nneural network training, particularly in high-performance computing (HPC)\nenvironments. We introduce AB-training, a novel data-parallel method that\nleverages low-rank representations and independent training groups to\nsignificantly reduce communication overhead. Our experiments demonstrate an\naverage reduction in network traffic of approximately 70.31\\% across various\nscaling scenarios, increasing the training potential of\ncommunication-constrained systems and accelerating convergence at scale.\nAB-training also exhibits a pronounced regularization effect at smaller scales,\nleading to improved generalization while maintaining or even reducing training\ntime. We achieve a remarkable 44.14 : 1 compression ratio on VGG16 trained on\nCIFAR-10 with minimal accuracy loss, and outperform traditional data parallel\ntraining by 1.55\\% on ResNet-50 trained on ImageNet-2012. While AB-training is\npromising, our findings also reveal that large batch effects persist even in\nlow-rank regimes, underscoring the need for further research into optimized\nupdate mechanisms for massively distributed training.\n","authors":["Daniel Coquelin","Katherina Flügel","Marie Weiel","Nicholas Kiefer","Muhammed Öz","Charlotte Debus","Achim Streit","Markus Götz"],"pdf_url":"https://arxiv.org/pdf/2405.01067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19057v2","updated":"2024-06-30T07:54:30Z","published":"2024-06-27T10:08:29Z","title":"Segment Anything Model for automated image data annotation: empirical\n studies using text prompts from Grounding DINO","summary":" Grounding DINO and the Segment Anything Model (SAM) have achieved impressive\nperformance in zero-shot object detection and image segmentation, respectively.\nTogether, they have a great potential to revolutionize applications in\nzero-shot semantic segmentation or data annotation. Yet, in specialized domains\nlike medical image segmentation, objects of interest (e.g., organs, tissues,\nand tumors) may not fall in existing class names. To address this problem, the\nreferring expression comprehension (REC) ability of Grounding DINO is leveraged\nto detect arbitrary targets by their language descriptions. However, recent\nstudies have highlighted severe limitation of the REC framework in this\napplication setting owing to its tendency to make false positive predictions\nwhen the target is absent in the given image. 
And, while this bottleneck is\ncentral to the prospect of open-set semantic segmentation, it is still largely\nunknown how much improvement can be achieved by studying the prediction errors.\nTo this end, we perform empirical studies on six publicly available datasets\nacross different domains and reveal that these errors consistently follow a\npredictable pattern and can, thus, be mitigated by a simple strategy.\nSpecifically, we show that false positive detections with appreciable\nconfidence scores generally occupy large image areas and can usually be\nfiltered by their relative sizes. More importantly, we expect these\nobservations to inspire future research in improving REC-based detection and\nautomated segmentation. Meanwhile, we evaluate the performance of SAM on\nmultiple datasets from various specialized domains and report significant\nimprovements in segmentation performance and annotation time savings over\nmanual approaches.\n","authors":["Fuseini Mumuni","Alhassan Mumuni"],"pdf_url":"https://arxiv.org/pdf/2406.19057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02024v3","updated":"2024-06-30T07:44:53Z","published":"2024-06-04T07:02:59Z","title":"Verifying the Generalization of Deep Learning to Out-of-Distribution\n Domains","summary":" Deep neural networks (DNNs) play a crucial role in the field of machine\nlearning, demonstrating state-of-the-art performance across various application\ndomains. However, despite their success, DNN-based models may occasionally\nexhibit challenges with generalization, i.e., may fail to handle inputs that\nwere not encountered during training. This limitation is a significant\nchallenge when it comes to deploying deep learning for safety-critical tasks,\nas well as in real-world settings characterized by substantial variability. We\nintroduce a novel approach for harnessing DNN verification technology to\nidentify DNN-driven decision rules that exhibit robust generalization to\npreviously unencountered input domains. Our method assesses generalization\nwithin an input domain by measuring the level of agreement between\nindependently trained deep neural networks for inputs in this domain. We also\nefficiently realize our approach by using off-the-shelf DNN verification\nengines, and extensively evaluate it on both supervised and unsupervised DNN\nbenchmarks, including a deep reinforcement learning (DRL) system for Internet\ncongestion control -- demonstrating the applicability of our approach for\nreal-world settings. Moreover, our research introduces a fresh objective for\nformal verification, offering the prospect of mitigating the challenges linked\nto deploying DNN-driven systems in real-world scenarios.\n","authors":["Guy Amir","Osher Maayan","Tom Zelazny","Guy Katz","Michael Schapira"],"pdf_url":"https://arxiv.org/pdf/2406.02024v3.pdf","comment":"To appear in the Journal of Automated Reasoning (JAR), 2024. This is\n an extended version of a CAV 2023 paper, titled: \"Verifying Generalization in\n Deep Learning\""},{"id":"http://arxiv.org/abs/2312.07981v2","updated":"2024-06-30T07:28:41Z","published":"2023-12-13T08:53:37Z","title":"Time Series Diffusion Method: A Denoising Diffusion Probabilistic Model\n for Vibration Signal Generation","summary":" Diffusion models have demonstrated powerful data generation capabilities in\nvarious research fields such as image generation. 
However, in the field of\nvibration signal generation, the criteria for evaluating the quality of the\ngenerated signal are different from those of image generation and there is a\nfundamental difference between them. At present, there is no research on the\nability of diffusion models to generate vibration signals. In this paper, a Time\nSeries Diffusion Method (TSDM) is proposed for vibration signal generation,\nleveraging the foundational principles of diffusion models. The TSDM uses an\nimproved U-net architecture with attention block, ResBlock and TimeEmbedding to\neffectively segment and extract features from one-dimensional time series data.\nIt operates based on forward diffusion and reverse denoising processes for\ntime-series generation. Experimental validation is conducted using\nsingle-frequency, multi-frequency datasets, and bearing fault datasets. The\nresults show that TSDM can accurately generate the single-frequency and\nmulti-frequency features in the time series and retain the basic frequency\nfeatures for the diffusion generation results of the bearing fault series. It\nis also found that the original DDPM could not generate high quality vibration\nsignals, but the improved U-net in TSDM, which applied the combination of\nattention block and ResBlock, could effectively improve the quality of\nvibration signal generation. Finally, TSDM is applied to the small sample fault\ndiagnosis of three public bearing fault datasets, and the results show that the\naccuracy of small sample fault diagnosis of the three datasets is improved by\n32.380%, 18.355% and 9.298% at most, respectively.\n","authors":["Haiming Yi","Lei Hou","Yuhong Jin","Nasser A. Saeed","Ali Kandil","Hao Duan"],"pdf_url":"https://arxiv.org/pdf/2312.07981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13200v3","updated":"2024-06-30T04:40:35Z","published":"2024-01-24T03:03:17Z","title":"Topology-aware Embedding Memory for Continual Learning on Expanding\n Networks","summary":" Memory replay based techniques have shown great success for continual\nlearning with incrementally accumulated Euclidean data. Directly applying them\nto continually expanding networks, however, leads to the potential memory\nexplosion problem due to the need to buffer representative nodes and their\nassociated topological neighborhood structures. To this end, we systematically\nanalyze the key challenges in the memory explosion problem, and present a\ngeneral framework, \\textit{i.e.}, Parameter Decoupled Graph Neural Networks\n(PDGNNs) with Topology-aware Embedding Memory (TEM), to tackle this issue. The\nproposed framework not only reduces the memory space complexity from\n$\\mathcal{O}(nd^L)$ to $\\mathcal{O}(n)$~\\footnote{$n$: memory budget, $d$:\naverage node degree, $L$: the radius of the GNN receptive field}, but also\nfully utilizes the topological information for memory replay. Specifically,\nPDGNNs decouple trainable parameters from the computation ego-subnetwork via\n\\textit{Topology-aware Embeddings} (TEs), which compress ego-subnetworks into\ncompact vectors (\\textit{i.e.}, TEs) to reduce the memory consumption. Based on\nthis framework, we discover a unique \\textit{pseudo-training effect} in\ncontinual learning on expanding networks and this effect motivates us to\ndevelop a novel \\textit{coverage maximization sampling} strategy that can\nenhance the performance with a tight memory budget. 
Thorough empirical studies\ndemonstrate that, by tackling the memory explosion problem and incorporating\ntopological information into memory replay, PDGNNs with TEM significantly\noutperform state-of-the-art techniques, especially in the challenging\nclass-incremental setting.\n","authors":["Xikun Zhang","Dongjin Song","Yixin Chen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.13200v3.pdf","comment":"This paper has been accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2406.16255v2","updated":"2024-06-30T03:55:48Z","published":"2024-06-24T01:37:18Z","title":"Uncertainty-Aware Reward-Free Exploration with General Function\n Approximation","summary":" Mastering multiple tasks through exploration and learning in an environment\nposes a significant challenge in reinforcement learning (RL). Unsupervised RL\nhas been introduced to address this challenge by training policies with\nintrinsic rewards rather than extrinsic rewards. However, current intrinsic\nreward designs and unsupervised RL algorithms often overlook the heterogeneous\nnature of collected samples, thereby diminishing their sample efficiency. To\novercome this limitation, in this paper, we propose a reward-free RL algorithm\ncalled GFA-RFE. The key idea behind our algorithm is an uncertainty-aware\nintrinsic reward for exploring the environment and an uncertainty-weighted\nlearning process to handle heterogeneous uncertainty in different samples.\nTheoretically, we show that in order to find an $\\epsilon$-optimal policy,\nGFA-RFE needs to collect $\\tilde{O} (H^2 \\log N_{\\mathcal F} (\\epsilon)\n\\mathrm{dim} (\\mathcal F) / \\epsilon^2 )$ number of episodes, where $\\mathcal\nF$ is the value function class with covering number $N_{\\mathcal F} (\\epsilon)$\nand generalized eluder dimension $\\mathrm{dim} (\\mathcal F)$. Such a result\noutperforms all existing reward-free RL algorithms. We further implement and\nevaluate GFA-RFE across various domains and tasks in the DeepMind Control\nSuite. Experiment results show that GFA-RFE outperforms or is comparable to the\nperformance of state-of-the-art unsupervised RL algorithms.\n","authors":["Junkai Zhang","Weitong Zhang","Dongruo Zhou","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2406.16255v2.pdf","comment":"32 pages, 5 figures, 4 tables, accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2404.14651v2","updated":"2024-06-30T03:50:46Z","published":"2024-04-23T01:18:28Z","title":"Forecasting the Forced van der Pol Equation with Frequent Phase Shifts\n Using Reservoir Computing","summary":" We tested the performance of reservoir computing (RC) in predicting the\ndynamics of a certain non-autonomous dynamical system. Specifically, we\nconsidered a van der Pol oscillator subjected to periodic external force with\nfrequent phase shifts. The reservoir computer, which was trained and optimized\nwith simulation data generated for a particular phase shift, was designed to\npredict the oscillation dynamics under periodic external forces with different\nphase shifts. The results suggest that if the training data have some\ncomplexity, it is possible to quantitatively predict the oscillation dynamics\nexposed to different phase shifts. The setting of this study was motivated by\nthe problem of predicting the state of the circadian rhythm of shift workers\nand designing a better shift work schedule for each individual. 
Our results\nsuggest that RC could be exploited for such applications.\n","authors":["Sho Kuno","Hiroshi Kori"],"pdf_url":"https://arxiv.org/pdf/2404.14651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16789v2","updated":"2024-06-30T02:19:00Z","published":"2024-04-25T17:38:57Z","title":"Continual Learning of Large Language Models: A Comprehensive Survey","summary":" The recent success of large language models (LLMs) trained on static,\npre-collected, general datasets has sparked numerous research directions and\napplications. One such direction addresses the non-trivial challenge of\nintegrating pre-trained LLMs into dynamic data distributions, task structures,\nand user preferences. Pre-trained LLMs, when tailored for specific needs, often\nexperience significant performance degradation in previous knowledge domains --\na phenomenon known as \"catastrophic forgetting\". While extensively studied in\nthe continual learning (CL) community, it presents new manifestations in the\nrealm of LLMs. In this survey, we provide a comprehensive overview of the\ncurrent research progress on LLMs within the context of CL. This survey is\nstructured into four main sections: we first describe an overview of\ncontinually learning LLMs, consisting of two directions of continuity: vertical\ncontinuity (or vertical continual learning), i.e., continual adaptation from\ngeneral to specific capabilities, and horizontal continuity (or horizontal\ncontinual learning), i.e., continual adaptation across time and domains\n(Section 3). We then summarize three stages of learning LLMs in the context of\nmodern CL: Continual Pre-Training (CPT), Domain-Adaptive Pre-training (DAP),\nand Continual Fine-Tuning (CFT) (Section 4). Then we provide an overview of\nevaluation protocols for continual learning with LLMs, along with the current\navailable data sources (Section 5). Finally, we discuss intriguing questions\npertaining to continual learning for LLMs (Section 6). The full list of papers\nexamined in this survey is available at\nhttps://github.com/Wang-ML-Lab/llm-continual-learning-survey.\n","authors":["Haizhou Shi","Zihao Xu","Hengyi Wang","Weiyi Qin","Wenyuan Wang","Yibin Wang","Zifeng Wang","Sayna Ebrahimi","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.16789v2.pdf","comment":"47 pages, 2 figures, 4 tables. Work in progress"},{"id":"http://arxiv.org/abs/2406.11779v6","updated":"2024-06-30T02:00:46Z","published":"2024-06-17T17:34:25Z","title":"Compact Proofs of Model Performance via Mechanistic Interpretability","summary":" In this work, we propose using mechanistic interpretability -- techniques for\nreverse engineering model weights into human-interpretable algorithms -- to\nderive and compactly prove formal guarantees on model performance. We prototype\nthis approach by formally proving lower bounds on the accuracy of 151 small\ntransformers trained on a Max-of-$K$ task. We create 102 different\ncomputer-assisted proof strategies and assess their length and tightness of\nbound on each of our models. Using quantitative metrics, we find that shorter\nproofs seem to require and provide more mechanistic understanding. Moreover, we\nfind that more faithful mechanistic understanding leads to tighter performance\nbounds. We confirm these connections by qualitatively examining a subset of our\nproofs. 
Finally, we identify compounding structureless noise as a key challenge\nfor using mechanistic interpretability to generate compact proofs on model\nperformance.\n","authors":["Jason Gross","Rajashree Agrawal","Thomas Kwa","Euan Ong","Chun Hei Yip","Alex Gibson","Soufiane Noubir","Lawrence Chan"],"pdf_url":"https://arxiv.org/pdf/2406.11779v6.pdf","comment":"accepted to 2024 ICML MI Workshop (Spotlight)"},{"id":"http://arxiv.org/abs/2403.17852v2","updated":"2024-06-30T01:51:00Z","published":"2024-03-26T16:40:08Z","title":"Counterfactual Fairness through Transforming Data Orthogonal to Bias","summary":" Machine learning models have shown exceptional prowess in solving complex\nissues across various domains. However, these models can sometimes exhibit\nbiased decision-making, resulting in unequal treatment of different groups.\nDespite substantial research on counterfactual fairness, methods to reduce the\nimpact of multivariate and continuous sensitive variables on decision-making\noutcomes are still underdeveloped. We propose a novel data pre-processing\nalgorithm, Orthogonal to Bias (OB), which is designed to eliminate the\ninfluence of a group of continuous sensitive variables, thus promoting\ncounterfactual fairness in machine learning applications. Our approach, based\non the assumption of a jointly normal distribution within a structural causal\nmodel (SCM), demonstrates that counterfactual fairness can be achieved by\nensuring the data is orthogonal to the observed sensitive variables. The OB\nalgorithm is model-agnostic, making it applicable to a wide range of machine\nlearning models and tasks. Additionally, it includes a sparse variant to\nimprove numerical stability through regularization. Empirical evaluations on\nboth simulated and real-world datasets, encompassing settings with both\ndiscrete and continuous sensitive variables, show that our methodology\neffectively promotes fairer outcomes without compromising accuracy.\n","authors":["Shuyi Chen","Shixiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.17852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13173v2","updated":"2024-06-30T01:22:09Z","published":"2024-06-19T03:07:33Z","title":"Biomedical Visual Instruction Tuning with Clinician Preference Alignment","summary":" Recent advancements in multimodal foundation models have showcased impressive\ncapabilities in understanding and reasoning with visual and textual\ninformation. Adapting these foundation models trained for general usage to\nspecialized domains like biomedicine requires large-scale domain-specific\ninstruction datasets. While existing works have explored curating such datasets\nautomatically, the resultant datasets are not explicitly aligned with domain\nexpertise. In this work, we propose a data-centric framework, Biomedical Visual\nInstruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that\nincorporates clinician preferences into both stages of generating and selecting\ninstruction data for tuning biomedical multimodal foundation models. First,\nduring the generation stage, we prompt the GPT-4V generator with a diverse set\nof clinician-selected demonstrations for preference-aligned data candidate\ngeneration. Then, during the selection phase, we train a separate selection\nmodel, which explicitly distills clinician and policy-guided model preferences\ninto a rating function to select high-quality data for medical instruction\ntuning. 
Results show that the model tuned with the instruction-following data\nfrom our method demonstrates a significant improvement in open visual chat\n(18.5% relatively) and medical VQA (win rate up to 81.73%). Our\ninstruction-following data and models are available at BioMed-VITAL.github.io.\n","authors":["Hejie Cui","Lingjun Mao","Xin Liang","Jieyu Zhang","Hui Ren","Quanzheng Li","Xiang Li","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2406.13173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11917v3","updated":"2024-06-30T00:52:49Z","published":"2024-02-19T08:04:25Z","title":"A Mechanistic Analysis of a Transformer Trained on a Symbolic Multi-Step\n Reasoning Task","summary":" Transformers demonstrate impressive performance on a range of reasoning\nbenchmarks. To evaluate the degree to which these abilities are a result of\nactual reasoning, existing work has focused on developing sophisticated\nbenchmarks for behavioral studies. However, these studies do not provide\ninsights into the internal mechanisms driving the observed capabilities. To\nimprove our understanding of the internal mechanisms of transformers, we\npresent a comprehensive mechanistic analysis of a transformer trained on a\nsynthetic reasoning task. We identify a set of interpretable mechanisms the\nmodel uses to solve the task, and validate our findings using correlational and\ncausal evidence. Our results suggest that it implements a depth-bounded\nrecurrent mechanism that operates in parallel and stores intermediate results\nin selected token positions. We anticipate that the motifs we identified in our\nsynthetic setting can provide valuable insights into the broader operating\nprinciples of transformers and thus provide a basis for understanding more\ncomplex models.\n","authors":["Jannik Brinkmann","Abhay Sheshadri","Victor Levoso","Paul Swoboda","Christian Bartelt"],"pdf_url":"https://arxiv.org/pdf/2402.11917v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.07588v2","updated":"2024-06-30T18:19:25Z","published":"2024-06-11T08:12:43Z","title":"AIM: Let Any Multi-modal Large Language Models Embrace Efficient\n In-Context Learning","summary":" In-context learning (ICL) facilitates Large Language Models (LLMs) exhibiting\nemergent ability on downstream tasks without updating billions of parameters.\nHowever, in the area of multi-modal Large Language Models (MLLMs), two problems\nhinder the application of multi-modal ICL: (1) Most primary MLLMs are only\ntrained on single-image datasets, making them unable to read multi-modal\ndemonstrations. (2) With the demonstrations increasing, thousands of visual\ntokens highly challenge hardware and degrade ICL performance. During\npreliminary explorations, we discovered that the inner LLM tends to focus more\non the linguistic modality within multi-modal demonstrations to generate\nresponses. Therefore, we propose a general and light-weighted framework\n\\textbf{AIM} to tackle the mentioned problems through \\textbf{A}ggregating\n\\textbf{I}mage information of \\textbf{M}ultimodal demonstrations to the dense\nlatent space of the corresponding linguistic part. Specifically, AIM first uses\nthe frozen backbone MLLM to read each image-text demonstration and extracts the\nvector representations on top of the text. 
These vectors naturally fuse the\ninformation of the image-text pair, and AIM transforms them into fused virtual\ntokens acceptable for the inner LLM via a trainable projection layer.\nUltimately, these fused tokens function as variants of multi-modal\ndemonstrations, fed into the MLLM to direct its response to the current query\nas usual. Because these fused tokens stem from the textual component of the\nimage-text pair, a multi-modal demonstration is nearly reduced to a pure\ntextual demonstration, thus seamlessly applying to any MLLMs. With its de facto\nMLLM frozen, AIM is parameter-efficient and we train it on public multi-modal\nweb corpora which have nothing to do with downstream test tasks.\n","authors":["Jun Gao","Qian Qiao","Ziqiang Cao","Zili Wang","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2406.07588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05317v5","updated":"2024-06-30T15:03:42Z","published":"2024-04-08T09:08:43Z","title":"WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A\n Conceptual Architecture","summary":" This work proposes a WebXR-based cross-platform conceptual architecture,\nleveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate\nthe development of an open, accessible, and interoperable metaverse. By\nintroducing the concept of spatial web app, this research contributes to the\ndiscourse on the metaverse, offering an architecture that democratizes access\nto virtual environments and extended reality through the web, and aligns with\nTim Berners-Lee's original vision of the World Wide Web as an open platform in\nthe digital realm.\n","authors":["Giuseppe Macario"],"pdf_url":"https://arxiv.org/pdf/2404.05317v5.pdf","comment":"draftcls option"},{"id":"http://arxiv.org/abs/2407.00556v1","updated":"2024-06-30T01:18:37Z","published":"2024-06-30T01:18:37Z","title":"Revisiting Vision-Language Features Adaptation and Inconsistency for\n Social Media Popularity Prediction","summary":" Social media popularity (SMP) prediction is a complex task involving\nmulti-modal data integration. While pre-trained vision-language models (VLMs)\nlike CLIP have been widely adopted for this task, their effectiveness in\ncapturing the unique characteristics of social media content remains\nunexplored. This paper critically examines the applicability of CLIP-based\nfeatures in SMP prediction, focusing on the overlooked phenomenon of semantic\ninconsistency between images and text in social media posts. Through extensive\nanalysis, we demonstrate that this inconsistency increases with post\npopularity, challenging the conventional use of VLM features. We provide a\ncomprehensive investigation of semantic inconsistency across different\npopularity intervals and analyze the impact of VLM feature adaptation on SMP\ntasks. Our experiments reveal that incorporating inconsistency measures and\nadapted text features significantly improves model performance, achieving an\nSRC of 0.729 and an MAE of 1.227. 
These findings not only enhance SMP\nprediction accuracy but also provide crucial insights for developing more\ntargeted approaches in social media analysis.\n","authors":["Chih-Chung Hsu","Chia-Ming Lee","Yu-Fan Lin","Yi-Shiuan Chou","Chih-Yu Jian","Chi-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2407.00556v1.pdf","comment":"Submission of the 7th Social Media Prediction Challenge"}]},"2024-07-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.02490v1","updated":"2024-07-02T17:59:56Z","published":"2024-07-02T17:59:56Z","title":"MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via\n Dynamic Sparse Attention","summary":" The computational challenges of Large Language Model (LLM) inference remain a\nsignificant barrier to their widespread deployment, especially as prompt\nlengths continue to increase. Due to the quadratic complexity of the attention\ncomputation, it takes 30 minutes for an 8B LLM to process a prompt of 1M tokens\n(i.e., the pre-filling stage) on a single A100 GPU. Existing methods for\nspeeding up prefilling often fail to maintain acceptable accuracy or efficiency\nwhen applied to long-context LLMs. To address this gap, we introduce MInference\n(Milliontokens Inference), a sparse calculation method designed to accelerate\npre-filling of long-sequence processing. Specifically, we identify three unique\npatterns in long-context attention matrices -- the A-shape, Vertical-Slash, and\nBlock-Sparse -- that can be leveraged for efficient sparse computation on GPUs. We\ndetermine the optimal pattern for each attention head offline and dynamically\nbuild sparse indices based on the assigned pattern during inference. With the\npattern and sparse indices, we perform efficient sparse attention calculations\nvia our optimized GPU kernels to significantly reduce the latency in the\npre-filling stage of long-context LLMs. Our proposed technique can be directly\napplied to existing LLMs without any modifications to the pre-training setup or\nadditional fine-tuning. By evaluating on a wide range of downstream tasks,\nincluding InfiniteBench, RULER, PG-19, and Needle In A Haystack, and models\nincluding LLaMA-3-1M, GLM4-1M, Yi-200K, Phi-3-128K, and Qwen2-128K, we\ndemonstrate that MInference effectively reduces inference latency by up to 10x\nfor pre-filling on an A100, while maintaining accuracy. Our code is available\nat https://aka.ms/MInference.\n","authors":["Huiqiang Jiang","Yucheng Li","Chengruidong Zhang","Qianhui Wu","Xufang Luo","Surin Ahn","Zhenhua Han","Amir H. Abdi","Dongsheng Li","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2407.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02486v1","updated":"2024-07-02T17:59:29Z","published":"2024-07-02T17:59:29Z","title":"Neurocache: Efficient Vector Retrieval for Long-range Language Modeling","summary":" This paper introduces Neurocache, an approach to extend the effective context\nsize of large language models (LLMs) using an external vector cache to store\nits past states. Like recent vector retrieval approaches, Neurocache uses an\nefficient k-nearest-neighbor (kNN) algorithm to retrieve relevant past states\nand incorporate them into the attention process. 
Neurocache improves upon\nprevious methods by (1) storing compressed states, which reduces cache size;\n(2) performing a single retrieval operation per token which increases inference\nspeed; and (3) extending the retrieval window to neighboring states, which\nimproves both language modeling and downstream task accuracy. Our experiments\nshow the effectiveness of Neurocache both for models trained from scratch and\nfor pre-trained models such as Llama2-7B and Mistral-7B when enhanced with the\ncache mechanism. We also compare Neurocache with text retrieval methods and\nshow improvements in single-document question-answering and few-shot learning\ntasks. We made the source code available under:\nhttps://github.com/alisafaya/neurocache\n","authors":["Ali Safaya","Deniz Yuret"],"pdf_url":"https://arxiv.org/pdf/2407.02486v1.pdf","comment":"Long paper, published at the main conference NAACL'24"},{"id":"http://arxiv.org/abs/2407.02485v1","updated":"2024-07-02T17:59:17Z","published":"2024-07-02T17:59:17Z","title":"RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in\n LLMs","summary":" Large language models (LLMs) typically utilize the top-k contexts from a\nretriever in retrieval-augmented generation (RAG). In this work, we propose a\nnovel instruction fine-tuning framework RankRAG, which instruction-tunes a\nsingle LLM for the dual purpose of context ranking and answer generation in\nRAG. In particular, the instruction-tuned LLMs work surprisingly well by adding\na small fraction of ranking data into the training blend, and outperform\nexisting expert ranking models, including the same LLM exclusively fine-tuned\non a large amount of ranking data. For generation, we compare our model with\nmany strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and\nChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG\nbenchmarks. Specifically, our Llama3-RankRAG significantly outperforms\nLlama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In\naddition, it also performs comparably to GPT-4 on five RAG benchmarks in the\nbiomedical domain without instruction fine-tuning on biomedical data,\ndemonstrating its superb capability for generalization to new domains.\n","authors":["Yue Yu","Wei Ping","Zihan Liu","Boxin Wang","Jiaxuan You","Chao Zhang","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.02485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02483v1","updated":"2024-07-02T17:58:23Z","published":"2024-07-02T17:58:23Z","title":"MMedAgent: Learning to Use Medical Tools with Multi-modal Agent","summary":" Multi-Modal Large Language Models (MLLMs), despite being successful, exhibit\nlimited generality and often fall short when compared to specialized models.\nRecently, LLM-based agents have been developed to address these challenges by\nselecting appropriate specialized models as tools based on user inputs.\nHowever, such advancements have not been extensively explored within the\nmedical domain. To bridge this gap, this paper introduces the first agent\nexplicitly designed for the medical field, named \\textbf{M}ulti-modal\n\\textbf{Med}ical \\textbf{Agent} (MMedAgent). We curate an instruction-tuning\ndataset comprising six medical tools solving seven tasks, enabling the agent to\nchoose the most suitable tools for a given task. 
Comprehensive experiments\ndemonstrate that MMedAgent achieves superior performance across a variety of\nmedical tasks compared to state-of-the-art open-source methods and even the\nclosed-source model, GPT-4o. Furthermore, MMedAgent exhibits efficiency in\nupdating and integrating new medical tools.\n","authors":["Binxu Li","Tiankai Yan","Yuanting Pan","Zhe Xu","Jie Luo","Ruiyang Ji","Shilong Liu","Haoyu Dong","Zihao Lin","Yixin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12327v2","updated":"2024-07-02T17:56:09Z","published":"2024-02-19T18:00:53Z","title":"Shall We Team Up: Exploring Spontaneous Cooperation of Competing LLM\n Agents","summary":" Large Language Models (LLMs) have increasingly been utilized in social\nsimulations, where they are often guided by carefully crafted instructions to\nstably exhibit human-like behaviors during simulations. Nevertheless, we doubt\nthe necessity of shaping agents' behaviors for accurate social simulations.\nInstead, this paper emphasizes the importance of spontaneous phenomena, wherein\nagents deeply engage in contexts and make adaptive decisions without explicit\ndirections. We explored spontaneous cooperation across three competitive\nscenarios and successfully simulated the gradual emergence of cooperation,\nfindings that align closely with human behavioral data. This approach not only\naids the computational social science community in bridging the gap between\nsimulations and real-world dynamics but also offers the AI community a novel\nmethod to assess LLMs' capability of deliberate reasoning.\n","authors":["Zengqing Wu","Run Peng","Shuyuan Zheng","Qianying Liu","Xu Han","Brian Inhyuk Kwon","Makoto Onizuka","Shaojie Tang","Chuan Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.12327v2.pdf","comment":"Source codes available at\n https://github.com/wuzengqing001225/SABM_ShallWeTeamUp"},{"id":"http://arxiv.org/abs/2407.02477v1","updated":"2024-07-02T17:55:03Z","published":"2024-07-02T17:55:03Z","title":"Understanding Alignment in Multimodal LLMs: A Comprehensive Study","summary":" Preference alignment has become a crucial component in enhancing the\nperformance of Large Language Models (LLMs), yet its impact in Multimodal Large\nLanguage Models (MLLMs) remains comparatively underexplored. Similar to\nlanguage models, MLLMs for image understanding tasks encounter challenges like\nhallucination. In MLLMs, hallucination can occur not only by stating incorrect\nfacts but also by producing responses that are inconsistent with the image\ncontent. A primary objective of alignment for MLLMs is to encourage these\nmodels to align responses more closely with image information. Recently,\nmultiple works have introduced preference datasets for MLLMs and examined\ndifferent alignment methods, including Direct Preference Optimization (DPO) and\nProximal Policy Optimization (PPO). However, due to variations in datasets,\nbase model types, and alignment methods, it remains unclear which specific\nelements contribute most significantly to the reported improvements in these\nworks. In this paper, we independently analyze each aspect of preference\nalignment in MLLMs. We start by categorizing the alignment algorithms into two\ngroups, offline (such as DPO), and online (such as online-DPO), and show that\ncombining offline and online methods can improve the performance of the model\nin certain scenarios. 
We review a variety of published multimodal preference\ndatasets and discuss how the details of their construction impact model\nperformance. Based on these insights, we introduce a novel way of creating\nmultimodal preference data called Bias-Driven Hallucination Sampling (BDHS)\nthat needs neither additional annotation nor external models, and show that it\ncan achieve competitive performance to previously published alignment work for\nmultimodal models across a range of benchmarks.\n","authors":["Elmira Amirloo","Jean-Philippe Fauconnier","Christoph Roesmann","Christian Kerl","Rinu Boney","Yusu Qian","Zirui Wang","Afshin Dehghan","Yinfei Yang","Zhe Gan","Peter Grasch"],"pdf_url":"https://arxiv.org/pdf/2407.02477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02472v1","updated":"2024-07-02T17:51:27Z","published":"2024-07-02T17:51:27Z","title":"ValueScope: Unveiling Implicit Norms and Values via Return Potential\n Model of Social Interactions","summary":" This study introduces ValueScope, a framework leveraging language models to\nquantify social norms and values within online communities, grounded in social\nscience perspectives on normative structures. We employ ValueScope to dissect\nand analyze linguistic and stylistic expressions across 13 Reddit communities\ncategorized under gender, politics, science, and finance. Our analysis provides\na quantitative foundation showing that even closely related communities exhibit\nremarkably diverse norms. This diversity supports existing theories and adds a\nnew dimension--community preference--to understanding community interactions.\nValueScope not only delineates differing social norms among communities but\nalso effectively traces their evolution and the influence of significant\nexternal events like the U.S. presidential elections and the emergence of new\nsub-communities. The framework thus highlights the pivotal role of social norms\nin shaping online interactions, presenting a substantial advance in both the\ntheory and application of social norm studies in digital spaces.\n","authors":["Chan Young Park","Shuyue Stella Li","Hayoung Jung","Svitlana Volkova","Tanushree Mitra","David Jurgens","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2407.02472v1.pdf","comment":"First three authors contributed equally. 33 pages. In submission"},{"id":"http://arxiv.org/abs/2402.17019v4","updated":"2024-07-02T17:50:42Z","published":"2024-02-26T20:56:06Z","title":"Leveraging Large Language Models for Learning Complex Legal Concepts\n through Storytelling","summary":" Making legal knowledge accessible to non-experts is crucial for enhancing\ngeneral legal literacy and encouraging civic participation in democracy.\nHowever, legal documents are often challenging to understand for people without\nlegal backgrounds. In this paper, we present a novel application of large\nlanguage models (LLMs) in legal education to help non-experts learn intricate\nlegal concepts through storytelling, an effective pedagogical tool in conveying\ncomplex and abstract concepts. We also introduce a new dataset LegalStories,\nwhich consists of 294 complex legal doctrines, each accompanied by a story and\na set of multiple-choice questions generated by LLMs. To construct the dataset,\nwe experiment with various LLMs to generate legal stories explaining these\nconcepts. Furthermore, we use an expert-in-the-loop approach to iteratively\ndesign multiple-choice questions. 
Then, we evaluate the effectiveness of\nstorytelling with LLMs through randomized controlled trials (RCTs) with legal\nnovices on 10 samples from the dataset. We find that LLM-generated stories\nenhance comprehension of legal concepts and interest in law among non-native\nspeakers compared to only definitions. Moreover, stories consistently help\nparticipants relate legal concepts to their lives. Finally, we find that\nlearning with stories shows a higher retention rate for non-native speakers in\nthe follow-up assessment. Our work has strong implications for using LLMs in\npromoting teaching and learning in the legal field and beyond.\n","authors":["Hang Jiang","Xiajie Zhang","Robert Mahari","Daniel Kessler","Eric Ma","Tal August","Irene Li","Alex 'Sandy' Pentland","Yoon Kim","Deb Roy","Jad Kabbara"],"pdf_url":"https://arxiv.org/pdf/2402.17019v4.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2407.02448v1","updated":"2024-07-02T17:26:26Z","published":"2024-07-02T17:26:26Z","title":"Ensemble of pre-trained language models and data augmentation for hate\n speech detection from Arabic tweets","summary":" Today, hate speech classification from Arabic tweets has drawn the attention\nof several researchers. Many systems and techniques have been developed to\nresolve this classification task. Nevertheless, two of the major challenges\nfaced in this context are the limited performance and the problem of imbalanced\ndata. In this study, we propose a novel approach that leverages ensemble\nlearning and semi-supervised learning based on previously manually labeled data. We\nconducted experiments on a benchmark dataset by classifying Arabic tweets into\n5 distinct classes: non-hate, general hate, racial, religious, or sexism.\nExperimental results show that: (1) ensemble learning based on pre-trained\nlanguage models outperforms existing related works; (2) our proposed data\naugmentation improves the accuracy results of hate speech detection from Arabic\ntweets and outperforms existing related works. Our main contribution is the\nachievement of encouraging results in Arabic hate speech detection.\n","authors":["Kheir Eddine Daouadi","Yaakoub Boualleg","Kheir Eddine Haouaouchi"],"pdf_url":"https://arxiv.org/pdf/2407.02448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00837v2","updated":"2024-07-02T17:23:44Z","published":"2024-06-30T21:40:26Z","title":"Towards Robust Speech Representation Learning for Thousands of Languages","summary":" Self-supervised learning (SSL) has helped extend speech technologies to more\nlanguages by reducing the need for labeled data. However, models are still far\nfrom supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual\nEncoder for Universal Speech, trained on over 1 million hours of data across\n4057 languages, extending the language coverage of SSL models 4-fold. We\ncombine 1 million hours of speech from existing publicly accessible corpora\nwith a newly created corpus of 7400+ hours from 4057 languages, which will be\npublicly released. To handle the diverse conditions of multilingual speech\ndata, we augment the typical SSL masked prediction approach with a novel\ndereverberation objective, increasing robustness. We evaluate XEUS on several\nbenchmarks, and show that it consistently outperforms or achieves comparable\nresults to state-of-the-art (SOTA) SSL models across a variety of tasks. 
XEUS\nsets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT\n2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or\npre-training data. Checkpoints, code, and data are found in\nhttps://www.wavlab.org/activities/2024/xeus/.\n","authors":["William Chen","Wangyou Zhang","Yifan Peng","Xinjian Li","Jinchuan Tian","Jiatong Shi","Xuankai Chang","Soumi Maiti","Karen Livescu","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.00837v2.pdf","comment":"Updated affiliations; 20 pages"},{"id":"http://arxiv.org/abs/2407.02446v1","updated":"2024-07-02T17:22:54Z","published":"2024-07-02T17:22:54Z","title":"Predicting vs. Acting: A Trade-off Between World Modeling & Agent\n Modeling","summary":" RLHF-aligned LMs have shown unprecedented ability on both benchmarks and\nlong-form text generation, yet they struggle with one foundational task:\nnext-token prediction. As RLHF models become agent models aimed at interacting\nwith humans, they seem to lose their world modeling -- the ability to predict\nwhat comes next in arbitrary documents, which is the foundational training\nobjective of the Base LMs that RLHF adapts.\n Besides empirically demonstrating this trade-off, we propose a potential\nexplanation: to perform coherent long-form generation, RLHF models restrict\nrandomness via implicit blueprints. In particular, RLHF models concentrate\nprobability on sets of anchor spans that co-occur across multiple generations\nfor the same prompt, serving as textual scaffolding but also limiting a model's\nability to generate documents that do not include these spans. We study this\ntrade-off on the most effective current agent models, those aligned with RLHF,\nwhile exploring why this may remain a fundamental trade-off between models that\nact and those that predict, even as alignment techniques improve.\n","authors":["Margaret Li","Weijia Shi","Artidoro Pagnoni","Peter West","Ari Holtzman"],"pdf_url":"https://arxiv.org/pdf/2407.02446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02432v1","updated":"2024-07-02T17:09:24Z","published":"2024-07-02T17:09:24Z","title":"Evaluating the Robustness of Adverse Drug Event Classification Models\n Using Templates","summary":" An adverse drug effect (ADE) is any harmful event resulting from medical drug\ntreatment. Despite their importance, ADEs are often under-reported in official\nchannels. Some research has therefore turned to detecting discussions of ADEs\nin social media. Impressive results have been achieved in various attempts to\ndetect ADEs. In a high-stakes domain such as medicine, however, an in-depth\nevaluation of a model's abilities is crucial. We address the issue of thorough\nperformance evaluation in English-language ADE detection with hand-crafted\ntemplates for four capabilities: Temporal order, negation, sentiment, and\nbeneficial effect. We find that models with similar performance on held-out\ntest sets have varying results on these capabilities.\n","authors":["Dorothea MacPhail","David Harbecke","Lisa Raithel","Sebastian Möller"],"pdf_url":"https://arxiv.org/pdf/2407.02432v1.pdf","comment":"Accepted at BioNLP 2024 and Shared Tasks (ACL Workshop)"},{"id":"http://arxiv.org/abs/2405.09395v2","updated":"2024-07-02T16:42:48Z","published":"2024-05-15T14:50:51Z","title":"Matching domain experts by training from scratch on domain knowledge","summary":" Recently, large language models (LLMs) have outperformed human experts in\npredicting the results of neuroscience experiments (Luo et al., 2024). 
What is\nthe basis for this performance? One possibility is that statistical patterns in\nthat specific scientific literature, as opposed to emergent reasoning abilities\narising from broader training, underlie LLMs' performance. To evaluate this\npossibility, we trained (next word prediction) a relatively small\n124M-parameter GPT-2 model on 1.3 billion tokens of domain-specific knowledge.\nDespite being orders of magnitude smaller than larger LLMs trained on trillions\nof tokens, small models achieved expert-level performance in predicting\nneuroscience results. Small models trained on the neuroscience literature\nsucceeded when they were trained from scratch using a tokenizer specifically\ntrained on neuroscience text or when the neuroscience literature was used to\nfinetune a pretrained GPT-2. Our results indicate that expert-level performance\nmay be attained by even small LLMs through domain-specific, auto-regressive\ntraining approaches.\n","authors":["Xiaoliang Luo","Guangzhi Sun","Bradley C. Love"],"pdf_url":"https://arxiv.org/pdf/2405.09395v2.pdf","comment":"ICML 2024 (Large Language Models and Cognition)"},{"id":"http://arxiv.org/abs/2407.02408v1","updated":"2024-07-02T16:31:37Z","published":"2024-07-02T16:31:37Z","title":"CEB: Compositional Evaluation Benchmark for Fairness in Large Language\n Models","summary":" As Large Language Models (LLMs) are increasingly deployed to handle various\nnatural language processing (NLP) tasks, concerns regarding the potential\nnegative societal impacts of LLM-generated content have also arisen. To\nevaluate the biases exhibited by LLMs, researchers have recently proposed a\nvariety of datasets. However, existing bias evaluation efforts often focus on\nonly a particular type of bias and employ inconsistent evaluation metrics,\nleading to difficulties in comparison across different datasets and LLMs. To\naddress these limitations, we collect a variety of datasets designed for the\nbias evaluation of LLMs, and further propose CEB, a Compositional Evaluation\nBenchmark that covers different types of bias across different social groups\nand tasks. The curation of CEB is based on our newly proposed compositional\ntaxonomy, which characterizes each dataset from three dimensions: bias types,\nsocial groups, and tasks. By combining the three dimensions, we develop a\ncomprehensive evaluation strategy for the bias in LLMs. Our experiments\ndemonstrate that the levels of bias vary across these dimensions, thereby\nproviding guidance for the development of specific bias mitigation methods.\n","authors":["Song Wang","Peng Wang","Tong Zhou","Yushun Dong","Zhen Tan","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2407.02408v1.pdf","comment":"37 pages, 32 figures"},{"id":"http://arxiv.org/abs/2407.02397v1","updated":"2024-07-02T16:15:01Z","published":"2024-07-02T16:15:01Z","title":"Learning to Refine with Fine-Grained Natural Language Feedback","summary":" Recent work has explored the capability of large language models (LLMs) to\nidentify and correct errors in LLM-generated responses. These refinement\napproaches frequently evaluate what sizes of models are able to do refinement\nfor what problems, but less attention is paid to what effective feedback for\nrefinement looks like. In this work, we propose looking at refinement with\nfeedback as a composition of three distinct LLM competencies: (1)\nidentification of bad generations; (2) fine-grained natural language feedback\ngeneration; (3) refining with fine-grained feedback. 
The first step can be\nimplemented with a high-performing discriminative model and steps 2 and 3 can\nbe implemented either via prompted or fine-tuned LLMs. A key property of this\napproach is that the step 2 critique model can give fine-grained feedback about\nerrors, made possible by offloading the discrimination to a separate model in\nstep 1. We show that models of different capabilities benefit from refining\nwith this approach on the task of improving factual consistency of document\ngrounded summaries. Overall, our proposed method consistently outperforms\nexisting end-to-end refinement approaches and current trained models not\nfine-tuned for factuality critiquing.\n","authors":["Manya Wadhwa","Xinyu Zhao","Junyi Jessy Li","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2407.02397v1.pdf","comment":"Code and models available at: https://github.com/ManyaWadhwa/DCR"},{"id":"http://arxiv.org/abs/2407.02395v1","updated":"2024-07-02T16:13:21Z","published":"2024-07-02T16:13:21Z","title":"Is Your AI-Generated Code Really Secure? Evaluating Large Language\n Models on Secure Code Generation with CodeSecEval","summary":" Large language models (LLMs) have brought significant advancements to code\ngeneration and code repair, benefiting both novice and experienced developers.\nHowever, their training using unsanitized data from open-source repositories,\nlike GitHub, raises the risk of inadvertently propagating security\nvulnerabilities. Despite numerous studies investigating the safety of code\nLLMs, there remains a gap in comprehensively addressing their security\nfeatures. In this work, we aim to present a comprehensive study aimed at\nprecisely evaluating and enhancing the security aspects of code LLMs. To\nsupport our research, we introduce CodeSecEval, a meticulously curated dataset\ndesigned to address 44 critical vulnerability types with 180 distinct samples.\nCodeSecEval serves as the foundation for the automatic evaluation of code\nmodels in two crucial tasks: code generation and code repair, with a strong\nemphasis on security. Our experimental results reveal that current models\nfrequently overlook security issues during both code generation and repair\nprocesses, resulting in the creation of vulnerable code. In response, we\npropose different strategies that leverage vulnerability-aware information and\ninsecure code explanations to mitigate these security vulnerabilities.\nFurthermore, our findings highlight that certain vulnerability types\nparticularly challenge model performance, influencing their effectiveness in\nreal-world applications. Based on these findings, we believe our study will\nhave a positive impact on the software engineering community, inspiring the\ndevelopment of improved methods for training and utilizing LLMs, thereby\nleading to safer and more trustworthy model deployment.\n","authors":["Jiexin Wang","Xitong Luo","Liuwen Cao","Hongkui He","Hailin Huang","Jiayuan Xie","Adam Jatowt","Yi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.02395v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.16263"},{"id":"http://arxiv.org/abs/2407.02389v1","updated":"2024-07-02T16:02:25Z","published":"2024-07-02T16:02:25Z","title":"SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring\n Expression Segmentation","summary":" Referring Expression Segmentation (RES) aims to provide a segmentation mask\nof the target object in an image referred to by the text (i.e., referring\nexpression). Existing methods require large-scale mask annotations. 
Moreover,\nsuch approaches do not generalize well to unseen/zero-shot scenarios. To\naddress the aforementioned issues, we propose a weakly-supervised bootstrapping\narchitecture for RES with several new algorithmic innovations. To the best of\nour knowledge, ours is the first approach that considers only a fraction of\nboth mask and box annotations (shown in Figure 1 and Table 1) for training. To\nenable principled training of models in such low-annotation settings, improve\nimage-text region-level alignment, and further enhance spatial localization of\nthe target object in the image, we propose Cross-modal Fusion with Attention\nConsistency module. For automatic pseudo-labeling of unlabeled samples, we\nintroduce a novel Mask Validity Filtering routine based on a spatially aware\nzero-shot proposal scoring approach. Extensive experiments show that with just\n30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to\n58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR\nrespectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also\noutperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a\nfully-supervised setting and demonstrates strong generalization capabilities in\nunseen/zero-shot tasks.\n","authors":["Sayan Nag","Koustava Goswami","Srikrishna Karanam"],"pdf_url":"https://arxiv.org/pdf/2407.02389v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2406.14099v2","updated":"2024-07-02T15:38:17Z","published":"2024-06-20T08:24:57Z","title":"Let Guidelines Guide You: A Prescriptive Guideline-Centered Data\n Annotation Methodology","summary":" We introduce the Guideline-Centered annotation process, a novel data\nannotation methodology focused on reporting the annotation guidelines\nassociated with each data sample. We identify three main limitations of the\nstandard prescriptive annotation process and describe how the\nGuideline-Centered methodology overcomes them by reducing the loss of\ninformation in the annotation process and ensuring adherence to guidelines.\nAdditionally, we discuss how the Guideline-Centered enables the reuse of\nannotated data across multiple tasks at the cost of a single human-annotation\nprocess.\n","authors":["Federico Ruggeri","Eleonora Misino","Arianna Muti","Katerina Korre","Paolo Torroni","Alberto Barrón-Cedeño"],"pdf_url":"https://arxiv.org/pdf/2406.14099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02354v1","updated":"2024-07-02T15:19:46Z","published":"2024-07-02T15:19:46Z","title":"Talking to Machines: do you read me?","summary":" In this dissertation I would like to guide the reader to the research on\ndialogue but more precisely the research I have conducted during my career\nsince my PhD thesis. Starting from modular architectures with machine\nlearning/deep learning and reinforcement learning to end-to-end deep neural\nnetworks. Besides my work as research associate, I also present the work I have\nsupervised in the last years.\n I review briefly the state of the art and highlight the open research\nproblems on conversational agents. Afterwards, I present my contribution to\nTask-Oriented Dialogues (TOD), both as research associate and as the industrial\nsupervisor of CIFRE theses. I discuss conversational QA. Particularly, I\npresent the work of two PhD candidates Thibault Cordier and Sebastien Montella;\nas well as the work of the young researcher Quentin Brabant. 
Finally, I present\nthe scientific project, where I discuss Large Language Models (LLMs) for\nTask-Oriented Dialogue and Multimodal Task-Oriented Dialogue.\n","authors":["Lina M. Rojas-Barahona"],"pdf_url":"https://arxiv.org/pdf/2407.02354v1.pdf","comment":"French Doctoral Habilitation HDR manuscript:\n https://hal.science/tel-04620199"},{"id":"http://arxiv.org/abs/2407.02352v1","updated":"2024-07-02T15:17:44Z","published":"2024-07-02T15:17:44Z","title":"Pelican: Correcting Hallucination in Vision-LLMs via Claim Decomposition\n and Program of Thought Verification","summary":" Large Visual Language Models (LVLMs) struggle with hallucinations in visual\ninstruction following task(s), limiting their trustworthiness and real-world\napplicability. We propose Pelican -- a novel framework designed to detect and\nmitigate hallucinations through claim verification. Pelican first decomposes\nthe visual claim into a chain of sub-claims based on first-order predicates.\nThese sub-claims consist of (predicate, question) pairs and can be\nconceptualized as nodes of a computational graph. We then use\nProgram-of-Thought prompting to generate Python code for answering these\nquestions through flexible composition of external tools. Pelican improves over\nprior work by introducing (1) intermediate variables for precise grounding of\nobject instances, and (2) shared computation for answering the sub-question to\nenable adaptive corrections and inconsistency identification. We finally use the\nreasoning abilities of the LLM to verify the correctness of the claim by\nconsidering the consistency and confidence of the (question, answer) pairs from\neach sub-claim. Our experiments reveal a drop in hallucination rate by\n$\\sim$8%-32% across various baseline LVLMs and a 27% drop compared to\napproaches proposed for hallucination mitigation on MMHal-Bench. Results on two\nother benchmarks further corroborate our results.\n","authors":["Pritish Sahu","Karan Sikka","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2407.02352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02351v1","updated":"2024-07-02T15:16:46Z","published":"2024-07-02T15:16:46Z","title":"Generative Large Language Models in Automated Fact-Checking: A Survey","summary":" The dissemination of false information across online platforms poses a\nserious societal challenge, necessitating robust measures for information\nverification. While manual fact-checking efforts are still instrumental, the\ngrowing volume of false information requires automated methods. Large language\nmodels (LLMs) offer promising opportunities to assist fact-checkers, leveraging\nLLMs' extensive knowledge and robust reasoning capabilities. In this survey\npaper, we investigate the utilization of generative LLMs in the realm of\nfact-checking, illustrating various approaches that have been employed and\ntechniques for prompting or fine-tuning LLMs. 
By providing an overview of\nexisting approaches, this survey aims to improve the understanding of utilizing\nLLMs in fact-checking and to facilitate further progress in LLMs' involvement\nin this process.\n","authors":["Ivan Vykopal","Matúš Pikuliak","Simon Ostermann","Marián Šimko"],"pdf_url":"https://arxiv.org/pdf/2407.02351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02345v1","updated":"2024-07-02T15:12:34Z","published":"2024-07-02T15:12:34Z","title":"MORPHEUS: Modeling Role from Personalized Dialogue History by Exploring\n and Utilizing Latent Space","summary":" Personalized Dialogue Generation (PDG) aims to create coherent responses\naccording to roles or personas. Traditional PDG relies on external role data,\nwhich can be scarce and raise privacy concerns. Approaches address these issues\nby extracting role information from dialogue history, which often fail to\ngenerically model roles in continuous space. To overcome these limitations, we\nintroduce a novel framework \\textbf{MO}dels \\textbf{R}oles from\n\\textbf{P}ersonalized Dialogue \\textbf{H}istory by \\textbf{E}xploring and\n\\textbf{U}tilizing Latent \\textbf{S}pace (MORPHEUS) through a three-stage\ntraining process. Specifically, we create a persona codebook to represent roles\nin latent space compactly, and this codebook is used to construct a posterior\ndistribution of role information. This method enables the model to generalize\nacross roles, allowing the generation of personalized dialogues even for unseen\nroles. Experiments on both Chinese and English datasets demonstrate that\nMORPHEUS enhances the extraction of role information, and improves response\ngeneration without external role data. Additionally, MORPHEUS can be considered\nan efficient fine-tuning for large language models.\n","authors":["Yihong Tang","Bo Wang","Dongming Zhao","Xiaojia Jin","Jijun Zhang","Ruifang He","Yuexian Hou"],"pdf_url":"https://arxiv.org/pdf/2407.02345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02340v1","updated":"2024-07-02T15:07:54Z","published":"2024-07-02T15:07:54Z","title":"RVISA: Reasoning and Verification for Implicit Sentiment Analysis","summary":" With an increasing social demand for fine-grained sentiment analysis (SA),\nimplicit sentiment analysis (ISA) poses a significant challenge with the\nabsence of salient cue words in expressions. It necessitates reliable reasoning\nto understand how the sentiment is aroused and thus determine implicit\nsentiments. In the era of Large Language Models (LLMs), Encoder-Decoder (ED)\nLLMs have gained popularity to serve as backbone models for SA applications,\nconsidering impressive text comprehension and reasoning ability among diverse\ntasks. On the other hand, Decoder-only (DO) LLMs exhibit superior natural\nlanguage generation and in-context learning capabilities. However, their\nresponses may contain misleading or inaccurate information. To identify\nimplicit sentiment with reliable reasoning, this study proposes RVISA, a\ntwo-stage reasoning framework that harnesses the generation ability of DO LLMs\nand the reasoning ability of ED LLMs to train an enhanced reasoner.\nSpecifically, we adopt three-hop reasoning prompting to explicitly furnish\nsentiment elements as cues. The generated rationales are utilized to fine-tune\nan ED LLM into a skilled reasoner. Additionally, we develop a straightforward\nyet effective verification mechanism to ensure the reliability of the reasoning\nlearning. 
We evaluated the proposed method on two benchmark datasets and\nachieved state-of-the-art results in ISA performance.\n","authors":["Wenna Lai","Haoran Xie","Guandong Xu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.02340v1.pdf","comment":"11 pages, 6 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2407.02337v1","updated":"2024-07-02T15:05:47Z","published":"2024-07-02T15:05:47Z","title":"Open foundation models for Azerbaijani language","summary":" The emergence of multilingual large language models has enabled the\ndevelopment of language understanding and generation systems in Azerbaijani.\nHowever, most of the production-grade systems rely on cloud solutions, such as\nGPT-4. While there have been several attempts to develop open foundation models\nfor Azerbaijani, these works have not found their way into common use due to a\nlack of systemic benchmarking. This paper encompasses several lines of work\nthat promote open-source foundation models for Azerbaijani. We introduce (1) a\nlarge text corpus for Azerbaijani, (2) a family of encoder-only language models\ntrained on this dataset, (3) labeled datasets for evaluating these models, and\n(4) extensive evaluation that covers all major open-source models with\nAzerbaijani support.\n","authors":["Jafar Isbarov","Kavsar Huseynova","Elvin Mammadov","Mammad Hajili"],"pdf_url":"https://arxiv.org/pdf/2407.02337v1.pdf","comment":"Accepted to the 1st SIGTURK Workshop"},{"id":"http://arxiv.org/abs/2407.02333v1","updated":"2024-07-02T15:01:55Z","published":"2024-07-02T15:01:55Z","title":"Why do LLaVA Vision-Language Models Reply to Images in English?","summary":" We uncover a surprising multilingual bias occurring in a popular class of\nmultimodal vision-language models (VLMs). Including an image in the query to a\nLLaVA-style VLM significantly increases the likelihood of the model returning\nan English response, regardless of the language of the query. This paper\ninvestigates the causes of this loss with a two-pronged approach that combines\nextensive ablation of the design space with a mechanistic analysis of the\nmodels' internal representations of image and text inputs. Both approaches\nindicate that the issue stems in the language modelling component of the LLaVA\nmodel. Statistically, we find that switching the language backbone for a\nbilingual language model has the strongest effect on reducing this error.\nMechanistically, we provide compelling evidence that visual inputs are not\nmapped to a similar space as text ones, and that intervening on intermediary\nattention layers can reduce this bias. Our findings provide important insights\nto researchers and engineers seeking to understand the crossover between\nmultimodal and multilingual spaces, and contribute to the goal of developing\ncapable and inclusive VLMs for non-English contexts.\n","authors":["Musashi Hinck","Carolin Holtermann","Matthew Lyle Olson","Florian Schneider","Sungduk Yu","Anahita Bhiwandiwalla","Anne Lauscher","Shaoyen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2407.02333v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2407.02328v1","updated":"2024-07-02T14:58:44Z","published":"2024-07-02T14:58:44Z","title":"Efficient Sparse Attention needs Adaptive Token Release","summary":" In recent years, Large Language Models (LLMs) have demonstrated remarkable\ncapabilities across a wide array of text-centric tasks. 
However, their `large'\nscale introduces significant computational and storage challenges, particularly\nin managing the key-value states of the transformer, which limits their wider\napplicability. Therefore, we propose to adaptively release resources from\ncaches and rebuild the necessary key-value states. Specifically, we accomplish\nthis with a lightweight controller module that approximates an ideal top-$K$ sparse\nattention. This module retains the tokens with the highest top-$K$ attention\nweights and simultaneously rebuilds the discarded but necessary tokens, which\nmay become essential for future decoding. Comprehensive experiments in natural\nlanguage generation and modeling reveal that our method is not only competitive\nwith full attention in terms of performance but also achieves a significant\nthroughput improvement of up to 221.8%. The code for replication is available\nat https://github.com/WHUIR/ADORE.\n","authors":["Chaoran Zhang","Lixin Zou","Dan Luo","Min Tang","Xiangyang Luo","Zihao Li","Chenliang Li"],"pdf_url":"https://arxiv.org/pdf/2407.02328v1.pdf","comment":"Accepted at ACL 2024(Findings)"},{"id":"http://arxiv.org/abs/2407.02320v1","updated":"2024-07-02T14:51:20Z","published":"2024-07-02T14:51:20Z","title":"Exploring the Role of Transliteration in In-Context Learning for\n Low-resource Languages Written in Non-Latin Scripts","summary":" Decoder-only large language models (LLMs) excel in high-resource languages\nacross various tasks through few-shot or even zero-shot in-context learning\n(ICL). However, their performance often does not transfer well to low-resource\nlanguages, especially those written in non-Latin scripts. Inspired by recent\nwork that leverages transliteration in encoder-only models, we investigate\nwhether transliteration is also effective in improving LLMs' performance for\nlow-resource languages written in non-Latin scripts. To this end, we propose\nthree prompt templates, where the target-language text is represented in (1)\nits original script, (2) Latin script, or (3) both. We apply these methods to\nseveral representative LLMs of different sizes on various tasks including text\nclassification and sequential labeling. Our findings show that the\neffectiveness of transliteration varies by task type and model size. For\ninstance, all models benefit from transliterations for sequential labeling\n(with increases of up to 25%).\n","authors":["Chunlan Ma","Yihong Liu","Haotian Ye","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2407.02320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02317v1","updated":"2024-07-02T14:50:03Z","published":"2024-07-02T14:50:03Z","title":"Soft Language Prompts for Language Transfer","summary":" Cross-lingual knowledge transfer, especially between high- and low-resource\nlanguages, remains a challenge in natural language processing (NLP). This study\noffers insights for improving cross-lingual NLP applications through the\ncombination of parameter-efficient fine-tuning methods. 
We systematically\nexplore strategies for enhancing this cross-lingual transfer through the\nincorporation of language-specific and task-specific adapters and soft prompts.\nWe present a detailed investigation of various combinations of these methods,\nexploring their efficiency across six languages, focusing on three low-resource\nlanguages, including the to our knowledge first use of soft language prompts.\nOur findings demonstrate that in contrast to claims of previous work, a\ncombination of language and task adapters does not always work best; instead,\ncombining a soft language prompt with a task adapter outperforms other\nconfigurations in many cases.\n","authors":["Ivan Vykopal","Simon Ostermann","Marián Šimko"],"pdf_url":"https://arxiv.org/pdf/2407.02317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02310v1","updated":"2024-07-02T14:44:49Z","published":"2024-07-02T14:44:49Z","title":"Evaluating the Ability of LLMs to Solve Semantics-Aware Process Mining\n Tasks","summary":" The process mining community has recently recognized the potential of large\nlanguage models (LLMs) for tackling various process mining tasks. Initial\nstudies report the capability of LLMs to support process analysis and even, to\nsome extent, that they are able to reason about how processes work. This latter\nproperty suggests that LLMs could also be used to tackle process mining tasks\nthat benefit from an understanding of process behavior. Examples of such tasks\ninclude (semantic) anomaly detection and next activity prediction, which both\ninvolve considerations of the meaning of activities and their inter-relations.\nIn this paper, we investigate the capabilities of LLMs to tackle such\nsemantics-aware process mining tasks. Furthermore, whereas most works on the\nintersection of LLMs and process mining only focus on testing these models out\nof the box, we provide a more principled investigation of the utility of LLMs\nfor process mining, including their ability to obtain process mining knowledge\npost-hoc by means of in-context learning and supervised fine-tuning.\nConcretely, we define three process mining tasks that benefit from an\nunderstanding of process semantics and provide extensive benchmarking datasets\nfor each of them. Our evaluation experiments reveal that (1) LLMs fail to solve\nchallenging process mining tasks out of the box and when provided only a\nhandful of in-context examples, (2) but they yield strong performance when\nfine-tuned for these tasks, consistently surpassing smaller, encoder-based\nlanguage models.\n","authors":["Adrian Rebmann","Fabian David Schmidt","Goran Glavaš","Han van der Aa"],"pdf_url":"https://arxiv.org/pdf/2407.02310v1.pdf","comment":"Submitted to ICPM"},{"id":"http://arxiv.org/abs/2407.02302v1","updated":"2024-07-02T14:35:10Z","published":"2024-07-02T14:35:10Z","title":"Towards Human Understanding of Paraphrase Types in ChatGPT","summary":" Paraphrases represent a human's intuitive ability to understand expressions\npresented in various different ways. Current paraphrase evaluations of language\nmodels primarily use binary approaches, offering limited interpretability of\nspecific text changes. Atomic paraphrase types (APT) decompose paraphrases into\ndifferent linguistic changes and offer a granular view of the flexibility in\nlinguistic expression (e.g., a shift in syntax or vocabulary used). In this\nstudy, we assess the human preferences towards ChatGPT in generating English\nparaphrases with ten APTs and five prompting techniques. 
We introduce APTY\n(Atomic Paraphrase TYpes), a dataset of 500 sentence-level and word-level\nannotations by 15 annotators. The dataset also provides a human preference\nranking of paraphrases with different types that can be used to fine-tune\nmodels with RLHF and DPO methods. Our results reveal that ChatGPT can generate\nsimple APTs, such as additions and deletions, but struggles with complex\nstructures (e.g., subordination changes). This study contributes to\nunderstanding which aspects of paraphrasing language models have already\nsucceeded at understanding and which remain elusive. In addition, our curated\ndatasets can be used to develop language models with specific linguistic\ncapabilities.\n","authors":["Dominik Meier","Jan Philip Wahle","Terry Ruas","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2407.02302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02301v1","updated":"2024-07-02T14:34:36Z","published":"2024-07-02T14:34:36Z","title":"CFinBench: A Comprehensive Chinese Financial Benchmark for Large\n Language Models","summary":" Large language models (LLMs) have achieved remarkable performance on various\nNLP tasks, yet their potential in more challenging and domain-specific tasks,\nsuch as finance, has not been fully explored. In this paper, we present\nCFinBench: a meticulously crafted and, to date, the most comprehensive evaluation\nbenchmark for assessing the financial knowledge of LLMs in a Chinese context.\nIn practice, to better align with the career trajectory of Chinese financial\npractitioners, we build a systematic evaluation from 4 first-level categories:\n(1) Financial Subject: whether LLMs can memorize the necessary basic knowledge\nof financial subjects, such as economics, statistics and auditing. (2)\nFinancial Qualification: whether LLMs can obtain the required financial qualification\ncertifications, such as certified public accountant, securities qualification\nand banking qualification. (3) Financial Practice: whether LLMs can fulfill\npractical financial jobs, such as tax consultant, junior accountant and\nsecurities analyst. (4) Financial Law: whether LLMs can meet the requirements of\nfinancial laws and regulations, such as tax law, insurance law and economic\nlaw. CFinBench comprises 99,100 questions spanning 43 second-level categories\nwith 3 question types: single-choice, multiple-choice and judgment. We conduct\nextensive experiments on 50 representative LLMs of various model sizes on\nCFinBench. The results show that GPT4 and some Chinese-oriented models lead the\nbenchmark, with the highest average accuracy being 60.16%, highlighting the\nchallenge presented by CFinBench. The dataset and evaluation code are available\nat https://cfinbench.github.io/.\n","authors":["Ying Nie","Binwei Yan","Tianyu Guo","Hao Liu","Haoyu Wang","Wei He","Binfan Zheng","Weihao Wang","Qiang Li","Weijian Sun","Yunhe Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2407.02301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12038v3","updated":"2024-07-02T14:17:57Z","published":"2024-04-18T09:46:25Z","title":"Uncovering Safety Risks of Large Language Models through Concept\n Activation Vector","summary":" Despite careful safety alignment, current large language models (LLMs) remain\nvulnerable to various attacks. To further unveil the safety risks of LLMs, we\nintroduce a Safety Concept Activation Vector (SCAV) framework, which\neffectively guides the attacks by accurately interpreting LLMs' safety\nmechanisms. 
We then develop an SCAV-guided attack method that can generate both\nattack prompts and embedding-level attacks with automatically selected\nperturbation hyperparameters. Both automatic and human evaluations demonstrate\nthat our attack method significantly improves the attack success rate and\nresponse quality while requiring less training data. Additionally, we find that\nour generated attack prompts may be transferable to GPT-4, and the\nembedding-level attacks may also be transferred to other white-box LLMs whose\nparameters are known. Our experiments further uncover the safety risks present\nin current LLMs. For example, we find that six out of seven open-source LLMs\nthat we attack consistently provide relevant answers to more than 85\\%\nmalicious instructions. Finally, we provide insights into the safety mechanism\nof LLMs.\n","authors":["Zhihao Xu","Ruixuan Huang","Changyu Chen","Shuai Wang","Xiting Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12038v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10938v2","updated":"2024-07-02T14:16:42Z","published":"2024-05-17T17:49:44Z","title":"Observational Scaling Laws and the Predictability of Language Model\n Performance","summary":" Understanding how language model performance varies with scale is critical to\nbenchmark and algorithm development. Scaling laws are one approach to building\nthis understanding, but the requirement of training models across many\ndifferent scales has limited their use. We propose an alternative,\nobservational approach that bypasses model training and instead builds scaling\nlaws from ~80 publically available models. Building a single scaling law from\nmultiple model families is challenging due to large variations in their\ntraining compute efficiencies and capabilities. However, we show that these\nvariations are consistent with a simple, generalized scaling law where language\nmodel performance is a function of a low-dimensional capability space, and\nmodel families only vary in their efficiency in converting training compute to\ncapabilities. Using this approach, we show the surprising predictability of\ncomplex scaling phenomena: we show that several emergent phenomena follow a\nsmooth, sigmoidal behavior and are predictable from small models; we show that\nthe agent performance of models such as GPT-4 can be precisely predicted from\nsimpler non-agentic benchmarks; and we show how to predict the impact of\npost-training interventions like Chain-of-Thought and Self-Consistency as\nlanguage model capabilities continue to improve.\n","authors":["Yangjun Ruan","Chris J. Maddison","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2405.10938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02284v1","updated":"2024-07-02T14:14:59Z","published":"2024-07-02T14:14:59Z","title":"Renard: A Modular Pipeline for Extracting Character Networks from\n Narrative Texts","summary":" Renard (Relationships Extraction from NARrative Documents) is a Python\nlibrary that allows users to define custom natural language processing (NLP)\npipelines to extract character networks from narrative texts. Contrary to the\nfew existing tools, Renard can extract dynamic networks, as well as the more\ncommon static networks. Renard pipelines are modular: users can choose the\nimplementation of each NLP subtask needed to extract a character network. 
This\nallows users to specialize pipelines to particular types of texts and to study\nthe impact of each subtask on the extracted network.\n","authors":["Arthur Amalvy","Vincent Labatut","Richard Dufour"],"pdf_url":"https://arxiv.org/pdf/2407.02284v1.pdf","comment":"Accepted at JOSS"},{"id":"http://arxiv.org/abs/2406.16563v2","updated":"2024-07-02T14:14:48Z","published":"2024-06-24T11:58:33Z","title":"Are there identifiable structural parts in the sentence embedding whole?","summary":" Sentence embeddings from transformer models encode in a fixed length vector\nmuch linguistic information. We explore the hypothesis that these embeddings\nconsist of overlapping layers of information that can be separated, and on\nwhich specific types of information -- such as information about chunks and\ntheir structural and semantic properties -- can be detected. We show that this\nis the case using a dataset consisting of sentences with known chunk structure,\nand two linguistic intelligence datasets, solving which relies on detecting\nchunks and their grammatical number, and respectively, their semantic roles,\nand through analyses of the performance on the tasks and of the internal\nrepresentations built during learning.\n","authors":["Vivi Nastase","Paola Merlo"],"pdf_url":"https://arxiv.org/pdf/2406.16563v2.pdf","comment":"17 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.07825v2","updated":"2024-07-02T14:07:56Z","published":"2024-03-12T17:04:28Z","title":"The Missing Piece in Model Editing: A Deep Dive into the Hidden Damage\n Brought By Model Editing","summary":" Large Language Models have revolutionized numerous tasks with their\nremarkable efficacy. However, editing these models, crucial for rectifying\noutdated or erroneous information, often leads to a complex issue known as the\nripple effect in the hidden space. While difficult to detect, this effect can\nsignificantly impede the efficacy of model editing tasks and deteriorate model\nperformance. This paper addresses this scientific challenge by proposing a\nnovel evaluation methodology, Graphical Impact Evaluation(GIE), which\nquantitatively evaluates the adaptations of the model and the subsequent impact\nof editing. Furthermore, we introduce the Selective Impact Revision(SIR), a\nmodel editing method designed to mitigate this ripple effect. Our comprehensive\nevaluations reveal that the ripple effect in the hidden space is a significant\nissue in all current model editing methods. However, our proposed methods, GIE\nand SIR, effectively identify and alleviate this issue, contributing to the\nadvancement of LLM editing techniques.\n","authors":["Jianchen Wang","Zhouhong Gu","Xiaoxuan Zhu","Lin Zhang","Haoning Ye","Zhuozhi Xiong","Hongwei Feng","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.07825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02273v1","updated":"2024-07-02T14:02:53Z","published":"2024-07-02T14:02:53Z","title":"Multilingual Trolley Problems for Language Models","summary":" As large language models (LLMs) are deployed in more and more real-world\nsituations, it is crucial to understand their decision-making when faced with\nmoral dilemmas. Inspired by a large-scale cross-cultural study of human moral\npreferences, \"The Moral Machine Experiment\", we set up the same set of moral\nchoices for LLMs. We translate 1K vignettes of moral dilemmas, parametrically\nvaried across key axes, into 100+ languages, and reveal the preferences of LLMs\nin each of these languages. 
We then compare the responses of LLMs to that of\nhuman speakers of those languages, harnessing a dataset of 40 million human\nmoral judgments. We discover that LLMs are more aligned with human preferences\nin languages such as English, Korean, Hungarian, and Chinese, but less aligned\nin languages such as Hindi and Somali (in Africa). Moreover, we characterize\nthe explanations LLMs give for their moral choices and find that fairness is\nthe most dominant supporting reason behind GPT-4's decisions and utilitarianism\nby GPT-3. We also discover \"language inequality\" (which we define as the\nmodel's different development levels in different languages) in a series of\nmeta-properties of moral decision making.\n","authors":["Zhijing Jin","Sydney Levine","Max Kleiman-Weiner","Giorgio Piatti","Jiarui Liu","Fernando Gonzalez Adauto","Francesco Ortu","András Strausz","Mrinmaya Sachan","Rada Mihalcea","Yejin Choi","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2407.02273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14986v2","updated":"2024-07-02T14:02:04Z","published":"2024-06-21T08:56:35Z","title":"Do Large Language Models Exhibit Cognitive Dissonance? Studying the\n Difference Between Revealed Beliefs and Stated Answers","summary":" Prompting and Multiple Choices Questions (MCQ) have become the preferred\napproach to assess the capabilities of Large Language Models (LLMs), due to\ntheir ease of manipulation and evaluation. Such experimental appraisals have\npointed toward the LLMs' apparent ability to perform causal reasoning or to\ngrasp uncertainty. In this paper, we investigate whether these abilities are\nmeasurable outside of tailored prompting and MCQ by reformulating these issues\nas direct text completion - the foundation of LLMs. To achieve this goal, we\ndefine scenarios with multiple possible outcomes and we compare the prediction\nmade by the LLM through prompting (their Stated Answer) to the probability\ndistributions they compute over these outcomes during next token prediction\n(their Revealed Belief). Our findings suggest that the Revealed Belief of LLMs\nsignificantly differs from their Stated Answer and hint at multiple biases and\nmisrepresentations that their beliefs may yield in many scenarios and outcomes.\nAs text completion is at the core of LLMs, these results suggest that common\nevaluation methods may only provide a partial picture and that more research is\nneeded to assess the extent and nature of their capabilities.\n","authors":["Manuel Mondal","Ljiljana Dolamic","Gérôme Bovet","Philippe Cudré-Mauroux","Julien Audiffren"],"pdf_url":"https://arxiv.org/pdf/2406.14986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12726v2","updated":"2024-07-02T14:01:27Z","published":"2024-04-19T09:10:29Z","title":"Evaluating Character Understanding of Large Language Models via\n Character Profiling from Fictional Works","summary":" Large language models (LLMs) have demonstrated impressive performance and\nspurred numerous AI applications, in which role-playing agents (RPAs) are\nparticularly popular, especially for fictional characters. The prerequisite for\nthese RPAs lies in the capability of LLMs to understand characters from\nfictional works. Previous efforts have evaluated this capability via basic\nclassification tasks or characteristic imitation, failing to capture the\nnuanced character understanding with LLMs. 
In this paper, we propose evaluating\nLLMs' character understanding capability via the character profiling task,\ni.e., summarizing character profiles from corresponding materials, a widely\nadopted yet understudied practice for RPA development. Specifically, we\nconstruct the CroSS dataset from literature experts and assess the generated\nprofiles by comparing ground truth references and their applicability in\ndownstream tasks. Our experiments, which cover various summarization methods\nand LLMs, have yielded promising results. These results strongly validate the\ncharacter understanding capability of LLMs. Resources are available at\nhttps://github.com/Joanna0123/character_profiling.\n","authors":["Xinfeng Yuan","Siyu Yuan","Yuhan Cui","Tianhe Lin","Xintao Wang","Rui Xu","Jiangjie Chen","Deqing Yang"],"pdf_url":"https://arxiv.org/pdf/2404.12726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01280v2","updated":"2024-07-02T13:41:56Z","published":"2024-05-02T13:39:28Z","title":"Reinforcement Learning for Edit-Based Non-Autoregressive Neural Machine\n Translation","summary":" Non-autoregressive (NAR) language models are known for their low latency in\nneural machine translation (NMT). However, a performance gap exists between NAR\nand autoregressive models due to the large decoding space and difficulty in\ncapturing dependency between target words accurately. Compounding this,\npreparing appropriate training data for NAR models is a non-trivial task, often\nexacerbating exposure bias. To address these challenges, we apply reinforcement\nlearning (RL) to Levenshtein Transformer, a representative edit-based NAR\nmodel, demonstrating that RL with self-generated data can enhance the\nperformance of edit-based NAR models. We explore two RL approaches: stepwise\nreward maximization and episodic reward maximization. We discuss the respective\npros and cons of these two approaches and empirically verify them. Moreover, we\nexperimentally investigate the impact of temperature setting on performance,\nconfirming the importance of proper temperature setting for NAR models'\ntraining.\n","authors":["Hao Wang","Tetsuro Morimura","Ukyo Honda","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2405.01280v2.pdf","comment":"NAACL SRW 2024"},{"id":"http://arxiv.org/abs/2305.12759v2","updated":"2024-07-02T13:39:31Z","published":"2023-05-22T06:30:02Z","title":"Kanbun-LM: Reading and Translating Classical Chinese in Japanese Methods\n by Language Models","summary":" Recent studies in natural language processing (NLP) have focused on modern\nlanguages and achieved state-of-the-art results in many tasks. Meanwhile,\nlittle attention has been paid to ancient texts and related tasks. Classical\nChinese first came to Japan approximately 2,000 years ago. It was gradually\nadapted to a Japanese form called Kanbun-Kundoku (Kanbun) in Japanese reading\nand translating methods, which has significantly impacted Japanese literature.\nHowever, compared to the rich resources for ancient texts in mainland China,\nKanbun resources remain scarce in Japan. To solve this problem, we construct\nthe first Classical-Chinese-to-Kanbun dataset in the world. Furthermore, we\nintroduce two tasks, character reordering and machine translation, both of\nwhich play a significant role in Kanbun comprehension. We also test the current\nlanguage models on these tasks and discuss the best evaluation method by\ncomparing the results with human scores. 
We release our code and dataset on\nGitHub.\n","authors":["Hao Wang","Hirofumi Shimizu","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2305.12759v2.pdf","comment":"Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2406.12213v2","updated":"2024-07-02T13:07:07Z","published":"2024-06-18T02:25:33Z","title":"LLM-Oracle Machines","summary":" Contemporary AI applications leverage large language models (LLMs) to harness\ntheir knowledge and reasoning abilities for natural language processing tasks.\nThis approach shares similarities with the concept of oracle Turing machines\n(OTMs). To capture the broader potential of these computations, including those\nnot yet realized, we propose an extension to OTMs: the LLM-oracle machine\n(LLM-OM), by employing a cluster of LLMs as the oracle. Each LLM acts as a\nblack box, capable of answering queries within its expertise, albeit with a\ndelay. We introduce four variants of the LLM-OM: basic, augmented,\nfault-avoidance, and $\\epsilon$-fault. The first two are commonly observed in\nexisting AI applications. The latter two are specifically designed to address\nthe challenges of LLM hallucinations, biases, and inconsistencies, aiming to\nensure reliable outcomes.\n","authors":["Jie Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12213v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2407.02243v1","updated":"2024-07-02T13:04:04Z","published":"2024-07-02T13:04:04Z","title":"Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference\n Optimization","summary":" In this paper, we propose reverse inference optimization (RIO), a simple and\neffective method designed to enhance the robustness of\nautoregressive-model-based zero-shot text-to-speech (TTS) systems using\nreinforcement learning from human feedback (RLHF). To assess the quality of\nspeech produced by the TTS system without human annotations, RIO introduces a\nnovel concept termed as reverse inference based on the Bayesian principle,\nwhich suggests that a high-quality generated speech should be able to be used\nas a prompt for subsequent generation using the same TTS model. By leveraging\nreverse inference as the standard to select exemplars used in RLHF from the\nspeech samples generated by the TTS system itself, RIO steers the subsequent\noptimization towards a direction of enhancing the TTS robustness. The RIO\nframework, comprising sampling, automatic annotating, and learning, obviates\nthe need for a reward model or pairwise preference data, and significantly\nimproves the stability of zero-shot TTS performance by reducing the\ndiscrepancies between training and inference conditions. Our experimental\nresults verify that RIO can effectively improve both subjective and objective\nmetrics, including mean opinion scores, word error rates, and speaker\nsimilarity. Remarkably, RIO can also diminish the incidence of bad outputs to\nnearly zero percent, rivalling the robustness when using ground-truth speech as\nthe prompt.\n","authors":["Yuchen Hu","Chen Chen","Siyin Wang","Eng Siong Chng","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02243v1.pdf","comment":"12 pages, Work in progress"},{"id":"http://arxiv.org/abs/2407.01449v2","updated":"2024-07-02T13:02:58Z","published":"2024-06-27T15:45:29Z","title":"ColPali: Efficient Document Retrieval with Vision Language Models","summary":" Documents are visually rich structures that convey information through text,\nas well as tables, figures, page layouts, or fonts. 
While modern document\nretrieval systems exhibit strong performance on query-to-text matching, they\nstruggle to exploit visual cues efficiently, hindering their performance on\npractical document retrieval applications such as Retrieval Augmented\nGeneration. To benchmark current systems on visually rich document retrieval,\nwe introduce the Visual Document Retrieval Benchmark ViDoRe, composed of\nvarious page-level retrieving tasks spanning multiple domains, languages, and\nsettings. The inherent shortcomings of modern systems motivate the introduction\nof a new retrieval model architecture, ColPali, which leverages the document\nunderstanding capabilities of recent Vision Language Models to produce\nhigh-quality contextualized embeddings solely from images of document pages.\nCombined with a late interaction matching mechanism, ColPali largely\noutperforms modern document retrieval pipelines while being drastically faster\nand end-to-end trainable.\n","authors":["Manuel Faysse","Hugues Sibille","Tony Wu","Bilel Omrani","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2407.01449v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.02235v1","updated":"2024-07-02T12:58:35Z","published":"2024-07-02T12:58:35Z","title":"Towards a Holistic Framework for Multimodal Large Language Models in\n Three-dimensional Brain CT Report Generation","summary":" Multi-modal large language models (MLLMs) have been given free rein to\nexplore exciting medical applications with a primary focus on radiology report\ngeneration. Nevertheless, the preliminary success in 2D radiology captioning is\nincompetent to reflect the real-world diagnostic challenge in the volumetric 3D\nanatomy. To mitigate three crucial limitation aspects in the existing\nliterature, including (1) data complexity, (2) model capacity, and (3)\nevaluation metric fidelity, we collected an 18,885 text-scan pairs 3D-BrainCT\ndataset and applied clinical visual instruction tuning (CVIT) to train BrainGPT\nmodels to generate radiology-adherent 3D brain CT reports. Statistically, our\nBrainGPT scored BLEU-1 = 44.35, BLEU-4 = 20.38, METEOR = 30.13, ROUGE-L = 47.6,\nand CIDEr-R = 211.77 during internal testing and demonstrated an accuracy of\n0.91 in captioning midline shifts on the external validation CQ500 dataset. By\nfurther inspecting the captioned report, we reported that the traditional\nmetrics appeared to measure only the surface text similarity and failed to\ngauge the information density of the diagnostic purpose. To close this gap, we\nproposed a novel Feature-Oriented Radiology Task Evaluation (FORTE) to estimate\nthe report's clinical relevance (lesion feature and landmarks). Notably, the\nBrainGPT model scored an average FORTE F1-score of 0.71 (degree=0.661;\nlandmark=0.706; feature=0.693; impression=0.779). To demonstrate that BrainGPT\nmodels possess objective readiness to generate human-like radiology reports, we\nconducted a Turing test that enrolled 11 physician evaluators, and around 74%\nof the BrainGPT-generated captions were indistinguishable from those written by\nhumans. 
Our work embodies a holistic framework that showcased the first-hand\nexperience of curating a 3D brain CT dataset, fine-tuning anatomy-sensible\nlanguage models, and proposing robust radiology evaluation metrics.\n","authors":["Cheng-Yi Li","Kao-Jung Chang","Cheng-Fu Yang","Hsin-Yu Wu","Wenting Chen","Hritik Bansal","Ling Chen","Yi-Ping Yang","Yu-Chun Chen","Shih-Pin Chen","Jiing-Feng Lirng","Kai-Wei Chang","Shih-Hwa Chiou"],"pdf_url":"https://arxiv.org/pdf/2407.02235v1.pdf","comment":"6 figures, 5 supplementary figures, 8 supplementary tables"},{"id":"http://arxiv.org/abs/2407.02233v1","updated":"2024-07-02T12:57:42Z","published":"2024-07-02T12:57:42Z","title":"Synthetic Multimodal Question Generation","summary":" Multimodal Retrieval Augmented Generation (MMRAG) is a powerful approach to\nquestion-answering over multimodal documents. A key challenge with evaluating\nMMRAG is the paucity of high-quality datasets matching the question styles and\nmodalities of interest. In light of this, we propose SMMQG, a synthetic data\ngeneration framework. SMMQG leverages interplay between a retriever, large\nlanguage model (LLM) and large multimodal model (LMM) to generate question and\nanswer pairs directly from multimodal documents, with the questions conforming\nto specified styles and modalities. We use SMMQG to generate an MMRAG dataset\nof 1024 questions over Wikipedia documents and evaluate state-of-the-art models\nusing it, revealing insights into model performance that are attainable only\nthrough style- and modality-specific evaluation data. Next, we measure the\nquality of data produced by SMMQG via a human study. We find that the quality\nof our synthetic data is on par with the quality of the crowdsourced benchmark\nMMQA and that downstream evaluation results using both datasets strongly\nconcur.\n","authors":["Ian Wu","Sravan Jayanthi","Vijay Viswanathan","Simon Rosenberg","Sina Pakazad","Tongshuang Wu","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2407.02233v1.pdf","comment":"Submitted to ARR June 2024"},{"id":"http://arxiv.org/abs/2407.00463v2","updated":"2024-07-02T12:53:10Z","published":"2024-06-29T15:20:11Z","title":"Open-Source Conversational AI with SpeechBrain 1.0","summary":" SpeechBrain is an open-source Conversational AI toolkit based on PyTorch,\nfocused particularly on speech processing tasks such as speech recognition,\nspeech enhancement, speaker recognition, text-to-speech, and much more. It\npromotes transparency and replicability by releasing both the pre-trained\nmodels and the complete \"recipes\" of code and algorithms required for training\nthem. This paper presents SpeechBrain 1.0, a significant milestone in the\nevolution of the toolkit, which now has over 200 recipes for speech, audio, and\nlanguage processing tasks, and more than 100 models available on Hugging Face.\nSpeechBrain 1.0 introduces new technologies to support diverse learning\nmodalities, Large Language Model (LLM) integration, and advanced decoding\nstrategies, along with novel models, tasks, and modalities. 
It also includes a\nnew benchmark repository, offering researchers a unified platform for\nevaluating models across diverse tasks\n","authors":["Mirco Ravanelli","Titouan Parcollet","Adel Moumen","Sylvain de Langen","Cem Subakan","Peter Plantinga","Yingzhi Wang","Pooneh Mousavi","Luca Della Libera","Artem Ploujnikov","Francesco Paissan","Davide Borra","Salah Zaiem","Zeyu Zhao","Shucong Zhang","Georgios Karakasidis","Sung-Lin Yeh","Aku Rouhe","Rudolf Braun","Florian Mai","Juan Zuluaga-Gomez","Seyed Mahed Mousavi","Andreas Nautsch","Xuechen Liu","Sangeet Sagar","Jarod Duret","Salima Mdhaffar","Gaelle Laperriere","Renato De Mori","Yannick Esteve"],"pdf_url":"https://arxiv.org/pdf/2407.00463v2.pdf","comment":"Submitted to JMLR (Machine Learning Open Source Software)"},{"id":"http://arxiv.org/abs/2404.05091v4","updated":"2024-07-02T12:46:23Z","published":"2024-04-07T22:16:50Z","title":"MM-MATH: Advancing Multimodal Math Evaluation with Process Evaluation\n and Fine-grained Classification","summary":" To advance the evaluation of multimodal math reasoning in large multimodal\nmodels (LMMs), this paper introduces a novel benchmark, MM-MATH. MM-MATH\nconsists of 5,929 open-ended middle school math problems with visual contexts,\nwith fine-grained classification across difficulty, grade level, and knowledge\npoints. Unlike existing benchmarks relying on binary answer comparison, MM-MATH\nincorporates both outcome and process evaluations. Process evaluation employs\nLMM-as-a-judge to automatically analyze solution steps, identifying and\ncategorizing errors into specific error types. Extensive evaluation of ten\nmodels on MM-MATH reveals significant challenges for existing LMMs,\nhighlighting their limited utilization of visual information and struggles with\nhigher-difficulty problems. The best-performing model achieves only 31%\naccuracy on MM-MATH, compared to 82% for humans. This highlights the\nchallenging nature of our benchmark for existing models and the significant gap\nbetween the multimodal reasoning capabilities of current models and humans. Our\nprocess evaluation reveals that diagram misinterpretation is the most common\nerror, accounting for more than half of the total error cases, underscoring the\nneed for improved image comprehension in multimodal reasoning.\n","authors":["Kai Sun","Yushi Bai","Ji Qi","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2404.05091v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09390v2","updated":"2024-07-02T12:42:59Z","published":"2024-02-14T18:41:19Z","title":"HGOT: Hierarchical Graph of Thoughts for Retrieval-Augmented In-Context\n Learning in Factuality Evaluation","summary":" With the widespread adoption of large language models (LLMs) in numerous\napplications, the challenge of factuality and the propensity for hallucinations\nhas emerged as a significant concern. To address this issue, particularly in\nretrieval-augmented in-context learning, we introduce the hierarchical graph of\nthoughts (HGOT), a structured, multi-layered graph approach designed to enhance\nthe retrieval of pertinent passages during in-context learning. The framework\nutilizes the emergent planning capabilities of LLMs, employing the\ndivide-and-conquer strategy to break down complex queries into manageable\nsub-queries. 
It refines self-consistency majority voting for answer selection,\nwhich incorporates the recently proposed citation recall and precision metrics\nto assess the quality of thoughts, linking an answer's credibility\nintrinsically to the thought's quality. This methodology introduces a weighted\nsystem in majority voting, prioritizing answers based on the citation quality\nof their thoughts. Additionally, we propose a scoring mechanism for evaluating\nretrieved passages, considering factors such as citation frequency and quality,\nself-consistency confidence, and the retrieval module's ranking. Experiments\nindicate that HGOT excels as a versatile approach, outperforming competing\nmodels in FEVER by up to $7\\%$ and matching leading models such as\nRetrieve-then-Read in Open-SQuAD, and DSP in HotPotQA, demonstrating its\nefficacy in enhancing LLMs' factuality.\n","authors":["Yihao Fang","Stephen W. Thomas","Xiaodan Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.09390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16546v2","updated":"2024-07-02T12:23:37Z","published":"2024-05-26T12:30:20Z","title":"Cocktail: A Comprehensive Information Retrieval Benchmark with\n LLM-Generated Documents Integration","summary":" The proliferation of Large Language Models (LLMs) has led to an influx of\nAI-generated content (AIGC) on the internet, transforming the corpus of\nInformation Retrieval (IR) systems from solely human-written to a coexistence\nwith LLM-generated content. The impact of this surge in AIGC on IR systems\nremains an open question, with the primary challenge being the lack of a\ndedicated benchmark for researchers. In this paper, we introduce Cocktail, a\ncomprehensive benchmark tailored for evaluating IR models in this mixed-sourced\ndata landscape of the LLM era. Cocktail consists of 16 diverse datasets with\nmixed human-written and LLM-generated corpora across various text retrieval\ntasks and domains. Additionally, to avoid the potential bias from previously\nincluded dataset information in LLMs, we also introduce an up-to-date dataset,\nnamed NQ-UTD, with queries derived from recent events. Through conducting over\n1,000 experiments to assess state-of-the-art retrieval models against the\nbenchmarked datasets in Cocktail, we uncover a clear trade-off between ranking\nperformance and source bias in neural retrieval models, highlighting the\nnecessity for a balanced approach in designing future IR systems. We hope\nCocktail can serve as a foundational resource for IR research in the LLM era,\nwith all data and code publicly available at\n\\url{https://github.com/KID-22/Cocktail}.\n","authors":["Sunhao Dai","Weihao Liu","Yuqi Zhou","Liang Pang","Rongju Ruan","Gang Wang","Zhenhua Dong","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.16546v2.pdf","comment":"Accepted by Findings of ACL 2024; Datasets Link:\n https://huggingface.co/IR-Cocktail"},{"id":"http://arxiv.org/abs/2407.02211v1","updated":"2024-07-02T12:21:14Z","published":"2024-07-02T12:21:14Z","title":"PromptIntern: Saving Inference Costs by Internalizing Recurrent Prompt\n during Large Language Model Fine-tuning","summary":" Large language models (LLMs) have played a fundamental role in various\nnatural language processing tasks with powerful prompt techniques. 
However, in\nreal-world applications, there are often similar prompt components for repeated\nqueries, which causes significant computational burdens during inference.\nExisting prompt compression and direct fine-tuning methods aim to tackle these\nchallenges, yet they frequently struggle to strike an optimal balance between\ncost-efficiency and performance effectiveness, especially in complex tasks such\nas NL2Code. In this paper, we propose a novel method named PromptIntern to\ninternalize the prompt knowledge into model parameters via progressive\nfine-tuning. Our method enables LLMs to emulate the human learning process for\na new task, where detailed templates and examples in a prompt are gradually\ninternalized and phased out progressively as the model grows accustomed to the\ntask. Extensive experiments demonstrate that our method reduces inference\ntokens by over 90%, speeds up inference by 4.2 times, and saves 88.3% of the monetary\ncost.\n","authors":["Jiaru Zou","Mengyu Zhou","Tao Li","Shi Han","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17129v2","updated":"2024-07-02T12:18:51Z","published":"2024-05-27T12:47:40Z","title":"TEII: Think, Explain, Interact and Iterate with Large Language Models to\n Solve Cross-lingual Emotion Detection","summary":" Cross-lingual emotion detection allows us to analyze global trends, public\nopinion, and social phenomena at scale. We participated in the Explainability\nof Cross-lingual Emotion Detection (EXALT) shared task, achieving an F1-score\nof 0.6046 on the evaluation set for the emotion detection sub-task. Our system\noutperformed the baseline by more than 0.16 F1-score absolute, and ranked\nsecond amongst competing systems. We conducted experiments using fine-tuning,\nzero-shot learning, and few-shot learning for Large Language Model (LLM)-based\nmodels as well as embedding-based BiLSTM and KNN for non-LLM-based techniques.\nAdditionally, we introduced two novel methods: the Multi-Iteration Agentic\nWorkflow and the Multi-Binary-Classifier Agentic Workflow. We found that\nLLM-based approaches provided good performance on multilingual emotion\ndetection. Furthermore, ensembles combining all of the models we experimented with yielded\nhigher F1-scores than any single approach alone.\n","authors":["Long Cheng","Qihao Shao","Christine Zhao","Sheng Bi","Gina-Anne Levow"],"pdf_url":"https://arxiv.org/pdf/2405.17129v2.pdf","comment":"Proceedings of the 13th Workshop on Computational Approaches to\n Subjectivity, Sentiment, & Social Media Analysis (ACL 2024)"},{"id":"http://arxiv.org/abs/2407.02209v1","updated":"2024-07-02T12:17:07Z","published":"2024-07-02T12:17:07Z","title":"Generative Monoculture in Large Language Models","summary":" We introduce {\em generative monoculture}, a behavior observed in large\nlanguage models (LLMs) characterized by a significant narrowing of model output\ndiversity relative to available training data for a given task: for example,\ngenerating only positive book reviews for books with a mixed reception. While\nin some cases, generative monoculture enhances performance (e.g., LLMs more\noften produce efficient code), the dangers are exacerbated in others (e.g.,\nLLMs refuse to share diverse opinions). As LLMs are increasingly used in\nhigh-impact settings such as education and web search, careful maintenance of\nLLM output diversity is essential to ensure a variety of facts and perspectives\nare preserved over time. 
We experimentally demonstrate the prevalence of\ngenerative monoculture through analysis of book review and code generation\ntasks, and find that simple countermeasures such as altering sampling or\nprompting strategies are insufficient to mitigate the behavior. Moreover, our\nresults suggest that the root causes of generative monoculture are likely\nembedded within the LLM's alignment processes, suggesting a need for developing\nfine-tuning paradigms that preserve or promote diversity.\n","authors":["Fan Wu","Emily Black","Varun Chandrasekaran"],"pdf_url":"https://arxiv.org/pdf/2407.02209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02208v1","updated":"2024-07-02T12:15:15Z","published":"2024-07-02T12:15:15Z","title":"How to Learn in a Noisy World? Self-Correcting the Real-World Data Noise\n on Machine Translation","summary":" The massive amounts of web-mined parallel data contain large amounts of\nnoise. Semantic misalignment, as the primary source of the noise, poses a\nchallenge for training machine translation systems. In this paper, we first\nstudy the impact of real-world hard-to-detect misalignment noise by proposing a\nprocess to simulate the realistic misalignment controlled by semantic\nsimilarity. After quantitatively analyzing the impact of simulated misalignment\non machine translation, we show the limited effectiveness of widely used\npre-filters to improve the translation performance, underscoring the necessity\nof more fine-grained ways to handle data noise. By observing the increasing\nreliability of the model's self-knowledge for distinguishing misaligned and\nclean data at the token level, we propose a self-correction approach which\nleverages the model's prediction distribution to revise the training\nsupervision from the ground-truth data over training time. Through\ncomprehensive experiments, we show that our self-correction method not only\nimproves translation performance in the presence of simulated misalignment\nnoise but also proves effective for real-world noisy web-mined datasets across\neight translation tasks.\n","authors":["Yan Meng","Di Wu","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2407.02208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02203v1","updated":"2024-07-02T12:06:40Z","published":"2024-07-02T12:06:40Z","title":"Automatic Adaptation Rule Optimization via Large Language Models","summary":" Rule-based adaptation is a foundational approach to self-adaptation,\ncharacterized by its human readability and rapid response. However, building\nhigh-performance and robust adaptation rules is often a challenge because it\nessentially involves searching for the optimal design in a complex (variable)\nspace. In response, this paper attempts to employ large language models (LLMs)\nas an optimizer to construct and optimize adaptation rules, leveraging the\ncommon sense and reasoning capabilities inherent in LLMs. Preliminary\nexperiments conducted in SWIM have validated the effectiveness and limitations\nof our method.\n","authors":["Yusei Ishimizu","Jialong Li","Jinglue Xu","Jinyu Cai","Hitoshi Iba","Kenji Tei"],"pdf_url":"https://arxiv.org/pdf/2407.02203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13659v2","updated":"2024-07-02T11:57:07Z","published":"2024-02-21T09:45:08Z","title":"Privacy-Preserving Instructions for Aligning Large Language Models","summary":" Service providers of large language model (LLM) applications collect user\ninstructions in the wild and use them in further aligning LLMs with users'\nintentions. 
These instructions, which potentially contain sensitive\ninformation, are annotated by human workers in the process. This poses a new\nprivacy risk not addressed by the typical private optimization. To this end, we\npropose using synthetic instructions to replace real instructions in data\nannotation and model fine-tuning. Formal differential privacy is guaranteed by\ngenerating those synthetic instructions using privately fine-tuned generators.\nCrucial in achieving the desired utility is our novel filtering algorithm that\nmatches the distribution of the synthetic instructions to that of the real\nones. In both supervised fine-tuning and reinforcement learning from human\nfeedback, our extensive experiments demonstrate the high utility of the final\nset of synthetic instructions by showing comparable results to real\ninstructions. In supervised fine-tuning, models trained with private synthetic\ninstructions outperform leading open-source models such as Vicuna.\n","authors":["Da Yu","Peter Kairouz","Sewoong Oh","Zheng Xu"],"pdf_url":"https://arxiv.org/pdf/2402.13659v2.pdf","comment":"ICML 2024. Code available at\n https://github.com/google-research/google-research/tree/master/dp_instructions"},{"id":"http://arxiv.org/abs/2402.10770v3","updated":"2024-07-02T11:46:09Z","published":"2024-02-16T15:48:33Z","title":"How Reliable Are Automatic Evaluation Methods for Instruction-Tuned\n LLMs?","summary":" Work on instruction-tuned Large Language Models (LLMs) has used automatic\nmethods based on text overlap and LLM judgments as cost-effective alternatives\nto human evaluation. In this paper, we perform a meta-evaluation of such\nmethods and assess their reliability across a broad range of tasks. We observe\nthat while automatic evaluation methods can approximate human ratings under\nspecific conditions, their validity is highly context-dependent. Specifically,\nthe simple ROUGE-L metric correlates well with human ratings for short-answer\nEnglish tasks but is unreliable in free-form generation tasks and cross-lingual\ntransfer. The effectiveness of the more advanced method of using GPT-4 as a\njudge diminishes significantly if reference answers are not included in the\nprompt, which is the scenario where this method has the potential to provide\nthe most value compared to other metrics. Our findings enhance the\nunderstanding of how automatic methods should be applied and interpreted when\ndeveloping and evaluating instruction-tuned LLMs.\n","authors":["Ehsan Doostmohammadi","Oskar Holmström","Marco Kuhlmann"],"pdf_url":"https://arxiv.org/pdf/2402.10770v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09781v3","updated":"2024-07-02T11:24:34Z","published":"2023-12-15T13:33:18Z","title":"GSQA: An End-to-End Model for Generative Spoken Question Answering","summary":" In recent advancements in spoken question answering (QA), end-to-end models\nhave made significant strides. However, previous research has primarily focused\non extractive span selection. While this extractive-based approach is effective\nwhen answers are present directly within the input, it falls short in\naddressing abstractive questions, where answers are not directly extracted but\ninferred from the given information. To bridge this gap, we introduce the first\nend-to-end Generative Spoken Question Answering (GSQA) model that empowers the\nsystem to engage in abstractive reasoning. The challenge in training our GSQA\nmodel lies in the absence of a spoken abstractive QA dataset. 
We propose using\ntext models for initialization and leveraging the extractive QA dataset to\ntransfer knowledge from the text generative model to the spoken generative\nmodel. Experimental results indicate that our model surpasses the previous\nextractive model by 3% on extractive QA datasets. Furthermore, the GSQA model\nhas only been fine-tuned on the spoken extractive QA dataset. Despite not\nhaving seen any spoken abstractive QA data, it can still closely match the\nperformance of the cascade model. In conclusion, our GSQA model shows the\npotential to generalize to a broad spectrum of questions, thus further\nexpanding the spoken question answering capabilities of abstractive QA. Our\ncode is available at https://voidful.github.io/GSQA\n","authors":["Min-Han Shih","Ho-Lam Chung","Yu-Chi Pai","Ming-Hao Hsu","Guan-Ting Lin","Shang-Wen Li","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2312.09781v3.pdf","comment":"5 pages, 2 figures, Interspeech 2024"},{"id":"http://arxiv.org/abs/2403.03823v5","updated":"2024-07-02T11:22:14Z","published":"2024-03-06T16:10:01Z","title":"A Modular Approach for Multimodal Summarization of TV Shows","summary":" In this paper we address the task of summarizing television shows, which\ntouches key areas in AI research: complex reasoning, multiple modalities, and\nlong narratives. We present a modular approach where separate components\nperform specialized sub-tasks which we argue affords greater flexibility\ncompared to end-to-end methods. Our modules involve detecting scene boundaries,\nreordering scenes so as to minimize the number of cuts between different\nevents, converting visual information to text, summarizing the dialogue in each\nscene, and fusing the scene summaries into a final summary for the entire\nepisode. We also present a new metric, PRISMA (Precision and Recall EvaluatIon\nof Summary FActs), to measure both precision and recall of generated summaries,\nwhich we decompose into atomic facts. Tested on the recently released\nSummScreen3D dataset, our method produces higher quality summaries than\ncomparison models, as measured with ROUGE and our new fact-based metric, and as\nassessed by human evaluators.\n","authors":["Louis Mahon","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2403.03823v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02147v1","updated":"2024-07-02T10:43:49Z","published":"2024-07-02T10:43:49Z","title":"LlamAr & GemmAr: Enhancing LLMs Through Arabic Instruction-Tuning","summary":" Large language models (LLMs) have greatly impacted the natural language\nprocessing (NLP) field, particularly for the English language. These models\nhave demonstrated capabilities in understanding and generating human-like text.\nThe success of language models largely depends on the availability of\nhigh-quality instruction datasets, which consist of detailed task descriptions\nand corresponding responses that are essential for training the models to\naccurately address a variety of prompts. However, the availability and quality\nof these resources vary by language. While models perform well in English, they\noften struggle with languages like Arabic, due to the lack of datasets for\nfine-tuning Arabic-specific tasks. To address this issue, we introduce\nInstAr-500k, a new Arabic instruction dataset created by generating and\ncollecting content that covers several domains and instruction types. 
We then\nassess this dataset by fine-tuning two open-source models, Llama-3-8B-Instruct\nand Gemma-7B-IT, on several downstream tasks to scale improvements in their\nfunctionality. Based on multiple evaluations, our fine-tuned models achieve\nstate-of-the-art performance on several Arabic NLP benchmarks. These outcomes\nemphasize the effectiveness of our dataset in elevating the capabilities of\nlanguage models for Arabic. Our instruction dataset bridges the performance gap\nbetween English and Arabic language models by providing resources that amplify\nArabic NLP development. Building on this foundation, we developed two\nstate-of-the-art models, LlamAr-8B and GemmAr-7B, which are specifically tuned\nto excel at a wide range of Arabic NLP tasks.\n","authors":["Hasna Chouikhi","Manel Aloui","Cyrine Ben Hammou","Ghaith Chaabane","Haithem Kchaou","Chehir Dhaouadi"],"pdf_url":"https://arxiv.org/pdf/2407.02147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02138v1","updated":"2024-07-02T10:33:31Z","published":"2024-07-02T10:33:31Z","title":"Efficient Nearest Neighbor based Uncertainty Estimation for Natural\n Language Processing Tasks","summary":" Trustworthy prediction in Deep Neural Networks (DNNs), including Pre-trained\nLanguage Models (PLMs), is important for safety-critical applications in the\nreal world. However, DNNs often suffer from uncertainty estimation, such as\nmiscalibration. In particular, approaches that require multiple stochastic\ninference can mitigate this problem, but the expensive cost of inference makes\nthem impractical. In this study, we propose $k$-Nearest Neighbor Uncertainty\nEstimation ($k$NN-UE), which is an uncertainty estimation method that uses the\ndistances from the neighbors and label-existence ratio of neighbors.\nExperiments on sentiment analysis, natural language inference, and named entity\nrecognition show that our proposed method outperforms the baselines or recent\ndensity-based methods in confidence calibration, selective prediction, and\nout-of-distribution detection. Moreover, our analyses indicate that introducing\ndimension reduction or approximate nearest neighbor search inspired by recent\n$k$NN-LM studies reduces the inference overhead without significantly degrading\nestimation performance when combining them appropriately.\n","authors":["Wataru Hashimoto","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.02138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08906v4","updated":"2024-07-02T10:32:47Z","published":"2023-12-14T13:11:35Z","title":"Using eye tracking to investigate what native Chinese speakers notice\n about linguistic landscape images","summary":" Linguistic landscape is an important field in sociolinguistic research. Eye\ntracking technology is a common technology in psychological research. There are\nfew cases of using eye movement to study linguistic landscape. This paper uses\neye tracking technology to study the actual fixation of the linguistic\nlandscape and finds that in the two dimensions of fixation time and fixation\ntimes, the fixation of native Chinese speakers to the linguistic landscape is\nhigher than that of the general landscape. 
This paper argues that this\nphenomenon is due to the higher information density of linguistic landscapes.\nAt the same time, the article also discusses other possible reasons for this\nphenomenon.\n","authors":["Zichao Wei","Yewei Qin"],"pdf_url":"https://arxiv.org/pdf/2312.08906v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02136v1","updated":"2024-07-02T10:29:09Z","published":"2024-07-02T10:29:09Z","title":"Black Big Boxes: Do Language Models Hide a Theory of Adjective Order?","summary":" In English and other languages, multiple adjectives in a complex noun phrase\nshow intricate ordering patterns that have been a target of much linguistic\ntheory. These patterns offer an opportunity to assess the ability of language\nmodels (LMs) to learn subtle rules of language involving factors that cross the\ntraditional divisions of syntax, semantics, and pragmatics. We review existing\nhypotheses designed to explain Adjective Order Preferences (AOPs) in humans and\ndevelop a setup to study AOPs in LMs: we present a reusable corpus of adjective\npairs and define AOP measures for LMs. With these tools, we study a series of\nLMs across intermediate checkpoints during training. We find that all models'\npredictions are much closer to human AOPs than predictions generated by factors\nidentified in theoretical linguistics. At the same time, we demonstrate that\nthe observed AOPs in LMs are strongly correlated with the frequency of the\nadjective pairs in the training data and report limited generalization to\nunseen combinations. This highlights the difficulty in establishing the link\nbetween LM performance and linguistic theory. We therefore conclude with a road\nmap for future studies our results set the stage for, and a discussion of key\nquestions about the nature of knowledge in LMs and their ability to generalize\nbeyond the training sets.\n","authors":["Jaap Jumelet","Lisa Bylinina","Willem Zuidema","Jakub Szymanik"],"pdf_url":"https://arxiv.org/pdf/2407.02136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02122v1","updated":"2024-07-02T10:12:06Z","published":"2024-07-02T10:12:06Z","title":"Fake News Detection: It's All in the Data!","summary":" This comprehensive survey serves as an indispensable resource for researchers\nembarking on the journey of fake news detection. By highlighting the pivotal\nrole of dataset quality and diversity, it underscores the significance of these\nelements in the effectiveness and robustness of detection models. The survey\nmeticulously outlines the key features of datasets, various labeling systems\nemployed, and prevalent biases that can impact model performance. Additionally,\nit addresses critical ethical issues and best practices, offering a thorough\noverview of the current state of available datasets. 
Our contribution to this\nfield is further enriched by the provision of a GitHub repository, which\nconsolidates publicly accessible datasets into a single, user-friendly portal.\nThis repository is designed to facilitate and stimulate further research and\ndevelopment efforts aimed at combating the pervasive issue of fake news.\n","authors":["Soveatin Kuntur","Anna Wróblewska","Marcin Paprzycki","Maria Ganzha"],"pdf_url":"https://arxiv.org/pdf/2407.02122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02119v1","updated":"2024-07-02T10:09:19Z","published":"2024-07-02T10:09:19Z","title":"Cost-Effective Proxy Reward Model Construction with On-Policy and Active\n Learning","summary":" Reinforcement learning with human feedback (RLHF), as a widely adopted\napproach in current large language model pipelines, is \\textit{bottlenecked by\nthe size of human preference data}. While traditional methods rely on offline\npreference dataset constructions, recent approaches have shifted towards online\nsettings, where a learner uses a small amount of labeled seed data and a large\npool of unlabeled prompts to iteratively construct new preference data through\nself-generated responses and high-quality reward/preference feedback. However,\nmost current online algorithms still focus on preference labeling during policy\nmodel updating with given feedback oracles, which incurs significant expert\nquery costs. \\textit{We are the first to explore cost-effective proxy reward\noracles construction strategies for further labeling preferences or rewards\nwith extremely limited labeled data and expert query budgets}. Our approach\nintroduces two key innovations: (1) on-policy query to avoid OOD and imbalance\nissues in seed data, and (2) active learning to select the most informative\ndata for preference queries. Using these methods, we train an evaluation model\nwith minimal expert-labeled data, which then effectively labels nine times more\npreference pairs for further RLHF training. For instance, our model using\nDirect Preference Optimization (DPO) gains around 1% average improvement\non AlpacaEval2, MMLU-5shot and MMLU-0shot, with only 1.7K query cost. Our\nmethodology is orthogonal to other direct expert query-based strategies and\ntherefore might be integrated with them to further reduce query costs.\n","authors":["Yifang Chen","Shuohang Wang","Ziyi Yang","Hiteshi Sharma","Nikos Karampatziakis","Donghan Yu","Kevin Jamieson","Simon Shaolei Du","Yelong Shen"],"pdf_url":"https://arxiv.org/pdf/2407.02119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02118v1","updated":"2024-07-02T10:06:41Z","published":"2024-07-02T10:06:41Z","title":"Breaking Language Barriers: Cross-Lingual Continual Pre-Training at\n Scale","summary":" In recent years, Large Language Models (LLMs) have made significant strides\ntowards Artificial General Intelligence. However, training these models from\nscratch requires substantial computational resources and vast amounts of text\ndata. In this paper, we explore an alternative approach to constructing an LLM\nfor a new language by continually pretraining (CPT) from existing pretrained\nLLMs, instead of using randomly initialized parameters. Based on parallel\nexperiments on 40 model sizes ranging from 40M to 5B parameters, we find that\n1) CPT converges faster and saves significant resources in a scalable manner;\n2) CPT adheres to an extended scaling law derived from Hoffmann et al. 
(2022)\nwith a joint data-parameter scaling term; 3) The compute-optimal data-parameter\nallocation for CPT markedly differs based on our estimated scaling factors; 4)\nThe effectiveness of transfer at scale is influenced by training duration and\nlinguistic properties, while robust to data replaying, a method that\neffectively mitigates catastrophic forgetting in CPT. We hope our findings\nprovide deeper insights into the transferability of LLMs at scale for the\nresearch community.\n","authors":["Wenzhen Zheng","Wenbo Pan","Xu Xu","Libo Qin","Li Yue","Ming Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02118v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2407.02099v1","updated":"2024-07-02T09:36:54Z","published":"2024-07-02T09:36:54Z","title":"Helpful assistant or fruitful facilitator? Investigating how personas\n affect language model behavior","summary":" One way to personalize and steer generations from large language models (LLM)\nis to assign a persona: a role that describes how the user expects the LLM to\nbehave (e.g., a helpful assistant, a teacher, a woman). This paper investigates\nhow personas affect diverse aspects of model behavior. We assign to seven LLMs\n162 personas from 12 categories spanning variables like gender, sexual\norientation, and occupation. We prompt them to answer questions from five\ndatasets covering objective (e.g., questions about math and history) and\nsubjective tasks (e.g., questions about beliefs and values). We also compare\npersona's generations to two baseline settings: a control persona setting with\n30 paraphrases of \"a helpful assistant\" to control for models' prompt\nsensitivity, and an empty persona setting where no persona is assigned. We find\nthat for all models and datasets, personas show greater variability than the\ncontrol setting and that some measures of persona behavior generalize across\nmodels.\n","authors":["Pedro Henrique Luz de Araujo","Benjamin Roth"],"pdf_url":"https://arxiv.org/pdf/2407.02099v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2406.12381v2","updated":"2024-07-02T09:21:03Z","published":"2024-06-18T08:09:58Z","title":"QOG:Question and Options Generation based on Language Model","summary":" Question-Options Generation (QOG) is a task that involves generating a set of\nquestion-options pairs given context. This task has various applications,\nincluding fine-tuning large models, information retrieval, and automated\nmultiple-choice question generation for education. In this paper, we develop\nQOG models using three different methods based on fine-tuning\nsequence-to-sequence language models (LMs). Experiments demonstrate that the\nend-to-end QOG model is computationally efficient and stable during both\ntraining and inference, outperforming other methods. 
Furthermore, our analysis\nindicates that our QOG models are competitive on the QOG task compared to the\nlarge language model Llama 3-8B.\n","authors":["Jincheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.12381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02067v1","updated":"2024-07-02T08:55:41Z","published":"2024-07-02T08:55:41Z","title":"Crossroads of Continents: Automated Artifact Extraction for Cultural\n Adaptation with Large Multimodal Models","summary":" In this work, we present a comprehensive three-phase study to examine (1) the\neffectiveness of large multimodal models (LMMs) in recognizing cultural\ncontexts; (2) the accuracy of their representations of diverse cultures; and\n(3) their ability to adapt content across cultural boundaries. We first\nintroduce Dalle Street, a large-scale dataset generated by DALL-E 3 and\nvalidated by humans, containing 9,935 images of 67 countries and 10 concept\nclasses. We reveal disparities in cultural understanding at the sub-region\nlevel with both open-weight (LLaVA) and closed-source (GPT-4V) models on Dalle\nStreet and other existing benchmarks. Next, we assess models' deeper culture\nunderstanding by an artifact extraction task and identify over 18,000 artifacts\nassociated with different countries. Finally, we propose a highly composable\npipeline, CultureAdapt, to adapt images from culture to culture. Our findings\nreveal a nuanced picture of the cultural competence of LMMs, highlighting the\nneed to develop culture-aware systems. Dataset and code are available at\nhttps://github.com/iamshnoo/crossroads\n","authors":["Anjishnu Mukherjee","Ziwei Zhu","Antonios Anastasopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.02067v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2407.02066v1","updated":"2024-07-02T08:55:40Z","published":"2024-07-02T08:55:40Z","title":"BiasDora: Exploring Hidden Biased Associations in Vision-Language Models","summary":" Existing works examining Vision Language Models (VLMs) for social biases\npredominantly focus on a limited set of documented bias associations, such as\ngender:profession or race:crime. This narrow scope often overlooks a vast range\nof unexamined implicit associations, restricting the identification and, hence,\nmitigation of such biases. We address this gap by probing VLMs to (1) uncover\nhidden, implicit associations across 9 bias dimensions. We systematically\nexplore diverse input and output modalities and (2) demonstrate how biased\nassociations vary in their negativity, toxicity, and extremity. Our work (3)\nidentifies subtle and extreme biases that are typically not recognized by\nexisting methodologies. We make the Dataset of retrieved associations, (Dora),\npublicly available here https://github.com/chahatraj/BiasDora.\n","authors":["Chahat Raj","Anjishnu Mukherjee","Aylin Caliskan","Antonios Anastasopoulos","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.02066v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.16958v2","updated":"2024-07-02T08:53:09Z","published":"2024-04-25T18:12:43Z","title":"A Closer Look at Classification Evaluation Metrics and a Critical\n Reflection of Common Evaluation Practice","summary":" Classification systems are evaluated in a countless number of papers.\nHowever, we find that evaluation practice is often nebulous. Frequently,\nmetrics are selected without arguments, and blurry terminology invites\nmisconceptions. 
For instance, many works use so-called 'macro' metrics to rank\nsystems (e.g., 'macro F1') but do not clearly specify what they would expect\nfrom such a 'macro' metric. This is problematic, since picking a metric can\naffect research findings, and thus any clarity in the process should be\nmaximized.\n Starting from the intuitive concepts of bias and prevalence, we perform an\nanalysis of common evaluation metrics. The analysis helps us understand the\nmetrics' underlying properties, and how they align with expectations as found\nexpressed in papers. Then we reflect on the practical situation in the field,\nand survey evaluation practice in recent shared tasks. We find that metric\nselection is often not supported with convincing arguments, an issue that can\nmake a system ranking seem arbitrary. Our work aims at providing an overview and\nguidance for more informed and transparent metric selection, fostering\nmeaningful evaluation.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2404.16958v2.pdf","comment":"appeared in TACL journal. MIT press publication available at\n https://doi.org/10.1162/tacl_a_00675"},{"id":"http://arxiv.org/abs/2407.02062v1","updated":"2024-07-02T08:49:43Z","published":"2024-07-02T08:49:43Z","title":"Are Data Augmentation Methods in Named Entity Recognition Applicable for\n Uncertainty Estimation?","summary":" This work investigates the impact of data augmentation on confidence\ncalibration and uncertainty estimation in Named Entity Recognition (NER) tasks.\nFor the future advance of NER in safety-critical fields like healthcare and\nfinance, it is essential to achieve accurate predictions with calibrated\nconfidence when applying Deep Neural Networks (DNNs), including Pre-trained\nLanguage Models (PLMs), as a real-world application. However, DNNs are prone to\nmiscalibration, which limits their applicability. Moreover, existing methods\nfor calibration and uncertainty estimation are computationally expensive. Our\ninvestigation in NER found that data augmentation improves calibration and\nuncertainty in cross-genre and cross-lingual settings, especially in the\nin-domain setting. Furthermore, we showed that the calibration for NER tends to be more\neffective when the perplexity of the sentences generated by data augmentation\nis lower, and that increasing the size of the augmentation further improves\ncalibration and uncertainty.\n","authors":["Wataru Hashimoto","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.02062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01274v2","updated":"2024-07-02T08:47:45Z","published":"2024-07-01T13:29:55Z","title":"Leveraging Large Language Models for Actionable Course Evaluation\n Student Feedback to Lecturers","summary":" End of semester student evaluations of teaching are the dominant mechanism\nfor providing feedback to academics on their teaching practice. For large\nclasses, however, the volume of feedback makes these tools impractical for this\npurpose. This paper explores the use of open-source generative AI to synthesise\nfactual, actionable and appropriate summaries of student feedback from these\nsurvey responses. In our setup, we have 742 student responses ranging over 75\ncourses in a Computer Science department. For each course, we synthesise a\nsummary of the course evaluations and actionable items for the instructor. Our\nresults reveal a promising avenue for enhancing teaching practices in the\nclassroom setting. 
Our contribution lies in demonstrating the feasibility of\nusing generative AI to produce insightful feedback for teachers, thus providing\na cost-effective means to support educators' development. Overall, our work\nhighlights the possibility of using generative AI to produce factual,\nactionable, and appropriate feedback for teachers in the classroom setting.\n","authors":["Mike Zhang","Euan D Lindsay","Frederik Bode Thorbensen","Danny Bøgsted Poulsen","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2407.01274v2.pdf","comment":"Accepted to SEFI 2024"},{"id":"http://arxiv.org/abs/2407.02056v1","updated":"2024-07-02T08:38:31Z","published":"2024-07-02T08:38:31Z","title":"Integrate the Essence and Eliminate the Dross: Fine-Grained\n Self-Consistency for Free-Form Language Generation","summary":" Self-consistency (SC), leveraging multiple samples from LLMs, shows\nsignificant gains on various reasoning tasks but struggles with free-form\ngeneration due to the difficulty of aggregating answers. Its variants, UCS and\nUSC, rely on sample selection or voting mechanisms to improve output quality.\nThese methods, however, face limitations due to their inability to fully\nutilize the nuanced consensus knowledge present within multiple candidate\nsamples, often resulting in suboptimal outputs. We propose Fine-Grained\nSelf-Consistency (FSC) to address these limitations by extracting and\nintegrating segment-level commonalities from candidate samples, enhancing the\nperformance of LLMs both in open-ended and reasoning tasks. Based on this, we\npresent two additional strategies: candidate filtering, which enhances overall\nquality by identifying highly similar candidate sets, and merging, which\nreduces input token requirements by combining similar samples. The\neffectiveness of FSC is demonstrated through extensive experiments on various\ntasks, including summarization, code generation, and mathematical reasoning,\nusing GPT-3.5-turbo and GPT-4. The results indicate significant improvements\nover baseline methods, showcasing the potential of FSC to optimize output\nquality by effectively synthesizing fine-grained consensus knowledge from\nmultiple samples.\n","authors":["Xinglin Wang","Yiwei Li","Shaoxiong Feng","Peiwen Yuan","Boyuan Pan","Heda Wang","Yao Hu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2407.02056v1.pdf","comment":"Accepted to ACL2024 Main Conference"},{"id":"http://arxiv.org/abs/2407.02049v1","updated":"2024-07-02T08:23:38Z","published":"2024-07-02T08:23:38Z","title":"Accompanied Singing Voice Synthesis with Fully Text-controlled Melody","summary":" Text-to-song (TTSong) is a music generation task that synthesizes accompanied\nsinging voices. Current TTSong methods, inherited from singing voice synthesis\n(SVS), require melody-related information that can sometimes be impractical,\nsuch as music scores or MIDI sequences. We present MelodyLM, the first TTSong\nmodel that generates high-quality song pieces with fully text-controlled\nmelodies, achieving minimal user requirements and maximum control flexibility.\nMelodyLM explicitly models MIDI as the intermediate melody-related feature and\nsequentially generates vocal tracks in a language model manner, conditioned on\ntextual and vocal prompts. The accompaniment music is subsequently synthesized\nby a latent diffusion model with hybrid conditioning for temporal alignment.\nWith minimal requirements, users only need to input lyrics and a reference\nvoice to synthesize a song sample. 
For full control, just input textual prompts\nor even directly input MIDI. Experimental results indicate that MelodyLM\nachieves superior performance in terms of both objective and subjective\nmetrics. Audio samples are available at https://melodylm666.github.io.\n","authors":["Ruiqi Li","Zhiqing Hong","Yongqi Wang","Lichao Zhang","Rongjie Huang","Siqi Zheng","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.02049v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2407.02043v1","updated":"2024-07-02T08:17:00Z","published":"2024-07-02T08:17:00Z","title":"Concise and Precise Context Compression for Tool-Using Language Models","summary":" Through reading the documentation in the context, tool-using language models\ncan dynamically extend their capability using external tools. The cost is that\nwe have to input lengthy documentation every time the model needs to use the\ntool, occupying the input window as well as slowing down the decoding process.\n Given the progress in general-purpose compression, soft context compression\nis a suitable approach to alleviate the problem. However, when compressing tool\ndocumentation, existing methods suffer from the weaknesses of key information\nloss (specifically, tool/parameter name errors) and difficulty in adjusting the\nlength of compressed sequences based on documentation lengths.\n To address these problems, we propose two strategies for compressing tool\ndocumentation into concise and precise summary sequences for tool-using\nlanguage models. 1) Selective compression strategy mitigates key information\nloss by deliberately retaining key information as raw text tokens. 2) Block\ncompression strategy involves dividing tool documentation into short chunks and\nthen employing a fixed-length compression model to achieve variable-length\ncompression. This strategy facilitates the flexible adjustment of the\ncompression ratio.\n Results on API-Bank and APIBench show that our approach reaches a performance\ncomparable to the upper-bound baseline under up to 16x compression ratio.\n","authors":["Yang Xu","Yunlong Feng","Honglin Mu","Yutai Hou","Yitong Li","Xinghao Wang","Wanjun Zhong","Zhongyang Li","Dandan Tu","Qingfu Zhu","Min Zhang","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2407.02043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02042v1","updated":"2024-07-02T08:16:43Z","published":"2024-07-02T08:16:43Z","title":"Fake News Detection and Manipulation Reasoning via Large Vision-Language\n Models","summary":" Fake news becomes a growing threat to information security and public opinion\nwith the rapid sprawl of media manipulation. Therefore, fake news detection\nattracts widespread attention from academic community. Traditional fake news\ndetection models demonstrate remarkable performance on authenticity binary\nclassification but their ability to reason detailed faked traces based on the\nnews content remains under-explored. Furthermore, due to the lack of external\nknowledge, the performance of existing methods on fact-related news is\nquestionable, leaving their practical implementation unclear. In this paper, we\npropose a new multi-media research topic, namely manipulation reasoning.\nManipulation reasoning aims to reason manipulations based on news content. To\nsupport the research, we introduce a benchmark for fake news detection and\nmanipulation reasoning, referred to as Human-centric and Fact-related Fake News\n(HFFN). 
The benchmark highlights the centrality of humans and the high factual\nrelevance, with detailed manual annotations. HFFN encompasses four realistic\ndomains with fake news samples generated through three manipulation approaches.\nMoreover, a Multi-modal news Detection and Reasoning langUage Model (M-DRUM) is\npresented not only to judge the authenticity of multi-modal news, but also to\nprovide analytical reasoning about potential manipulations. On the feature\nextraction level, a cross-attention mechanism is employed to extract\nfine-grained fusion features from multi-modal inputs. On the reasoning level, a\nlarge vision-language model (LVLM) serves as the backbone to facilitate\nfact-related reasoning. A two-stage training framework is deployed to better\nactivate the capacity of identification and reasoning. Comprehensive\nexperiments demonstrate that our model outperforms state-of-the-art (SOTA) fake\nnews detection models and powerful LVLMs like GPT-4 and LLaVA.\n","authors":["Ruihan Jin","Ruibo Fu","Zhengqi Wen","Shuai Zhang","Yukun Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2407.02042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02039v1","updated":"2024-07-02T08:11:18Z","published":"2024-07-02T08:11:18Z","title":"Prompt Stability Scoring for Text Annotation with Large Language Models","summary":" Researchers are increasingly using language models (LMs) for text annotation.\nThese approaches rely only on a prompt telling the model to return a given\noutput according to a set of instructions. The reproducibility of LM outputs\nmay nonetheless be vulnerable to small changes in the prompt design. This calls\ninto question the replicability of classification routines. To tackle this\nproblem, researchers have typically tested a variety of semantically similar\nprompts to determine what we call \"prompt stability.\" These approaches remain\nad-hoc and task specific. In this article, we propose a general framework for\ndiagnosing prompt stability by adapting traditional approaches to intra- and\ninter-coder reliability scoring. We call the resulting metric the Prompt\nStability Score (PSS) and provide a Python package PromptStability for its\nestimation. Using six different datasets and twelve outcomes, we classify >150k\nrows of data to: a) diagnose when prompt stability is low; and b) demonstrate\nthe functionality of the package. We conclude by providing best practice\nrecommendations for applied researchers.\n","authors":["Christopher Barrie","Elli Palaiologou","Petter Törnberg"],"pdf_url":"https://arxiv.org/pdf/2407.02039v1.pdf","comment":"33 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.01441v2","updated":"2024-07-02T08:00:23Z","published":"2024-06-03T15:30:36Z","title":"LexMatcher: Dictionary-centric Data Collection for LLM-based Machine\n Translation","summary":" The fine-tuning of open-source large language models (LLMs) for machine\ntranslation has recently received considerable attention, marking a shift\ntowards data-centric research from traditional neural machine translation.\nHowever, the area of data collection for instruction fine-tuning in machine\ntranslation remains relatively underexplored. In this paper, we present\nLexMatcher, a simple yet effective method for data curation, the design of\nwhich is driven by the coverage of senses found in bilingual dictionaries. 
The\nconstruction process comprises data retrieval from an existing corpus and data\naugmentation that supplements the infrequent senses of polysemous words.\nUtilizing LLaMA2 as our base model, our approach outperforms the established\nbaselines on the WMT2022 test sets and also exhibits remarkable performance in\ntasks related to word sense disambiguation and specialized terminology\ntranslation. These results underscore the effectiveness of LexMatcher in\nenhancing LLM-based machine translation. The code, data, and models are\navailable at https://github.com/ARIES-LM/Lexmatcher-MT.git.\n","authors":["Yongjing Yin","Jiali Zeng","Yafu Li","Fandong Meng","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.01441v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02990v4","updated":"2024-07-02T07:59:40Z","published":"2024-03-05T14:11:54Z","title":"Data Augmentation using Large Language Models: Data Perspectives,\n Learning Paradigms and Challenges","summary":" In the rapidly evolving field of large language models (LLMs), data\naugmentation (DA) has emerged as a pivotal technique for enhancing model\nperformance by diversifying training examples without the need for additional\ndata collection. This survey explores the transformative impact of LLMs on DA,\nparticularly addressing the unique challenges and opportunities they present in\nthe context of natural language processing (NLP) and beyond. From both data and\nlearning perspectives, we examine various strategies that utilize LLMs for data\naugmentation, including a novel exploration of learning paradigms where\nLLM-generated data is used for diverse forms of further training. Additionally,\nthis paper highlights the primary open challenges faced in this domain, ranging\nfrom controllable data augmentation to multi-modal data augmentation. This\nsurvey highlights a paradigm shift introduced by LLMs in DA, and aims to serve\nas a comprehensive guide for researchers and practitioners.\n","authors":["Bosheng Ding","Chengwei Qin","Ruochen Zhao","Tianze Luo","Xinze Li","Guizhen Chen","Wenhan Xia","Junjie Hu","Anh Tuan Luu","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2403.02990v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02030v1","updated":"2024-07-02T07:58:46Z","published":"2024-07-02T07:58:46Z","title":"Breaking Bias, Building Bridges: Evaluation and Mitigation of Social\n Biases in LLMs via Contact Hypothesis","summary":" Large Language Models (LLMs) perpetuate social biases, reflecting prejudices\nin their training data and reinforcing societal stereotypes and inequalities.\nOur work explores the potential of the Contact Hypothesis, a concept from\nsocial psychology for debiasing LLMs. We simulate various forms of social\ncontact through LLM prompting to measure their influence on the model's biases,\nmirroring how intergroup interactions can reduce prejudices in social contexts.\nWe create a dataset of 108,000 prompts following a principled approach\nreplicating social contact to measure biases in three LLMs (LLaMA 2, Tulu, and\nNousHermes) across 13 social bias dimensions. We propose a unique debiasing\ntechnique, Social Contact Debiasing (SCD), that instruction-tunes these models\nwith unbiased responses to prompts. Our research demonstrates that LLM\nresponses exhibit social biases when subject to contact probing, but more\nimportantly, these biases can be significantly reduced by up to 40% in 1 epoch\nof instruction tuning LLaMA 2 following our SCD strategy. 
Our code and data are\navailable at https://github.com/chahatraj/breakingbias.\n","authors":["Chahat Raj","Anjishnu Mukherjee","Aylin Caliskan","Antonios Anastasopoulos","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.02030v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.01272v2","updated":"2024-07-02T07:55:24Z","published":"2024-07-01T13:25:33Z","title":"Show Less, Instruct More: Enriching Prompts with Definitions and\n Guidelines for Zero-Shot NER","summary":" Recently, several specialized instruction-tuned Large Language Models (LLMs)\nfor Named Entity Recognition (NER) have emerged. Compared to traditional NER\napproaches, these models have strong generalization capabilities. Existing LLMs\nmainly focus on zero-shot NER in out-of-domain distributions, being fine-tuned\non an extensive number of entity classes that often highly or completely\noverlap with test sets. In this work instead, we propose SLIMER, an approach\ndesigned to tackle never-seen-before named entity tags by instructing the model\non fewer examples, and by leveraging a prompt enriched with definition and\nguidelines. Experiments demonstrate that definition and guidelines yield better\nperformance, faster and more robust learning, particularly when labelling\nunseen Named Entities. Furthermore, SLIMER performs comparably to\nstate-of-the-art approaches in out-of-domain zero-shot NER, while being trained\non a reduced tag set.\n","authors":["Andrew Zamai","Andrea Zugarini","Leonardo Rigutini","Marco Ernandes","Marco Maggini"],"pdf_url":"https://arxiv.org/pdf/2407.01272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02028v1","updated":"2024-07-02T07:52:30Z","published":"2024-07-02T07:52:30Z","title":"Why does in-context learning fail sometimes? Evaluating in-context\n learning on open and closed questions","summary":" We measure the performance of in-context learning as a function of task\nnovelty and difficulty for open and closed questions. For that purpose, we\ncreated a novel benchmark consisting of hard scientific questions, each paired\nwith a context of various relevancy. We show that counter-intuitively, a\ncontext that is more aligned with the topic does not always help more than a\nless relevant context. This effect is especially visible for open questions and\nquestions of high difficulty or novelty. This result reveals a fundamental\ndifference between the treatment of closed-form and open-form questions by\nlarge-language models and shows a need for a more robust evaluation of\nin-context learning on the variety of different types of questions. It also\nposes a new question of how to optimally select a context for large language\nmodels, especially in the context of Retrieval Augmented Generation (RAG)\nsystems. 
Our results suggest that the answer to this question can be highly\napplication-dependent and might be contingent on factors including the format\nof the question, the perceived difficulty level of the questions, and the\nnovelty or popularity of the information we seek.\n","authors":["Xiang Li","Haoran Tang","Siyu Chen","Ziwei Wang","Ryan Chen","Marcin Abram"],"pdf_url":"https://arxiv.org/pdf/2407.02028v1.pdf","comment":"8 pages plus references, 4 main figures, 6 pages of supplementary\n material"},{"id":"http://arxiv.org/abs/2407.01525v2","updated":"2024-07-02T07:37:56Z","published":"2024-07-01T17:59:35Z","title":"Empowering 3D Visual Grounding with Reasoning Capabilities","summary":" Although great progress has been made in 3D visual grounding, current models\nstill rely on explicit textual descriptions for grounding and lack the ability\nto reason human intentions from implicit instructions. We propose a new task\ncalled 3D reasoning grounding and introduce a new benchmark ScanReason which\nprovides over 10K question-answer-location pairs from five reasoning types that\nrequire the synerization of reasoning and grounding. We further design our\napproach, ReGround3D, composed of the visual-centric reasoning module empowered\nby Multi-modal Large Language Model (MLLM) and the 3D grounding module to\nobtain accurate object locations by looking back to the enhanced geometry and\nfine-grained details from the 3D scenes. A chain-of-grounding mechanism is\nproposed to further boost the performance with interleaved reasoning and\ngrounding steps during inference. Extensive experiments on the proposed\nbenchmark validate the effectiveness of our proposed approach.\n","authors":["Chenming Zhu","Tai Wang","Wenwei Zhang","Kai Chen","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2407.01525v2.pdf","comment":"Accepted by ECCV 2024. A comprehensive and hierarchical 3D reasoning\n grounding benchmark in the era of foundation models. Project page:\n https://zcmax.github.io/projects/ScanReason"},{"id":"http://arxiv.org/abs/2405.10020v2","updated":"2024-07-02T07:29:04Z","published":"2024-05-16T12:02:02Z","title":"Natural Language Can Help Bridge the Sim2Real Gap","summary":" The main challenge in learning image-conditioned robotic policies is\nacquiring a visual representation conducive to low-level control. Due to the\nhigh dimensionality of the image space, learning a good visual representation\nrequires a considerable amount of visual data. However, when learning in the\nreal world, data is expensive. Sim2Real is a promising paradigm for overcoming\ndata scarcity in the real-world target domain by using a simulator to collect\nlarge amounts of cheap data closely related to the target task. However, it is\ndifficult to transfer an image-conditioned policy from sim to real when the\ndomains are very visually dissimilar. To bridge the sim2real visual gap, we\npropose using natural language descriptions of images as a unifying signal\nacross domains that captures the underlying task-relevant semantics. Our key\ninsight is that if two image observations from different domains are labeled\nwith similar language, the policy should predict similar action distributions\nfor both images. We demonstrate that training the image encoder to predict the\nlanguage description or the distance between descriptions of a sim or real\nimage serves as a useful, data-efficient pretraining step that helps learn a\ndomain-invariant image representation. 
We can then use this image encoder as\nthe backbone of an IL policy trained simultaneously on a large amount of\nsimulated and a handful of real demonstrations. Our approach outperforms widely\nused prior sim2real methods and strong vision-language pretraining baselines\nlike CLIP and R3M by 25 to 40%. See additional videos and materials at\nhttps://robin-lab.cs.utexas.edu/lang4sim2real/.\n","authors":["Albert Yu","Adeline Foote","Raymond Mooney","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2405.10020v2.pdf","comment":"To appear in RSS 2024. Project website at\n https://robin-lab.cs.utexas.edu/lang4sim2real/"},{"id":"http://arxiv.org/abs/2407.02005v1","updated":"2024-07-02T07:22:57Z","published":"2024-07-02T07:22:57Z","title":"An End-to-End Speech Summarization Using Large Language Model","summary":" Abstractive Speech Summarization (SSum) aims to generate human-like text\nsummaries from spoken content. It encounters difficulties in handling long\nspeech input and capturing the intricate cross-modal mapping between long\nspeech inputs and short text summaries. Research on large language models\n(LLMs) and multimodal information fusion has provided new insights for\naddressing these challenges. In this paper, we propose an end-to-end SSum model\nthat utilizes Q-Former as a connector for the audio-text modality and employs\nLLMs to generate text summaries directly from speech features. We adopt a\nmulti-stage training approach that includes LLM based ASR and Text\nSummarization (TSum) tasks as auxiliary tasks. ASR tasks are used to align\nfeature spaces and enhance the LLM's ability to handle longer speech. Then, we\nutilize a curriculum learning strategy to facilitate the model's transition\nfrom TSum to SSum. Finally, our model achieves competitive performance on the\nHow-2 dataset.\n","authors":["Hengchao Shang","Zongyao Li","Jiaxin Guo","Shaojun Li","Zhiqiang Rao","Yuanchang Luo","Daimeng Wei","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02005v1.pdf","comment":"InterSpeech 2024"},{"id":"http://arxiv.org/abs/2407.01994v1","updated":"2024-07-02T07:07:59Z","published":"2024-07-02T07:07:59Z","title":"Simple Augmentations of Logical Rules for Neuro-Symbolic Knowledge Graph\n Completion","summary":" High-quality and high-coverage rule sets are imperative to the success of\nNeuro-Symbolic Knowledge Graph Completion (NS-KGC) models, because they form\nthe basis of all symbolic inferences. Recent literature builds neural models\nfor generating rule sets, however, preliminary experiments show that they\nstruggle with maintaining high coverage. In this work, we suggest three simple\naugmentations to existing rule sets: (1) transforming rules to their abductive\nforms, (2) generating equivalent rules that use inverse forms of constituent\nrelations and (3) random walks that propose new rules. Finally, we prune\npotentially low quality rules. 
Experiments over four datasets and five\nruleset-baseline settings suggest that these simple augmentations consistently\nimprove results, and obtain up to 7.1 pt MRR and 8.5 pt Hits@1 gains over using\nrules without augmentations.\n","authors":["Ananjan Nandi","Navdeep Kaur","Parag Singla"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2407.01994v1.pdf","comment":"12 pages, 15 tables Published in ACL 2023"},{"id":"http://arxiv.org/abs/2407.01992v1","updated":"2024-07-02T07:06:53Z","published":"2024-07-02T07:06:53Z","title":"Is Your Large Language Model Knowledgeable or a Choices-Only Cheater?","summary":" Recent work shows that large language models (LLMs) can answer\nmultiple-choice questions using only the choices, but does this mean that MCQA\nleaderboard rankings of LLMs are largely influenced by abilities in\nchoices-only settings? To answer this, we use a contrast set that probes if\nLLMs over-rely on choices-only shortcuts in MCQA. While previous works build\ncontrast sets via expensive human annotations or model-generated data which can\nbe biased, we employ graph mining to extract contrast sets from existing MCQA\ndatasets. We use our method on UnifiedQA, a group of six commonsense reasoning\ndatasets with high choices-only accuracy, to build an 820-question contrast\nset. After validating our contrast set, we test 12 LLMs, finding that these\nmodels do not exhibit reliance on choice-only shortcuts when given both the\nquestion and choices. Thus, despite the susceptibility~of MCQA to high\nchoices-only accuracy, we argue that LLMs are not obtaining high ranks on MCQA\nleaderboards just due to their ability to exploit choices-only shortcuts.\n","authors":["Nishant Balepur","Rachel Rudinger"],"pdf_url":"https://arxiv.org/pdf/2407.01992v1.pdf","comment":"KnowledgeLM Workshop @ ACL 2024"},{"id":"http://arxiv.org/abs/2311.03780v2","updated":"2024-07-02T06:54:56Z","published":"2023-11-07T07:53:06Z","title":"DynaSemble: Dynamic Ensembling of Textual and Structure-Based Models for\n Knowledge Graph Completion","summary":" We consider two popular approaches to Knowledge Graph Completion (KGC):\ntextual models that rely on textual entity descriptions, and structure-based\nmodels that exploit the connectivity structure of the Knowledge Graph (KG).\nPreliminary experiments show that these approaches have complementary\nstrengths: structure-based models perform exceptionally well when the gold\nanswer is easily reachable from the query head in the KG, while textual models\nexploit descriptions to give good performance even when the gold answer is not\neasily reachable. In response, we propose DynaSemble, a novel method for\nlearning query-dependent ensemble weights to combine these approaches by using\nthe distributions of scores assigned by the models in the ensemble to all\ncandidate entities. 
DynaSemble achieves state-of-the-art results on three\nstandard KGC datasets, with up to 6.8 pt MRR and 8.3 pt Hits@1 gains over the\nbest baseline model for the WN18RR dataset.\n","authors":["Ananjan Nandi","Navdeep Kaur","Parag Singla"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2311.03780v2.pdf","comment":"12 pages, 2 figures, 15 tables Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2402.16029v4","updated":"2024-07-02T06:40:30Z","published":"2024-02-25T08:41:32Z","title":"GraphWiz: An Instruction-Following Language Model for Graph Problems","summary":" Large language models (LLMs) have achieved impressive success across several\nfields, but their proficiency in understanding and resolving complex graph\nproblems is less explored. To bridge this gap, we introduce GraphInstruct, a\nnovel and comprehensive instruction-tuning dataset designed to equip language\nmodels with the ability to tackle a broad spectrum of graph problems using\nexplicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an\nopen-source language model capable of resolving various graph problem types\nwhile generating clear reasoning processes. To enhance the model's capability\nand reliability, we incorporate the Direct Preference Optimization (DPO)\nframework into the graph problem-solving context. The enhanced model,\nGraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with\ndifferent complexity levels, surpassing GPT-4 which has an average accuracy of\n43.8%. Moreover, our research delves into the delicate balance between training\ndata volume and model performance, highlighting the potential for overfitting\nwith increased data. We also explore the transferability of the model's\nreasoning ability across different graph tasks, indicating the model's\nadaptability and practical application potential. Our investigation offers a\nnew blueprint and valuable insights for developing LLMs specialized in graph\nreasoning and problem-solving.\n","authors":["Nuo Chen","Yuhan Li","Jianheng Tang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2402.16029v4.pdf","comment":"27pages, 15 tables"},{"id":"http://arxiv.org/abs/2407.01976v1","updated":"2024-07-02T06:29:05Z","published":"2024-07-02T06:29:05Z","title":"A Bounding Box is Worth One Token: Interleaving Layout and Text in a\n Large Language Model for Document Understanding","summary":" Recently, many studies have demonstrated that exclusively incorporating\nOCR-derived text and spatial layouts with large language models (LLMs) can be\nhighly effective for document understanding tasks. However, existing methods\nthat integrate spatial layouts with text have limitations, such as producing\noverly long text sequences or failing to fully leverage the autoregressive\ntraits of LLMs. In this work, we introduce Interleaving Layout and Text in a\nLarge Language Model (LayTextLLM)} for document understanding. In particular,\nLayTextLLM projects each bounding box to a single embedding and interleaves it\nwith text, efficiently avoiding long sequence issues while leveraging\nautoregressive traits of LLMs. LayTextLLM not only streamlines the interaction\nof layout and textual data but also shows enhanced performance in Key\nInformation Extraction (KIE) and Visual Question Answering (VQA). 
Comprehensive\nbenchmark evaluations reveal significant improvements, with a 27.0% increase on\nKIE tasks and 24.1% on VQA tasks compared to previous state-of-the-art document\nunderstanding MLLMs, as well as a 15.5% improvement over other SOTA OCR-based\nLLMs on KIE tasks.\n","authors":["Jinghui Lu","Haiyang Yu","Yanjie Wang","Yongjie Ye","Jingqun Tang","Ziwei Yang","Binghong Wu","Qi Liu","Hao Feng","Han Wang","Hao Liu","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2407.01976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01411v2","updated":"2024-07-02T06:21:41Z","published":"2024-07-01T16:00:53Z","title":"HyperLoader: Integrating Hypernetwork-Based LoRA and Adapter Layers into\n Multi-Task Transformers for Sequence Labelling","summary":" We present HyperLoader, a simple approach that combines different\nparameter-efficient fine-tuning methods in a multi-task setting. To achieve\nthis goal, our model uses a hypernetwork to generate the weights of these\nmodules based on the task, the transformer layer, and its position within this\nlayer. Our method combines the benefits of multi-task learning by capturing the\nstructure of all tasks while reducing the task interference problem by\nencapsulating the task-specific knowledge in the generated weights and the\nbenefits of combining different parameter-efficient methods to outperform\nfull fine-tuning. We provide empirical evidence that HyperLoader outperforms\nprevious approaches in most datasets and obtains the best average performance\nacross tasks in high-resource and low-resource scenarios.\n","authors":["Jesus-German Ortiz-Barajas","Helena Gomez-Adorno","Thamar Solorio"],"pdf_url":"https://arxiv.org/pdf/2407.01411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01965v1","updated":"2024-07-02T05:50:16Z","published":"2024-07-02T05:50:16Z","title":"AdaCQR: Enhancing Query Reformulation for Conversational Search via\n Sparse and Dense Retrieval Alignment","summary":" Conversational Query Reformulation (CQR) has significantly advanced in\naddressing the challenges of conversational search, particularly those stemming\nfrom the latent user intent and the need for historical context. Recent works\naimed to boost the performance of CQR through alignment. However, they are\ndesigned for one specific retrieval system, which potentially results in poor\ngeneralization. To overcome this limitation, we present a novel framework,\nAdaCQR. By aligning reformulation models with both term-based and\nsemantic-based retrieval systems, AdaCQR enhances the generalizability of\ninformation-seeking queries across diverse retrieval environments through a\ndual-phase training strategy. We also developed two effective approaches for\nacquiring superior labels and diverse input candidates, boosting the efficiency\nand robustness of the framework. Experimental evaluations on the TopiOCQA and\nQReCC datasets demonstrate that AdaCQR significantly outperforms existing\nmethods, offering both quantitative and qualitative improvements in\nconversational query reformulation.\n","authors":["Yilong Lai","Jialong Wu","Congzhi Zhang","Haowen Sun","Deyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.01965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01964v1","updated":"2024-07-02T05:43:15Z","published":"2024-07-02T05:43:15Z","title":"Enabling Discriminative Reasoning in Large Language Models for Legal\n Judgment Prediction","summary":" Legal judgment prediction is essential for enhancing judicial efficiency. 
In\nthis work, we identify that existing large language models (LLMs) underperform\nin this domain due to challenges in understanding case complexities and\ndistinguishing between similar charges. To adapt LLMs for effective legal\njudgment prediction, we introduce the Ask-Discriminate-Predict (ADAPT)\nreasoning framework inspired by human judicial reasoning. ADAPT involves\ndecomposing case facts, discriminating among potential charges, and predicting\nthe final judgment. We further enhance LLMs through fine-tuning with multi-task\nsynthetic trajectories to improve legal judgment prediction accuracy and\nefficiency under our ADAPT framework. Extensive experiments conducted on two\nwidely-used datasets demonstrate the superior performance of our framework in\nlegal judgment prediction, particularly when dealing with complex and confusing\ncharges.\n","authors":["Chenlong Deng","Kelong Mao","Yuyao Zhang","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2407.01964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01955v1","updated":"2024-07-02T05:14:15Z","published":"2024-07-02T05:14:15Z","title":"S2D: Sorted Speculative Decoding For More Efficient Deployment of Nested\n Large Language Models","summary":" Deployment of autoregressive large language models (LLMs) is costly, and as\nthese models increase in size, the associated costs will become even more\nconsiderable. Consequently, different methods have been proposed to accelerate\nthe token generation process and reduce costs. Speculative decoding (SD) is\namong the most promising approaches to speed up the LLM decoding process by\nverifying multiple tokens in parallel and using an auxiliary smaller draft\nmodel to generate the possible tokens. In SD, usually, one draft model is used\nto serve a specific target model; however, in practice, LLMs are diverse, and\nwe might need to deal with many target models or more than one target model\nsimultaneously. In this scenario, it is not clear which draft model should be\nused for which target model, and searching among different draft models or\ntraining customized draft models can further increase deployment costs. In this\npaper, we first introduce a novel multi-target scenario for the deployment of\ndraft models for faster inference. Then, we present a novel, more efficient\nsorted speculative decoding mechanism that outperforms regular baselines in\nmulti-target settings. We evaluated our method on Spec-Bench in different\nsettings, including base models such as Vicuna 7B, 13B, and LLama Chat 70B. Our\nresults suggest that our draft models perform better than baselines for\nmultiple target models at the same time.\n","authors":["Parsa Kavehzadeh","Mohammadreza Pourreza","Mojtaba Valipour","Tinashu Zhu","Haoli Bai","Ali Ghodsi","Boxing Chen","Mehdi Rezagholizadeh"],"pdf_url":"https://arxiv.org/pdf/2407.01955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17272v3","updated":"2024-07-02T05:02:02Z","published":"2023-09-29T14:23:26Z","title":"Enhancing Large Language Models in Coding Through Multi-Perspective\n Self-Consistency","summary":" Large language models (LLMs) have exhibited remarkable ability in code\ngeneration. However, generating the correct solution in a single attempt still\nremains a challenge. Prior works utilize verification properties in software\nengineering to verify and re-rank solutions in a majority voting manner. But\nthe assumption behind them that generated verification properties have better\nqualities than solutions may not always hold. 
In this paper, we treat them\nequally as different perspectives of LLMs' reasoning processes. We propose the\nMulti-Perspective Self-Consistency (MPSC) framework incorporating both inter-\nand intra-consistency across outputs from multiple perspectives. Specifically,\nwe prompt LLMs to generate diverse outputs from three perspectives, Solution,\nSpecification and Test case, constructing a 3-partite graph. With two measure\nfunctions of consistency, we embed both inter- and intra-consistency\ninformation into the graph. The optimal choice of solutions is then determined\nbased on analysis in the graph. MPSC significantly boosts performance of\nfoundation models (ChatGPT in this paper) on various benchmarks, including\nHumanEval (+15.91%), MBPP (+6.43%) and CodeContests (+9.37%), even surpassing\nGPT-4.\n","authors":["Baizhou Huang","Shuai Lu","Weizhu Chen","Xiaojun Wan","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2309.17272v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.00092v2","updated":"2024-07-02T04:50:36Z","published":"2024-02-29T19:40:25Z","title":"PROC2PDDL: Open-Domain Planning Representations from Texts","summary":" Planning in a text-based environment continues to be a major challenge for AI\nsystems. Recent approaches have used language models to predict a planning\ndomain definition (e.g., PDDL) but have only been evaluated in closed-domain\nsimulated environments. To address this, we present Proc2PDDL, the first\ndataset containing open-domain procedural texts paired with expert-annotated\nPDDL representations. Using this dataset, we evaluate state-of-the-art models\non defining the preconditions and effects of actions. We show that Proc2PDDL is\nhighly challenging, with GPT-3.5's success rate close to 0% and GPT-4's around\n35%. Our analysis shows both syntactic and semantic errors, indicating LMs'\ndeficiency in both generating domain-specific programs and reasoning about\nevents. We hope this analysis and dataset helps future progress towards\nintegrating the best of LMs and formal planning.\n","authors":["Tianyi Zhang","Li Zhang","Zhaoyi Hou","Ziyu Wang","Yuling Gu","Peter Clark","Chris Callison-Burch","Niket Tandon"],"pdf_url":"https://arxiv.org/pdf/2403.00092v2.pdf","comment":"In NLRSE 2024, the 2nd Natural Language Reasoning and Structured\n Explanations Workshop"},{"id":"http://arxiv.org/abs/2407.00782v2","updated":"2024-07-02T04:46:35Z","published":"2024-06-30T17:59:07Z","title":"Step-Controlled DPO: Leveraging Stepwise Error for Enhanced Mathematical\n Reasoning","summary":" Direct Preference Optimization (DPO) has proven effective at improving the\nperformance of large language models (LLMs) on downstream tasks such as\nreasoning and alignment. In this work, we propose Step-Controlled DPO (SCDPO),\na method for automatically providing stepwise error supervision by creating\nnegative samples of mathematical reasoning rationales that start making errors\nat a specified step. By applying these samples in DPO training, SCDPO can\nbetter align the model to understand reasoning errors and output accurate\nreasoning steps. We apply SCDPO to both code-integrated and chain-of-thought\nsolutions, empirically showing that it consistently improves the performance\ncompared to naive DPO on three different SFT models, including one existing SFT\nmodel and two models we finetuned. Qualitative analysis of the credit\nassignment of SCDPO and DPO demonstrates the effectiveness of SCDPO at\nidentifying errors in mathematical solutions. 
We then apply SCDPO to an\nInternLM2-20B model, resulting in a 20B model that achieves high scores of\n88.5% on GSM8K and 58.1% on MATH, rivaling all other open-source LLMs, showing\nthe great potential of our method.\n","authors":["Zimu Lu","Aojun Zhou","Ke Wang","Houxing Ren","Weikang Shi","Junting Pan","Mingjie Zhan","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2407.00782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01948v1","updated":"2024-07-02T04:39:19Z","published":"2024-07-02T04:39:19Z","title":"Extracting and Encoding: Leveraging Large Language Models and Medical\n Knowledge to Enhance Radiological Text Representation","summary":" Advancing representation learning in specialized fields like medicine remains\nchallenging due to the scarcity of expert annotations for text and images. To\ntackle this issue, we present a novel two-stage framework designed to extract\nhigh-quality factual statements from free-text radiology reports in order to\nimprove the representations of text encoders and, consequently, their\nperformance on various downstream tasks. In the first stage, we propose a\n\\textit{Fact Extractor} that leverages large language models (LLMs) to identify\nfactual statements from well-curated domain-specific datasets. In the second\nstage, we introduce a \\textit{Fact Encoder} (CXRFE) based on a BERT model\nfine-tuned with objective functions designed to improve its representations\nusing the extracted factual data. Our framework also includes a new\nembedding-based metric (CXRFEScore) for evaluating chest X-ray text generation\nsystems, leveraging both stages of our approach. Extensive evaluations show\nthat our fact extractor and encoder outperform current state-of-the-art methods\nin tasks such as sentence ranking, natural language inference, and label\nextraction from radiology reports. Additionally, our metric proves to be more\nrobust and effective than existing metrics commonly used in the radiology\nreport generation literature. The code of this project is available at\n\\url{https://github.com/PabloMessina/CXR-Fact-Encoder}.\n","authors":["Pablo Messina","René Vidal","Denis Parra","Álvaro Soto","Vladimir Araujo"],"pdf_url":"https://arxiv.org/pdf/2407.01948v1.pdf","comment":"Accepted to ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2407.01942v1","updated":"2024-07-02T04:23:54Z","published":"2024-07-02T04:23:54Z","title":"Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and\n Aleatoric Awareness","summary":" The ability to acknowledge the inevitable uncertainty in their knowledge and\nreasoning is a prerequisite for AI systems to be truly truthful and reliable.\nIn this paper, we present a taxonomy of uncertainty specific to vision-language\nAI systems, distinguishing between epistemic uncertainty (arising from a lack\nof information) and aleatoric uncertainty (due to inherent unpredictability),\nand further explore finer categories within. Based on this taxonomy, we\nsynthesize a benchmark dataset, CertainlyUncertain, featuring 178K visual\nquestion answering (VQA) samples as contrastive pairs. This is achieved by 1)\ninpainting images to make previously answerable questions into unanswerable\nones; and 2) using image captions to prompt large language models for both\nanswerable and unanswerable questions. 
Additionally, we introduce a new metric,\nconfidence-weighted accuracy, which is well correlated with both accuracy and\ncalibration error, to address the shortcomings of existing metrics.\n","authors":["Khyathi Raghavi Chandu","Linjie Li","Anas Awadalla","Ximing Lu","Jae Sung Park","Jack Hessel","Lijuan Wang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2407.01942v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.01937v1","updated":"2024-07-02T04:11:52Z","published":"2024-07-02T04:11:52Z","title":"Efficient-Empathy: Towards Efficient and Effective Selection of Empathy\n Data","summary":" In recent years, with the rapid advancements in large language models (LLMs),\nachieving excellent empathetic response capability has become a crucial\nprerequisite. Consequently, managing and understanding large-scale video\ndatasets has gained increasing importance. However, empathetic data are\ntypically trained without any quality selection, leading to inefficient data\nusage and wasted computational resources. Additionally, using raw data can\nresult in low performance in empathetic dialogues. In this work, we present\nEfficient-Empathy, a sensibility and rationality score-based data selection\nalgorithm that automatically selects sensibility and rationality data while\ndiscarding low-quality data. With only the sensibility data (59% of the full\ndataset), our trained sensibility model efficiently achieves state-of-the-art\n(SoTA) performance. Furthermore, with multiple data selection hyperparameters,\nthe sensibility model demonstrates SoTA performance, showcasing the robustness\nof our method. By integrating sensibility and rationality data with a MoE\nstructure, we achieve even higher performance, demonstrating the effectiveness\nof our Efficient-Empathy algorithm.\n","authors":["Linzhuang Sun","Hao Liang","Jingxuan Wei","Linkun Sun","Bihui Yu","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15997v2","updated":"2024-07-02T04:07:14Z","published":"2023-12-26T11:01:36Z","title":"Aligning Large Language Models with Human Preferences through\n Representation Engineering","summary":" Aligning large language models (LLMs) with human preferences is crucial for\nenhancing their utility in terms of helpfulness, truthfulness, safety,\nharmlessness, and interestingness. Existing methods for achieving this\nalignment often involve employing reinforcement learning from human feedback\n(RLHF) to fine-tune LLMs based on human labels assessing the relative quality\nof model responses. Nevertheless, RLHF is susceptible to instability during\nfine-tuning and presents challenges in implementation. Drawing inspiration from\nthe emerging field of representation engineering (RepE), this study aims to\nidentify relevant representations for high-level human preferences embedded in\npatterns of activity within an LLM, and achieve precise control of model\nbehavior by transforming its representations. This novel approach, denoted as\nRepresentation Alignment from Human Feedback (RAHF), proves to be effective,\ncomputationally efficient, and easy to implement. Extensive experiments\ndemonstrate the efficacy of RAHF in not only capturing but also manipulating\nrepresentations to align with a broad spectrum of human preferences or values,\nrather than being confined to a singular concept or function (e.g. honesty or\nbias). 
RAHF's versatility in accommodating diverse human preferences shows its\npotential for advancing LLM performance.\n","authors":["Wenhao Liu","Xiaohua Wang","Muling Wu","Tianlong Li","Changze Lv","Zixuan Ling","Jianhao Zhu","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2312.15997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19255v2","updated":"2024-07-02T03:46:03Z","published":"2024-02-29T15:26:14Z","title":"GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of\n LLMs as Mathematical Problem Solvers","summary":" Large language models (LLMs) have achieved impressive performance across\nvarious mathematical reasoning benchmarks. However, there are increasing\ndebates regarding whether these models truly understand and apply mathematical\nknowledge or merely rely on shortcuts for mathematical reasoning. One essential\nand frequently occurring evidence is that when the math questions are slightly\nchanged, LLMs can behave incorrectly. This motivates us to evaluate the\nrobustness of LLMs' math reasoning capability by testing a wide range of\nquestion variations. We introduce the adversarial grade school math (GSM-Plus)\ndataset, an extension of GSM8K augmented with various mathematical\nperturbations. Our experiments on 25 LLMs and 4 prompting techniques show that\nwhile LLMs exhibit different levels of math reasoning abilities, their\nperformances are far from robust. In particular, even for problems that have\nbeen solved in GSM8K, LLMs can make mistakes when new statements are added or\nthe question targets are altered. We also explore whether more robust\nperformance can be achieved by composing existing prompting methods, in which\nwe try an iterative method that generates and verifies each intermediate\nthought based on its reasoning goal and calculation result.\n","authors":["Qintong Li","Leyang Cui","Xueliang Zhao","Lingpeng Kong","Wei Bi"],"pdf_url":"https://arxiv.org/pdf/2402.19255v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.01929v1","updated":"2024-07-02T03:45:55Z","published":"2024-07-02T03:45:55Z","title":"What We Talk About When We Talk About LMs: Implicit Paradigm Shifts and\n the Ship of Language Models","summary":" The term Language Models (LMs), as a time-specific collection of models of\ninterest, is constantly reinvented, with its referents updated much like the\n$\\textit{Ship of Theseus}$ replaces its parts but remains the same ship in\nessence. In this paper, we investigate this $\\textit{Ship of Language Models}$\nproblem, wherein scientific evolution takes the form of continuous, implicit\nretrofits of key existing terms. We seek to initiate a novel perspective of\nscientific progress, in addition to the more well-studied emergence of new\nterms. To this end, we construct the data infrastructure based on recent NLP\npublications. Then, we perform a series of text-based analyses toward a\ndetailed, quantitative understanding of the use of Language Models as a term of\nart. Our work highlights how systems and theories influence each other in\nscientific discourse, and we call for attention to the transformation of this\nShip that we all are contributing to.\n","authors":["Shengqi Zhu","Jeffrey M. 
Rzeszotarski"],"pdf_url":"https://arxiv.org/pdf/2407.01929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12636v3","updated":"2024-07-02T03:42:36Z","published":"2024-02-20T01:28:34Z","title":"StyleDubber: Towards Multi-Scale Style Learning for Movie Dubbing","summary":" Given a script, the challenge in Movie Dubbing (Visual Voice Cloning, V2C) is\nto generate speech that aligns well with the video in both time and emotion,\nbased on the tone of a reference audio track. Existing state-of-the-art V2C\nmodels break the phonemes in the script according to the divisions between\nvideo frames, which solves the temporal alignment problem but leads to\nincomplete phoneme pronunciation and poor identity stability. To address this\nproblem, we propose StyleDubber, which switches dubbing learning from the frame\nlevel to phoneme level. It contains three main components: (1) A multimodal\nstyle adaptor operating at the phoneme level to learn pronunciation style from\nthe reference audio, and generate intermediate representations informed by the\nfacial emotion presented in the video; (2) An utterance-level style learning\nmodule, which guides both the mel-spectrogram decoding and the refining\nprocesses from the intermediate embeddings to improve the overall style\nexpression; And (3) a phoneme-guided lip aligner to maintain lip sync.\nExtensive experiments on two of the primary benchmarks, V2C and Grid,\ndemonstrate the favorable performance of the proposed method as compared to the\ncurrent stateof-the-art. The code will be made available at\nhttps://github.com/GalaxyCong/StyleDubber.\n","authors":["Gaoxiang Cong","Yuankai Qi","Liang Li","Amin Beheshti","Zhedong Zhang","Anton van den Hengel","Ming-Hsuan Yang","Chenggang Yan","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2402.12636v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13107v2","updated":"2024-07-02T03:35:53Z","published":"2024-03-19T19:15:13Z","title":"Towards Unsupervised Question Answering System with Multi-level\n Summarization for Legal Text","summary":" This paper summarizes Team SCaLAR's work on SemEval-2024 Task 5: Legal\nArgument Reasoning in Civil Procedure. To address this Binary Classification\ntask, which was daunting due to the complexity of the Legal Texts involved, we\npropose a simple yet novel similarity and distance-based unsupervised approach\nto generate labels. Further, we explore the Multi-level fusion of Legal-Bert\nembeddings using ensemble features, including CNN, GRU, and LSTM. To address\nthe lengthy nature of Legal explanation in the dataset, we introduce T5-based\nsegment-wise summarization, which successfully retained crucial information,\nenhancing the model's performance. Our unsupervised system witnessed a 20-point\nincrease in macro F1-score on the development set and a 10-point increase on\nthe test set, which is promising given its uncomplicated architecture.\n","authors":["M Manvith Prabhu","Haricharana Srinivasa","Anand Kumar M"],"pdf_url":"https://arxiv.org/pdf/2403.13107v2.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.01920v1","updated":"2024-07-02T03:34:16Z","published":"2024-07-02T03:34:16Z","title":"To Forget or Not? 
Towards Practical Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) trained on extensive corpora inevitably retain\nsensitive data, such as personal privacy information and copyrighted material.\nRecent advancements in knowledge unlearning involve updating LLM parameters to\nerase specific knowledge. However, current unlearning paradigms are mired in\nvague forgetting boundaries, often erasing knowledge indiscriminately. In this\nwork, we introduce KnowUnDo, a benchmark containing copyrighted content and\nuser privacy domains to evaluate if the unlearning process inadvertently erases\nessential knowledge. Our findings indicate that existing unlearning methods\noften suffer from excessive unlearning. To address this, we propose a simple\nyet effective method, MemFlex, which utilizes gradient information to precisely\ntarget and unlearn sensitive parameters. Experimental results show that MemFlex\nis superior to existing methods in both precise knowledge unlearning and\ngeneral knowledge retaining of LLMs. Code and dataset will be released at\nhttps://github.com/zjunlp/KnowUnDo.\n","authors":["Bozhong Tian","Xiaozhuan Liang","Siyuan Cheng","Qingbin Liu","Mengru Wang","Dianbo Sui","Xi Chen","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01920v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.11469v2","updated":"2024-07-02T03:29:11Z","published":"2024-02-18T05:58:25Z","title":"A Curious Case of Searching for the Correlation between Training Data\n and Adversarial Robustness of Transformer Textual Models","summary":" Existing works have shown that fine-tuned textual transformer models achieve\nstate-of-the-art prediction performances but are also vulnerable to adversarial\ntext perturbations. Traditional adversarial evaluation is often done\n\\textit{only after} fine-tuning the models and ignoring the training data. In\nthis paper, we want to prove that there is also a strong correlation between\ntraining data and model robustness. To this end, we extract 13 different\nfeatures representing a wide range of input fine-tuning corpora properties and\nuse them to predict the adversarial robustness of the fine-tuned models.\nFocusing mostly on encoder-only transformer models BERT and RoBERTa with\nadditional results for BART, ELECTRA, and GPT2, we provide diverse evidence to\nsupport our argument. First, empirical analyses show that (a) extracted\nfeatures can be used with a lightweight classifier such as Random Forest to\npredict the attack success rate effectively, and (b) features with the most\ninfluence on the model robustness have a clear correlation with the robustness.\nSecond, our framework can be used as a fast and effective additional tool for\nrobustness evaluation since it (a) saves 30x-193x runtime compared to the\ntraditional technique, (b) is transferable across models, (c) can be used under\nadversarial training, and (d) robust to statistical randomness. Our code is\npublicly available at \\url{https://github.com/CaptainCuong/RobustText_ACL2024}.\n","authors":["Cuong Dang","Dung D. 
Le","Thai Le"],"pdf_url":"https://arxiv.org/pdf/2402.11469v2.pdf","comment":"Accepted to ACL Findings 2024"},{"id":"http://arxiv.org/abs/2401.11458v3","updated":"2024-07-02T03:24:29Z","published":"2024-01-21T10:46:23Z","title":"Linear Alignment: A Closed-form Solution for Aligning Human Preferences\n without Tuning and Feedback","summary":" The success of AI assistants based on Language Models (LLMs) hinges on\nReinforcement Learning from Human Feedback (RLHF) to comprehend and align with\nuser intentions. However, traditional alignment algorithms, such as PPO, are\nhampered by complex annotation and training requirements. This reliance limits\nthe applicability of RLHF and hinders the development of professional\nassistants tailored to diverse human preferences. In this work, we introduce\n\\textit{Linear Alignment}, a novel algorithm that aligns language models with\nhuman preferences in one single inference step, eliminating the reliance on\ndata annotation and model training. Linear alignment incorporates a new\nparameterization for policy optimization under divergence constraints, which\nenables the extraction of optimal policy in a closed-form manner and\nfacilitates the direct estimation of the aligned response. Extensive\nexperiments on both general and personalized preference datasets demonstrate\nthat linear alignment significantly enhances the performance and efficiency of\nLLM alignment across diverse scenarios. Our code and dataset is published on\n\\url{https://github.com/Wizardcoast/Linear_Alignment.git}.\n","authors":["Songyang Gao","Qiming Ge","Wei Shen","Shihan Dou","Junjie Ye","Xiao Wang","Rui Zheng","Yicheng Zou","Zhi Chen","Hang Yan","Qi Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11458v3.pdf","comment":"Accepted by ICML2024, I'm still preparing a better vision"},{"id":"http://arxiv.org/abs/2407.01911v1","updated":"2024-07-02T03:22:41Z","published":"2024-07-02T03:22:41Z","title":"Investigating the Effects of Large-Scale Pseudo-Stereo Data and\n Different Speech Foundation Model on Dialogue Generative Spoken Language\n Model","summary":" Recent efforts in Spoken Dialogue Modeling aim to synthesize spoken dialogue\nwithout the need for direct transcription, thereby preserving the wealth of\nnon-textual information inherent in speech. However, this approach faces a\nchallenge when speakers talk simultaneously, requiring stereo dialogue data\nwith speakers recorded on separate channels, a notably scarce resource. To\naddress this, we have developed an innovative pipeline capable of transforming\nsingle-channel dialogue data into pseudo-stereo data. This expanded our\ntraining dataset from a mere 2,000 to an impressive 17,600 hours, significantly\nenriching the diversity and quality of the training examples available. The\ninclusion of this pseudo-stereo data has proven to be effective in improving\nthe performance of spoken dialogue language models. 
Additionally, we explored\nthe use of discrete units of different speech foundation models for spoken\ndialogue generation.\n","authors":["Yu-Kuan Fu","Cheng-Kuang Lee","Hsiu-Hsuan Wang","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2407.01911v1.pdf","comment":"submitted to interspeech 2024"},{"id":"http://arxiv.org/abs/2407.01909v1","updated":"2024-07-02T03:16:47Z","published":"2024-07-02T03:16:47Z","title":"Pinyin Regularization in Error Correction for Chinese Speech Recognition\n with Large Language Models","summary":" Recent studies have demonstrated the efficacy of large language models (LLMs)\nin error correction for automatic speech recognition (ASR). However, much of\nthe research focuses on the English language. This paper redirects the\nattention to Chinese. Firstly, we construct a specialized benchmark dataset\naimed at error correction for Chinese ASR with 724K hypotheses-transcription\npairs, named the Chinese Hypotheses Paradise dataset (ChineseHP), which\ncontains a wide range of scenarios and presents significant challenges.\nSubsequently, we conduct a preliminary evaluation using the dataset for both\ndirect-prompting and fine-tuning pre-trained LLMs. Furthermore, we propose a\nstraightforward method of Pinyin regularization for prompts, which involves the\ntranscription of Pinyin directly from text hypotheses. The experimental results\nreveal that Pinyin regularization consistently enhances the error-correcting\nability of LLMs when compared with those without regularization. The dataset is\navailable on the website.\n","authors":["Zhiyuan Tang","Dong Wang","Shen Huang","Shidong Shang"],"pdf_url":"https://arxiv.org/pdf/2407.01909v1.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.01906v1","updated":"2024-07-02T03:11:13Z","published":"2024-07-02T03:11:13Z","title":"Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for\n Sparse Architectural Large Language Models","summary":" Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large\nLanguage Models (LLMs) with constrained resources. Although there have been\nvarious PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture\nLLMs is still underexplored. In this work, we study the PEFT method for LLMs\nwith the Mixture-of-Experts (MoE) architecture and the contents of this work\nare mainly threefold: (1) We investigate the dispersion degree of the activated\nexperts in customized tasks, and found that the routing distribution for a\nspecific task tends to be highly concentrated, while the distribution of\nactivated experts varies significantly across different tasks. (2) We propose\nExpert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant\nto downstream tasks while freezing the other experts and modules; experimental\nresults demonstrate that our method not only improves the tuning efficiency,\nbut also matches or even surpasses the performance of full-parameter\nfine-tuning. (3) We further analyze the impact of the MoE architecture on\nexpert-specialized fine-tuning. We find that MoE models with finer-grained\nexperts are more advantageous in selecting the combination of experts that are\nmost relevant to downstream tasks, thereby enhancing both the training\nefficiency and effectiveness.\n","authors":["Zihan Wang","Deli Chen","Damai Dai","Runxin Xu","Zhuoshu Li","Y. 
Wu"],"pdf_url":"https://arxiv.org/pdf/2407.01906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01902v1","updated":"2024-07-02T02:58:29Z","published":"2024-07-02T02:58:29Z","title":"SoP: Unlock the Power of Social Facilitation for Automatic Jailbreak\n Attack","summary":" The widespread applications of large language models (LLMs) have brought\nabout concerns regarding their potential misuse. Although aligned with human\npreference data before release, LLMs remain vulnerable to various malicious\nattacks. In this paper, we adopt a red-teaming strategy to enhance LLM safety\nand introduce SoP, a simple yet effective framework to design jailbreak prompts\nautomatically. Inspired by the social facilitation concept, SoP generates and\noptimizes multiple jailbreak characters to bypass the guardrails of the target\nLLM. Different from previous work which relies on proprietary LLMs or seed\njailbreak templates crafted by human expertise, SoP can generate and optimize\nthe jailbreak prompt in a cold-start scenario using open-sourced LLMs without\nany seed jailbreak templates. Experimental results show that SoP achieves\nattack success rates of 88% and 60% in bypassing the safety alignment of\nGPT-3.5-1106 and GPT-4, respectively. Furthermore, we extensively evaluate the\ntransferability of the generated templates across different LLMs and held-out\nmalicious requests, while also exploring defense strategies against the\njailbreak attack designed by SoP. Code is available at\nhttps://github.com/Yang-Yan-Yang-Yan/SoP.\n","authors":["Yan Yang","Zeguan Xiao","Xin Lu","Hongru Wang","Hailiang Huang","Guanhua Chen","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12624v2","updated":"2024-07-02T02:57:01Z","published":"2024-06-18T13:49:54Z","title":"Judging the Judges: Evaluating Alignment and Vulnerabilities in\n LLMs-as-Judges","summary":" Offering a promising solution to the scalability challenges associated with\nhuman evaluation, the LLM-as-a-judge paradigm is rapidly gaining traction as an\napproach to evaluating large language models (LLMs). However, there are still\nmany open questions about the strengths and weaknesses of this paradigm, and\nwhat potential biases it may hold. In this paper, we present a comprehensive\nstudy of the performance of various LLMs acting as judges. We leverage TriviaQA\nas a benchmark for assessing objective knowledge reasoning of LLMs and evaluate\nthem alongside human annotations which we found to have a high inter-annotator\nagreement. Our study includes 9 judge models and 9 exam taker models -- both\nbase and instruction-tuned. We assess the judge model's alignment across\ndifferent model sizes, families, and judge prompts. Among other results, our\nresearch rediscovers the importance of using Cohen's kappa as a metric of\nalignment as opposed to simple percent agreement, showing that judges with high\npercent agreement can still assign vastly different scores. We find that both\nLlama-3 70B and GPT-4 Turbo have an excellent alignment with humans, but in\nterms of ranking exam taker models, they are outperformed by both JudgeLM-7B\nand the lexical judge Contains, which have up to 34 points lower human\nalignment. 
Through error analysis and various other studies, including the\neffects of instruction length and leniency bias, we hope to provide valuable\nlessons for using LLMs as judges in the future.\n","authors":["Aman Singh Thakur","Kartik Choudhary","Venkat Srinik Ramayapally","Sankaran Vaidyanathan","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2406.12624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01899v1","updated":"2024-07-02T02:50:15Z","published":"2024-07-02T02:50:15Z","title":"Scope-enhanced Compositional Semantic Parsing for DRT","summary":" Discourse Representation Theory (DRT) distinguishes itself from other\nsemantic representation frameworks by its ability to model complex semantic and\ndiscourse phenomena through structural nesting and variable binding. While\nseq2seq models hold the state of the art on DRT parsing, their accuracy\ndegrades with the complexity of the sentence, and they sometimes struggle to\nproduce well-formed DRT representations. We introduce the AMS parser, a\ncompositional, neurosymbolic semantic parser for DRT. It rests on a novel\nmechanism for predicting quantifier scope. We show that the AMS parser reliably\nproduces well-formed outputs and performs well on DRT parsing, especially on\ncomplex sentences.\n","authors":["Xiulin Yang","Jonas Groschwitz","Alexander Koller","Johan Bos"],"pdf_url":"https://arxiv.org/pdf/2407.01899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01897v1","updated":"2024-07-02T02:42:29Z","published":"2024-07-02T02:42:29Z","title":"Proposal Report for the 2nd SciCAP Competition 2024","summary":" In this paper, we propose a method for document summarization using auxiliary\ninformation. This approach effectively summarizes descriptions related to\nspecific images, tables, and appendices within lengthy texts. Our experiments\ndemonstrate that leveraging high-quality OCR data and initially extracted\ninformation from the original text enables efficient summarization of the\ncontent related to described objects. Based on these findings, we enhanced\npopular text generation models by incorporating additional auxiliary\nbranches to improve summarization performance. Our method achieved top scores\nof 4.33 and 4.66 in the long caption and short caption tracks, respectively, of\nthe 2024 SciCAP competition, ranking highest in both categories.\n","authors":["Pengpeng Li","Tingmin Li","Jingyuan Wang","Boyuan Wang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01896v1","updated":"2024-07-02T02:39:33Z","published":"2024-07-02T02:39:33Z","title":"LogEval: A Comprehensive Benchmark Suite for Large Language Models In\n Log Analysis","summary":" Log analysis is crucial for ensuring the orderly and stable operation of\ninformation systems, particularly in the field of Artificial Intelligence for\nIT Operations (AIOps). Large Language Models (LLMs) have demonstrated\nsignificant potential in natural language processing tasks. In the AIOps\ndomain, they excel in tasks such as anomaly detection, root cause analysis of\nfaults, operations and maintenance script generation, and alert information\nsummarization. However, the performance of current LLMs in log analysis tasks\nremains inadequately validated. To address this gap, we introduce LogEval, a\ncomprehensive benchmark suite designed to evaluate the capabilities of LLMs in\nvarious log analysis tasks for the first time. 
This benchmark covers tasks such\nas log parsing, log anomaly detection, log fault diagnosis, and log\nsummarization. LogEval evaluates each task using 4,000 publicly available log\ndata entries and employs 15 different prompts for each task to ensure a\nthorough and fair assessment. By rigorously evaluating leading LLMs, we\ndemonstrate the impact of various LLM technologies on log analysis performance,\nfocusing on aspects such as self-consistency and few-shot contextual learning.\nWe also discuss findings related to model quantification, Chinese-English\nquestion-answering evaluation, and prompt engineering. These findings provide\ninsights into the strengths and weaknesses of LLMs in multilingual environments\nand the effectiveness of different prompt strategies. Various evaluation\nmethods are employed for different tasks to accurately measure the performance\nof LLMs in log analysis, ensuring a comprehensive assessment. The insights\ngained from LogEval's evaluation reveal the strengths and limitations of LLMs in\nlog analysis tasks, providing valuable guidance for researchers and\npractitioners.\n","authors":["Tianyu Cui","Shiyu Ma","Ziang Chen","Tong Xiao","Shimin Tao","Yilun Liu","Shenglin Zhang","Duoming Lin","Changchang Liu","Yuzhe Cai","Weibin Meng","Yongqian Sun","Dan Pei"],"pdf_url":"https://arxiv.org/pdf/2407.01896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01892v1","updated":"2024-07-02T02:27:46Z","published":"2024-07-02T02:27:46Z","title":"GRASP: A Grid-Based Benchmark for Evaluating Commonsense Spatial\n Reasoning","summary":" Spatial reasoning, an important faculty of human cognition with many\npractical applications, is one of the core commonsense skills that is not\npurely language-based and, for satisfying (as opposed to optimal) solutions,\nrequires some minimum degree of planning. Existing benchmarks of Commonsense\nSpatial Reasoning (CSR) tend to evaluate how Large Language Models (LLMs)\ninterpret text-based spatial descriptions rather than directly evaluate a plan\nproduced by the LLM in response to a spatial reasoning scenario. In this paper,\nwe construct a large-scale benchmark called $\\textbf{GRASP}$, which consists of\n16,000 grid-based environments where the agent is tasked with an energy\ncollection problem. These environments include 100 grid instances instantiated\nusing each of the 160 different grid settings, involving five different energy\ndistributions, two modes of agent starting position, and two distinct obstacle\nconfigurations, as well as three kinds of agent constraints. Using GRASP, we\ncompare classic baseline approaches, such as random walk and greedy search\nmethods, with advanced LLMs like GPT-3.5-Turbo and GPT-4o. The experimental\nresults indicate that even these advanced LLMs struggle to consistently achieve\nsatisfactory solutions.\n","authors":["Zhisheng Tang","Mayank Kejriwal"],"pdf_url":"https://arxiv.org/pdf/2407.01892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01887v1","updated":"2024-07-02T02:18:14Z","published":"2024-07-02T02:18:14Z","title":"Beyond Numeric Awards: In-Context Dueling Bandits with LLM Agents","summary":" In-context decision-making is an important capability of artificial general\nintelligence, which Large Language Models (LLMs) have effectively demonstrated\nin various scenarios. However, LLMs often face challenges when dealing with\nnumerical contexts, and limited attention has been paid to evaluating their\nperformance through preference feedback generated by the environment. 
This\npaper investigates the performance of LLMs as decision-makers in the context of\nDueling Bandits (DB). We first evaluate the performance of LLMs by comparing\nGPT-3.5-Turbo, GPT-4, and GPT-4-Turbo against established DB algorithms. Our\nresults reveal that LLMs, particularly GPT-4 Turbo, quickly identify the\nCondorcet winner, thus outperforming existing state-of-the-art algorithms in\nterms of weak regret. Nevertheless, LLMs struggle to converge even when\nexplicitly prompted to do so, and are sensitive to prompt variations. To\novercome these issues, we introduce an LLM-augmented algorithm, IF-Enhanced\nLLM, which takes advantage of both in-context decision-making capabilities of\nLLMs and theoretical guarantees inherited from classic DB algorithms. The\ndesign of such an algorithm sheds light on how to enhance trustworthiness for\nLLMs used in decision-making tasks where performance robustness matters. We\nshow that IF-Enhanced LLM has theoretical guarantees on both weak and strong\nregret. Our experimental results validate that IF-Enhanced LLM is robust even\nwith noisy and adversarial prompts.\n","authors":["Fanzeng Xia","Hao Liu","Yisong Yue","Tongxin Li"],"pdf_url":"https://arxiv.org/pdf/2407.01887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09559v3","updated":"2024-07-02T02:16:28Z","published":"2023-11-16T04:29:41Z","title":"Prompt-based Pseudo-labeling Strategy for Sample-Efficient\n Semi-Supervised Extractive Summarization","summary":" Semi-supervised learning (SSL) is a widely used technique in scenarios where\nlabeled data is scarce and unlabeled data is abundant. While SSL is popular for\nimage and text classification, it is relatively underexplored for the task of\nextractive text summarization. Standard SSL methods follow a teacher-student\nparadigm to first train a classification model and then use the classifier's\nconfidence values to select pseudo-labels for the subsequent training cycle;\nhowever, such classifiers are not suitable to measure the accuracy of\npseudo-labels as they lack specific tuning for evaluation, which leads to\nconfidence values that fail to capture the semantics and correctness of the\ngenerated summary. To address this problem, we propose a prompt-based\npseudo-labeling strategy with LLMs that picks unlabeled examples with more\naccurate pseudo-labels than using just the classifier's probability outputs.\nOur approach also includes a relabeling mechanism that improves the quality of\npseudo-labels. We evaluate our method on three text summarization datasets:\nTweetSumm, WikiHow, and ArXiv/PubMed. We empirically show that a\nprompting-based LLM that scores and generates pseudo-labels outperforms\nexisting SSL methods on ROUGE-1, ROUGE-2, and ROUGE-L scores on all the\ndatasets. Furthermore, our method achieves competitive L-Eval scores\n(evaluation with LLaMa-3) as a fully supervised method in a data-scarce setting\nand outperforms fully supervised method in a data-abundant setting.\n","authors":["Gaurav Sahu","Olga Vechtomova","Issam H. Laradji"],"pdf_url":"https://arxiv.org/pdf/2311.09559v3.pdf","comment":"8 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.08787v4","updated":"2024-07-02T02:15:50Z","published":"2024-02-13T20:51:58Z","title":"Rethinking Machine Unlearning for Large Language Models","summary":" We explore machine unlearning (MU) in the domain of large language models\n(LLMs), referred to as LLM unlearning. 
This initiative aims to eliminate\nundesirable data influence (e.g., sensitive or illegal information) and the\nassociated model capabilities, while maintaining the integrity of essential\nknowledge generation and not affecting causally unrelated information. We\nenvision LLM unlearning becoming a pivotal element in the life-cycle management\nof LLMs, potentially standing as an essential foundation for developing\ngenerative AI that is not only safe, secure, and trustworthy, but also\nresource-efficient without the need of full retraining. We navigate the\nunlearning landscape in LLMs from conceptual formulation, methodologies,\nmetrics, and applications. In particular, we highlight the often-overlooked\naspects of existing LLM unlearning research, e.g., unlearning scope, data-model\ninteraction, and multifaceted efficacy assessment. We also draw connections\nbetween LLM unlearning and related areas such as model editing, influence\nfunctions, model explanation, adversarial training, and reinforcement learning.\nFurthermore, we outline an effective assessment framework for LLM unlearning\nand explore its applications in copyright and privacy safeguards and\nsociotechnical harm reduction.\n","authors":["Sijia Liu","Yuanshun Yao","Jinghan Jia","Stephen Casper","Nathalie Baracaldo","Peter Hase","Yuguang Yao","Chris Yuhao Liu","Xiaojun Xu","Hang Li","Kush R. Varshney","Mohit Bansal","Sanmi Koyejo","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.08787v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01885v1","updated":"2024-07-02T02:14:42Z","published":"2024-07-02T02:14:42Z","title":"Survey on Knowledge Distillation for Large Language Models: Methods,\n Evaluation, and Application","summary":" Large Language Models (LLMs) have showcased exceptional capabilities in\nvarious domains, attracting significant interest from both academia and\nindustry. Despite their impressive performance, the substantial size and\ncomputational demands of LLMs pose considerable challenges for practical\ndeployment, particularly in environments with limited resources. The endeavor\nto compress language models while maintaining their accuracy has become a focal\npoint of research. Among the various methods, knowledge distillation has\nemerged as an effective technique to enhance inference speed without greatly\ncompromising performance. This paper presents a thorough survey from three\naspects: method, evaluation, and application, exploring knowledge distillation\ntechniques tailored specifically for LLMs. Specifically, we divide the methods\ninto white-box KD and black-box KD to better illustrate their differences.\nFurthermore, we also explored the evaluation tasks and distillation effects\nbetween different distillation methods, and proposed directions for future\nresearch. Through in-depth understanding of the latest advancements and\npractical applications, this survey provides valuable resources for\nresearchers, paving the way for sustained progress in this field.\n","authors":["Chuanpeng Yang","Wang Lu","Yao Zhu","Yidong Wang","Qian Chen","Chenlong Gao","Bingjie Yan","Yiqiang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01885v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2406.09411v2","updated":"2024-07-02T01:56:14Z","published":"2024-06-13T17:59:52Z","title":"MuirBench: A Comprehensive Benchmark for Robust Multi-image\n Understanding","summary":" We introduce MuirBench, a comprehensive benchmark that focuses on robust\nmulti-image understanding capabilities of multimodal LLMs. 
MuirBench consists\nof 12 diverse multi-image tasks (e.g., scene understanding, ordering) that\ninvolve 10 categories of multi-image relations (e.g., multiview, temporal\nrelations). Comprising 11,264 images and 2,600 multiple-choice questions,\nMuirBench is created in a pairwise manner, where each standard instance is\npaired with an unanswerable variant that has minimal semantic differences, in\norder for a reliable assessment. Evaluated upon 20 recent multi-modal LLMs, our\nresults reveal that even the best-performing models like GPT-4o and Gemini Pro\nfind it challenging to solve MuirBench, achieving 68.0% and 49.3% in accuracy.\nOpen-source multimodal LLMs trained on single images can hardly generalize to\nmulti-image questions, hovering below 33.3% in accuracy. These results\nhighlight the importance of MuirBench in encouraging the community to develop\nmultimodal LLMs that can look beyond a single image, suggesting potential\npathways for future improvements.\n","authors":["Fei Wang","Xingyu Fu","James Y. Huang","Zekun Li","Qin Liu","Xiaogeng Liu","Mingyu Derek Ma","Nan Xu","Wenxuan Zhou","Kai Zhang","Tianyi Lorena Yan","Wenjie Jacky Mo","Hsiang-Hui Liu","Pan Lu","Chunyuan Li","Chaowei Xiao","Kai-Wei Chang","Dan Roth","Sheng Zhang","Hoifung Poon","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.09411v2.pdf","comment":"typos corrected, references added, Project Page:\n https://muirbench.github.io/"},{"id":"http://arxiv.org/abs/2401.12963v2","updated":"2024-07-02T01:52:26Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks. However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. 
We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v2.pdf","comment":"26 pages, 9 figures, ICRA 2024 VLMNM Workshop"},{"id":"http://arxiv.org/abs/2305.13712v3","updated":"2024-07-02T01:39:50Z","published":"2023-05-23T05:59:21Z","title":"Knowledge of Knowledge: Exploring Known-Unknowns Uncertainty with Large\n Language Models","summary":" This paper investigates the capabilities of Large Language Models (LLMs) in\nthe context of understanding their knowledge and uncertainty over questions.\nSpecifically, we focus on addressing known-unknown questions, characterized by\nhigh uncertainty due to the absence of definitive answers. To facilitate our\nstudy, we collect a new dataset with Known-Unknown Questions (KUQ) and\nestablish a categorization framework to clarify the origins of uncertainty in\nsuch queries. Subsequently, we examine the performance of open-source LLMs,\nfine-tuned using this dataset, in distinguishing between known and unknown\nqueries within open-ended question-answering scenarios. The fine-tuned models\ndemonstrated a significant improvement, achieving a considerable increase in\nF1-score relative to their pre-fine-tuning state. Through a comprehensive\nanalysis, we reveal insights into the models' improved uncertainty articulation\nand their consequent efficacy in multi-agent debates. These findings help us\nunderstand how LLMs can be trained to identify and express uncertainty,\nimproving our knowledge of how they understand and express complex or unclear\ninformation.\n","authors":["Alfonso Amayuelas","Kyle Wong","Liangming Pan","Wenhu Chen","William Wang"],"pdf_url":"https://arxiv.org/pdf/2305.13712v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01878v1","updated":"2024-07-02T01:37:56Z","published":"2024-07-02T01:37:56Z","title":"Compare without Despair: Reliable Preference Evaluation with Generation\n Separability","summary":" Human evaluation of generated language through pairwise preference judgments\nis pervasive. However, under common scenarios, such as when generations from a\nmodel pair are very similar, or when stochastic decoding results in large\nvariations in generations, it results in inconsistent preference ratings. We\naddress these challenges by introducing a meta-evaluation measure,\nseparability, which estimates how suitable a test instance is for pairwise\npreference evaluation. For a candidate test instance, separability samples\nmultiple generations from a pair of models, and measures how distinguishable\nthe two sets of generations are. Our experiments show that instances with high\nseparability values yield more consistent preference ratings from both human-\nand auto-raters. Further, the distribution of separability allows insights into\nwhich test benchmarks are more valuable for comparing models. 
Finally, we\nincorporate separability into ELO ratings, accounting for how suitable each\ntest instance might be for reliably ranking LLMs. Overall, separability has\nimplications for consistent, efficient and robust preference evaluation of LLMs\nwith both human- and auto-raters.\n","authors":["Sayan Ghosh","Tejas Srinivasan","Swabha Swayamdipta"],"pdf_url":"https://arxiv.org/pdf/2407.01878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01873v1","updated":"2024-07-02T01:17:01Z","published":"2024-07-02T01:17:01Z","title":"Automated Text Scoring in the Age of Generative AI for the GPU-poor","summary":" Current research on generative language models (GLMs) for automated text\nscoring (ATS) has focused almost exclusively on querying proprietary models via\nApplication Programming Interfaces (APIs). Yet such practices raise issues\naround transparency and security, and these methods offer little in the way of\nefficiency or customizability. With the recent proliferation of smaller,\nopen-source models, there is the option to explore GLMs with computers equipped\nwith modest, consumer-grade hardware, that is, for the \"GPU poor.\" In this\nstudy, we analyze the performance and efficiency of open-source, small-scale\nGLMs for ATS. Results show that GLMs can be fine-tuned to achieve adequate,\nthough not state-of-the-art, performance. In addition to ATS, we take small\nsteps towards analyzing models' capacity for generating feedback by prompting\nGLMs to explain their scores. Model-generated feedback shows promise, but\nrequires more rigorous evaluation focused on targeted use cases.\n","authors":["Christopher Michael Ormerod","Alexander Kwako"],"pdf_url":"https://arxiv.org/pdf/2407.01873v1.pdf","comment":"21 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.01863v1","updated":"2024-07-02T00:24:01Z","published":"2024-07-02T00:24:01Z","title":"VSP: Assessing the dual challenges of perception and reasoning in\n spatial planning tasks for VLMs","summary":" Vision language models (VLMs) are an exciting emerging class of language\nmodels (LMs) that have merged classic LM capabilities with those of image\nprocessing systems. However, the ways that these capabilities combine are not\nalways intuitive and warrant direct investigation. One understudied capability\nin VLMs is visual spatial planning -- the ability to comprehend the spatial\narrangements of objects and devise action plans to achieve desired outcomes in\nvisual scenes. In our study, we introduce VSP, a benchmark that 1) evaluates\nthe spatial planning capability in these models in general, and 2) breaks down\nthe visual planning task into finer-grained sub-tasks, including perception and\nreasoning, and measure the LMs capabilities in these sub-tasks. Our evaluation\nshows that both open-source and private VLMs fail to generate effective plans\nfor even simple spatial planning tasks. Evaluations on the fine-grained\nanalytical tasks further reveal fundamental deficiencies in the models' visual\nperception and bottlenecks in reasoning abilities, explaining their worse\nperformance in the general spatial planning tasks. Our work illuminates future\ndirections for improving VLMs' abilities in spatial planning. 
Our benchmark is\npublicly available at\nhttps://github.com/UCSB-NLP-Chang/Visual-Spatial-Planning.\n","authors":["Qiucheng Wu","Handong Zhao","Michael Saxon","Trung Bui","William Yang Wang","Yang Zhang","Shiyu Chang"],"pdf_url":"https://arxiv.org/pdf/2407.01863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16248v3","updated":"2024-07-02T23:34:35Z","published":"2023-10-24T23:45:57Z","title":"GlotLID: Language Identification for Low-Resource Languages","summary":" Several recent papers have published good solutions for language\nidentification (LID) for about 300 high-resource and medium-resource languages.\nHowever, there is no LID available that (i) covers a wide range of low-resource\nlanguages, (ii) is rigorously evaluated and reliable and (iii) efficient and\neasy to use. Here, we publish GlotLID-M, an LID model that satisfies the\ndesiderata of wide coverage, reliability and efficiency. It identifies 1665\nlanguages, a large increase in coverage compared to prior work. In our\nexperiments, GlotLID-M outperforms four baselines (CLD3, FT176, OpenLID and\nNLLB) when balancing F1 and false positive rate (FPR). We analyze the unique\nchallenges that low-resource LID poses: incorrect corpus metadata, leakage from\nhigh-resource languages, difficulty separating closely related languages,\nhandling of macrolanguage vs varieties and in general noisy data. We hope that\nintegrating GlotLID-M into dataset creation pipelines will improve quality and\nenhance accessibility of NLP technology for low-resource languages and\ncultures. GlotLID-M model (including future versions), code, and list of data\nsources are available: https://github.com/cisnlp/GlotLID.\n","authors":["Amir Hossein Kargaran","Ayyoob Imani","François Yvon","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2310.16248v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2407.02694v1","updated":"2024-07-02T22:23:40Z","published":"2024-07-02T22:23:40Z","title":"LLM-Select: Feature Selection with Large Language Models","summary":" In this paper, we demonstrate a surprising capability of large language\nmodels (LLMs): given only input feature names and a description of a prediction\ntask, they are capable of selecting the most predictive features, with\nperformance rivaling the standard tools of data science. Remarkably, these\nmodels exhibit this capacity across various query mechanisms. For example, we\nzero-shot prompt an LLM to output a numerical importance score for a feature\n(e.g., \"blood pressure\") in predicting an outcome of interest (e.g., \"heart\nfailure\"), with no additional context. In particular, we find that the latest\nmodels, such as GPT-4, can consistently identify the most predictive features\nregardless of the query mechanism and across various prompting strategies. We\nillustrate these findings through extensive experiments on real-world data,\nwhere we show that LLM-based feature selection consistently achieves strong\nperformance competitive with data-driven methods such as the LASSO, despite\nnever having looked at the downstream training data. Our findings suggest that\nLLMs may be useful not only for selecting the best features for training but\nalso for deciding which features to collect in the first place. This could\npotentially benefit practitioners in domains like healthcare, where collecting\nhigh-quality data comes at a high cost.\n","authors":["Daniel P. Jeong","Zachary C. 
Lipton","Pradeep Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2407.02694v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.02678v1","updated":"2024-07-02T21:39:53Z","published":"2024-07-02T21:39:53Z","title":"Reasoning in Large Language Models: A Geometric Perspective","summary":" The advancement of large language models (LLMs) for real-world applications\nhinges critically on enhancing their reasoning capabilities. In this work, we\nexplore the reasoning abilities of large language models (LLMs) through their\ngeometrical understanding. We establish a connection between the expressive\npower of LLMs and the density of their self-attention graphs. Our analysis\ndemonstrates that the density of these graphs defines the intrinsic dimension\nof the inputs to the MLP blocks. We demonstrate through theoretical analysis\nand toy examples that a higher intrinsic dimension implies a greater expressive\ncapacity of the LLM. We further provide empirical evidence linking this\ngeometric framework to recent advancements in methods aimed at enhancing the\nreasoning capabilities of LLMs.\n","authors":["Romain Cosentino","Sarath Shekkizhar"],"pdf_url":"https://arxiv.org/pdf/2407.02678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03832v3","updated":"2024-07-02T21:23:32Z","published":"2024-05-06T20:30:14Z","title":"Guylingo: The Republic of Guyana Creole Corpora","summary":" While major languages often enjoy substantial attention and resources, the\nlinguistic diversity across the globe encompasses a multitude of smaller,\nindigenous, and regional languages that lack the same level of computational\nsupport. One such region is the Caribbean. While commonly labeled as \"English\nspeaking\", the ex-British Caribbean region consists of a myriad of Creole\nlanguages thriving alongside English. In this paper, we present Guylingo: a\ncomprehensive corpus designed for advancing NLP research in the domain of\nCreolese (Guyanese English-lexicon Creole), the most widely spoken language in\nthe culturally rich nation of Guyana. We first outline our framework for\ngathering and digitizing this diverse corpus, inclusive of colloquial\nexpressions, idioms, and regional variations in a low-resource language. We\nthen demonstrate the challenges of training and evaluating NLP models for\nmachine translation in Creole. Lastly, we discuss the unique opportunities\npresented by recent NLP advancements for accelerating the formal adoption of\nCreole languages as official languages in the Caribbean.\n","authors":["Christopher Clarke","Roland Daynauth","Charlene Wilkinson","Hubert Devonish","Jason Mars"],"pdf_url":"https://arxiv.org/pdf/2405.03832v3.pdf","comment":"Accepted to NAACL 2024 Main Conference Special Theme Track: Languages\n of Latin America and The Caribbean"},{"id":"http://arxiv.org/abs/2407.02662v1","updated":"2024-07-02T20:51:06Z","published":"2024-07-02T20:51:06Z","title":"Supporters and Skeptics: LLM-based Analysis of Engagement with Mental\n Health (Mis)Information Content on Video-sharing Platforms","summary":" Over one in five adults in the US lives with a mental illness. In the face of\na shortage of mental health professionals and offline resources, online\nshort-form video content has grown to serve as a crucial conduit for\ndisseminating mental health help and resources. However, the ease of content\ncreation and access also contributes to the spread of misinformation, posing\nrisks to accurate diagnosis and treatment. 
Detecting and understanding\nengagement with such content is crucial to mitigating their harmful effects on\npublic health. We perform the first quantitative study of the phenomenon using\nYouTube Shorts and Bitchute as the sites of study. We contribute MentalMisinfo,\na novel labeled mental health misinformation (MHMisinfo) dataset of 739 videos\n(639 from YouTube and 100 from Bitchute) and 135372 comments in total, using an\nexpert-driven annotation schema. We first found that few-shot in-context\nlearning with large language models (LLMs) is effective in detecting MHMisinfo\nvideos. Next, we discover distinct and potentially alarming linguistic patterns\nin how audiences engage with MHMisinfo videos through commentary on both\nvideo-sharing platforms. Across the two platforms, comments could exacerbate\nprevailing stigma with some groups showing heightened susceptibility to and\nalignment with MHMisinfo. We discuss technical and public health-driven\nadaptive solutions to tackling the \"epidemic\" of mental health misinformation\nonline.\n","authors":["Viet Cuong Nguyen","Mini Jain","Abhijat Chauhan","Heather Jaime Soled","Santiago Alvarez Lesmes","Zihang Li","Michael L. Birnbaum","Sunny X. Tang","Srijan Kumar","Munmun De Choudhury"],"pdf_url":"https://arxiv.org/pdf/2407.02662v1.pdf","comment":"12 pages, in submission to ICWSM"},{"id":"http://arxiv.org/abs/2407.02659v1","updated":"2024-07-02T20:49:21Z","published":"2024-07-02T20:49:21Z","title":"Ensuring Responsible Sourcing of Large Language Model Training Data\n Through Knowledge Graph Comparison","summary":" In light of recent plagiarism allegations brought by publishers, newspapers,\nand other creators of copyrighted corpora against large language model (LLM)\ndevelopers, we propose a novel system, a variant of a plagiarism detection\nsystem, that assesses whether a knowledge source has been used in the training\nor fine-tuning of a large language model. Unlike current methods, we utilize an\napproach that uses Resource Description Framework (RDF) triples to create\nknowledge graphs from both a source document and an LLM continuation of that\ndocument. These graphs are then analyzed with respect to content using cosine\nsimilarity and with respect to structure using a normalized version of graph\nedit distance that shows the degree of isomorphism. Unlike traditional systems\nthat focus on content matching and keyword identification between a source and\ntarget corpus, our approach enables a broader evaluation of similarity and thus\na more accurate comparison of the similarity between a source document and LLM\ncontinuation by focusing on relationships between ideas and their organization\nwith regards to others. Additionally, our approach does not require access to\nLLM metrics like perplexity that may be unavailable in closed large language\nmodeling \"black-box\" systems, as well as the training corpus. A prototype of\nour system will be found on a hyperlinked GitHub repository.\n","authors":["Devam Mondal","Carlo Lipizzi"],"pdf_url":"https://arxiv.org/pdf/2407.02659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02646v1","updated":"2024-07-02T20:28:16Z","published":"2024-07-02T20:28:16Z","title":"A Practical Review of Mechanistic Interpretability for Transformer-Based\n Language Models","summary":" Mechanistic interpretability (MI) is an emerging sub-field of\ninterpretability that seeks to understand a neural network model by\nreverse-engineering its internal computations. 
Recently, MI has garnered\nsignificant attention for interpreting transformer-based language models (LMs),\nresulting in many novel insights yet introducing new challenges. However, there\nhas not been work that comprehensively reviews these insights and challenges,\nparticularly as a guide for newcomers to this field. To fill this gap, we\npresent a comprehensive survey outlining fundamental objects of study in MI,\ntechniques that have been used for its investigation, approaches for evaluating\nMI results, and significant findings and applications stemming from the use of\nMI to understand LMs. In particular, we present a roadmap for beginners to\nnavigate the field and leverage MI for their benefit. Finally, we also identify\ncurrent gaps in the field and discuss potential future directions.\n","authors":["Daking Rai","Yilun Zhou","Shi Feng","Abulhair Saparov","Ziyu Yao"],"pdf_url":"https://arxiv.org/pdf/2407.02646v1.pdf","comment":"11 pages, 11 figures, Preprint"},{"id":"http://arxiv.org/abs/2407.02637v1","updated":"2024-07-02T20:09:11Z","published":"2024-07-02T20:09:11Z","title":"Change My Frame: Reframing in the Wild in r/ChangeMyView","summary":" Recent work in reframing, within the scope of text style transfer, has so far\nmade use of out-of-context, task-prompted utterances in order to produce\nneutralizing or optimistic reframes. Our work aims to generalize reframing\nbased on the subreddit r/ChangeMyView (CMV). We build a dataset that leverages\nCMV's community's interactions and conventions to identify high-value,\ncommunity-recognized utterances that produce changes of perspective. With this\ndata, we widen the scope of the direction of reframing since the changes in\nperspective do not only occur in neutral or positive directions. We fine tune\ntransformer-based models, make use of a modern LLM to refine our dataset, and\nexplore challenges in the dataset creation and evaluation around this type of\nreframing.\n","authors":["Arturo Martínez Peguero","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.02637v1.pdf","comment":"3 pages, NAACL 2024 workshop"},{"id":"http://arxiv.org/abs/2406.04240v4","updated":"2024-07-02T19:51:54Z","published":"2024-06-06T16:39:00Z","title":"Hypernetworks for Personalizing ASR to Atypical Speech","summary":" Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech\nrecognition (ASR) has recently shown promise for adapting general population\nmodels to atypical speech. However, these approaches assume a priori knowledge\nof the atypical speech disorder being adapted for -- the diagnosis of which\nrequires expert knowledge that is not always available. Even given this\nknowledge, data scarcity and high inter/intra-speaker variability further limit\nthe effectiveness of traditional fine-tuning. To circumvent these challenges,\nwe first identify the minimal set of model parameters required for ASR\nadaptation. Our analysis of each individual parameter's effect on adaptation\nperformance allows us to reduce Word Error Rate (WER) by half while adapting\n0.03% of all weights. Alleviating the need for cohort-specific models, we next\npropose the novel use of a meta-learned hypernetwork to generate highly\nindividualized, utterance-level adaptations on-the-fly for a diverse set of\natypical speech characteristics. 
Evaluating adaptation at the global, cohort,\nand individual levels, we show that hypernetworks generalize better to\nout-of-distribution speakers, while maintaining an overall relative WER\nreduction of 75.2% using 0.1% of the full parameter budget.\n","authors":["Max Müller-Eberstein","Dianna Yee","Karren Yang","Gautam Varma Mantena","Colin Lea"],"pdf_url":"https://arxiv.org/pdf/2406.04240v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02631v1","updated":"2024-07-02T19:50:55Z","published":"2024-07-02T19:50:55Z","title":"Nollywood: Let's Go to the Movies!","summary":" Nollywood, based on the idea of Bollywood from India, is a series of\noutstanding movies that originate from Nigeria. Unfortunately, while the movies\nare in English, they are hard to understand for many native speakers due to the\ndialect of English that is spoken. In this article, we accomplish two goals:\n(1) create a phonetic sub-title model that is able to translate Nigerian\nEnglish speech to American English and (2) use the most advanced toxicity\ndetectors to discover how toxic the speech is. Our aim is to highlight the text\nin these videos, which is oftentimes ignored for lack of dialectal\nunderstanding due to the fact that many people in Nigeria speak a native\nlanguage like Hausa at home.\n","authors":["John E. Ortega","Ibrahim Said Ahmad","William Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02631v1.pdf","comment":"8 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.02623v1","updated":"2024-07-02T19:27:00Z","published":"2024-07-02T19:27:00Z","title":"Uplifting Lower-Income Data: Strategies for Socioeconomic Perspective\n Shifts in Vision-Language Models","summary":" To address this issue, we formulate translated non-English, geographic, and\nsocioeconomic integrated prompts and evaluate their impact on VL model\nperformance for data from different countries and income groups. Our findings\nshow that geographic and socioeconomic integrated prompts improve VL\nperformance on lower-income data and favor the retrieval of topic appearances\ncommonly found in data from low-income households. From our analyses, we\nidentify and highlight contexts where these strategies yield the most\nimprovements. Our model analysis code is publicly available at\nhttps://github.com/Anniejoan/Uplifting-Lower-income-data .\n","authors":["Joan Nwatu","Oana Ignat","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2407.02623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02604v1","updated":"2024-07-02T18:43:10Z","published":"2024-07-02T18:43:10Z","title":"D-Rax: Domain-specific Radiologic assistant leveraging multi-modal data\n and eXpert model predictions","summary":" Large vision language models (VLMs) have progressed incredibly from research\nto applicability for general-purpose use cases. LLaVA-Med, a pioneering large\nlanguage and vision assistant for biomedicine, can perform multi-modal\nbiomedical image and data analysis to provide a natural language interface for\nradiologists. While it is highly generalizable and works with multi-modal data,\nit is currently limited by well-known challenges that exist in the large\nlanguage model space. Hallucinations and imprecision in responses can lead to\nmisdiagnosis, which currently hinders the clinical adaptability of VLMs. To\ncreate precise, user-friendly models in healthcare, we propose D-Rax -- a\ndomain-specific, conversational, radiologic assistance tool that can be used to\ngain insights about a particular radiologic image. 
In this study, we enhance\nthe conversational analysis of chest X-ray (CXR) images to support radiological\nreporting, offering comprehensive insights from medical imaging and aiding in\nthe formulation of accurate diagnoses. D-Rax is achieved by fine-tuning the\nLLaVA-Med architecture on our curated enhanced instruction-following data,\ncomprising images, instructions, disease diagnosis and\ndemographic predictions derived from MIMIC-CXR imaging data, CXR-related visual\nquestion answer (VQA) pairs, and predictive outcomes from multiple expert AI\nmodels. We observe statistically significant improvement in responses when\nevaluated for both open- and closed-ended conversations. Leveraging the power of\nstate-of-the-art diagnostic models combined with VLMs, D-Rax empowers\nclinicians to interact with medical images using natural language, which could\npotentially streamline their decision-making process, enhance diagnostic\naccuracy, and conserve their time.\n","authors":["Hareem Nisar","Syed Muhammad Anwar","Zhifan Jiang","Abhijeet Parida","Vishwesh Nath","Holger R. Roth","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2407.02604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02596v1","updated":"2024-07-02T18:33:49Z","published":"2024-07-02T18:33:49Z","title":"Towards More Realistic Extraction Attacks: An Adversarial Perspective","summary":" Language models are prone to memorizing large parts of their training data,\nmaking them vulnerable to extraction attacks. Existing research on these\nattacks remains limited in scope, often studying isolated trends rather than\nthe real-world interactions with these models. In this paper, we revisit\nextraction attacks from an adversarial perspective, exploiting the brittleness\nof language models. We find significant churn in extraction attack trends,\ni.e., even minor, unintuitive changes to the prompt, or targeting smaller\nmodels and older checkpoints, can exacerbate the risks of extraction by up to\n$2-4 \\times$. Moreover, relying solely on the widely accepted verbatim match\nunderestimates the extent of extracted information, and we provide various\nalternatives to more accurately capture the true risks of extraction. We\nconclude our discussion with data deduplication, a commonly suggested\nmitigation strategy, and find that while it addresses some memorization\nconcerns, it remains vulnerable to the same escalation of extraction risks\nagainst a real-world adversary. Our findings highlight the necessity of\nacknowledging an adversary's true capabilities to avoid underestimating\nextraction risks.\n","authors":["Yash More","Prakhar Ganesh","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2407.02596v1.pdf","comment":"To be presented at PrivateNLP@ACL2024"},{"id":"http://arxiv.org/abs/2402.05359v6","updated":"2024-07-02T18:18:18Z","published":"2024-02-08T02:37:30Z","title":"An Examination on the Effectiveness of Divide-and-Conquer Prompting in\n Large Language Models","summary":" Foundation models, such as Large Language Models (LLMs), have attracted\na significant amount of interest due to their large number of applications.\nHowever, when handling tasks involving repetitive sub-tasks and/or deceptive\ncontents, such as arithmetic calculation and article-level fake news detection,\nsimple instructional prompts suffer from inaccurate responses. Existing works\nshow that more complicated prompting strategies, such as Chain-of-Thoughts and\nLeast-to-Most, can unlock LLMs' powerful capacity in diverse areas. 
Recent\nresearch reveals that a simple divide-and-conquer prompting strategy, i.e.,\nsimply dividing the input sequence into multiple sub-inputs, can also\nsubstantially improve LLM performance on some specific tasks such as\nmisinformation detection. In this paper, we aim to examine the utility of the\ndivide-and-conquer prompting strategy and determine on which kinds of tasks this\nstrategy offers advantages. Specifically, we provide a theoretical analysis of the\ndivide-and-conquer prompting strategy that helps us identify the specific tasks\nwhere DaC prompting can bring a performance boost with a theoretical guarantee. We\nthen present two cases (large integer arithmetic and fact verification) where the\nexperimental results align with our theoretical analysis.\n","authors":["Yizhou Zhang","Lun Du","Defu Cao","Qiang Fu","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2402.05359v6.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.02552v1","updated":"2024-07-02T17:42:30Z","published":"2024-07-02T17:42:30Z","title":"RLHF Can Speak Many Languages: Unlocking Multilingual Preference\n Optimization for LLMs","summary":" Preference optimization techniques have become a standard final stage for\ntraining state-of-the-art large language models (LLMs). However, despite widespread\nadoption, the vast majority of work to date has focused on first-class citizen\nlanguages like English and Chinese. This not only covers a small fraction of the\nlanguages in the world, but also makes it unclear which aspects of current\nstate-of-the-art research transfer to a multilingual setting. In this work, we\nperform an exhaustive study to achieve a new state-of-the-art in aligning\nmultilingual LLMs. We introduce a novel, scalable method for generating\nhigh-quality multilingual feedback data to balance data coverage. We establish\nthe benefits of cross-lingual transfer and increased dataset size in preference\ntraining. Our preference-trained model achieves a 54.4% win-rate against Aya 23\n8B, the current state-of-the-art multilingual LLM in its parameter class, and a\n69.5% win-rate or higher against widely used models like Gemma-1.1-7B-it,\nLlama-3-8B-Instruct, and Mistral-7B-Instruct-v0.3. As a result of our study, we\nexpand the frontier of alignment techniques to 23 languages covering half of\nthe world's population.\n","authors":["John Dang","Arash Ahmadian","Kelly Marchisio","Julia Kreutzer","Ahmet Üstün","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2407.02552v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.02489v1","updated":"2024-07-02T17:59:50Z","published":"2024-07-02T17:59:50Z","title":"Magic Insert: Style-Aware Drag-and-Drop","summary":" We present Magic Insert, a method for dragging-and-dropping subjects from a\nuser-provided image into a target image of a different style in a physically\nplausible manner while matching the style of the target image. This work\nformalizes the problem of style-aware drag-and-drop and presents a method for\ntackling it by addressing two sub-problems: style-aware personalization and\nrealistic object insertion in stylized images. For style-aware personalization,\nour method first fine-tunes a pretrained text-to-image diffusion model using\nLoRA and learned text tokens on the subject image, and then infuses it with a\nCLIP representation of the target style. For object insertion, we use\nBootstrapped Domain Adaption to adapt a domain-specific photorealistic object\ninsertion model to the domain of diverse artistic styles. 
Overall, the method\nsignificantly outperforms traditional approaches such as inpainting. Finally,\nwe present a dataset, SubjectPlop, to facilitate evaluation and future progress\nin this area. Project page: https://magicinsert.github.io/\n","authors":["Nataniel Ruiz","Yuanzhen Li","Neal Wadhwa","Yael Pritch","Michael Rubinstein","David E. Jacobs","Shlomi Fruchter"],"pdf_url":"https://arxiv.org/pdf/2407.02489v1.pdf","comment":"Project page: https://magicinsert.github.io/"},{"id":"http://arxiv.org/abs/2407.02484v1","updated":"2024-07-02T17:58:58Z","published":"2024-07-02T17:58:58Z","title":"Characterizing the Interpretability of Attention Maps in Digital\n Pathology","summary":" Interpreting machine learning model decisions is crucial for high-risk\napplications like healthcare. In digital pathology, large whole slide images\n(WSIs) are decomposed into smaller tiles and tile-derived features are\nprocessed by attention-based multiple instance learning (ABMIL) models to\npredict WSI-level labels. These networks generate tile-specific attention\nweights, which can be visualized as attention maps for interpretability.\nHowever, a standardized evaluation framework for these maps is lacking,\nquestioning their reliability and ability to detect spurious correlations that\ncan mislead models. We herein propose a framework to assess the ability of\nattention networks to attend to relevant features in digital pathology by\ncreating artificial model confounders and using dedicated interpretability\nmetrics. Models are trained and evaluated on data with tile modifications\ncorrelated with WSI labels, enabling the analysis of model sensitivity to\nartificial confounders and the accuracy of attention maps in highlighting them.\nConfounders are introduced either through synthetic tile modifications or\nthrough tile ablations based on their specific image-based features, with the\nlatter being used to assess more clinically relevant scenarios. We also analyze\nthe impact of varying confounder quantities at both the tile and WSI levels.\nOur results show that ABMIL models perform as desired within our framework.\nWhile attention maps generally highlight relevant regions, their robustness is\naffected by the type and number of confounders. Our versatile framework has the\npotential to be used in the evaluation of various methods and the exploration\nof image-based features driving model predictions, which could aid in biomarker\ndiscovery.\n","authors":["Tomé Albuquerque","Anil Yüce","Markus D. Herrmann","Alvaro Gomariz"],"pdf_url":"https://arxiv.org/pdf/2407.02484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02482v1","updated":"2024-07-02T17:58:07Z","published":"2024-07-02T17:58:07Z","title":"Boosting Consistency in Story Visualization with Rich-Contextual\n Conditional Diffusion Models","summary":" Recent research showcases the considerable potential of conditional diffusion\nmodels for generating consistent stories. However, current methods, which\npredominantly generate stories in an autoregressive and excessively\ncaption-dependent manner, often underrate the contextual consistency and\nrelevance of frames during sequential generation. To address this, we propose a\nnovel Rich-contextual Conditional Diffusion Models (RCDMs), a two-stage\napproach designed to enhance story generation's semantic consistency and\ntemporal consistency. 
Specifically, in the first stage, the frame-prior\ntransformer diffusion model is presented to predict the frame semantic\nembedding of the unknown clip by aligning the semantic correlations between the\ncaptions and frames of the known clip. The second stage establishes a robust\nmodel with rich contextual conditions, including reference images of the known\nclip, the predicted frame semantic embedding of the unknown clip, and text\nembeddings of all captions. By jointly injecting these rich contextual\nconditions at the image and feature levels, RCDMs can generate semantic and\ntemporal consistency stories. Moreover, RCDMs can generate consistent stories\nwith a single forward inference compared to autoregressive models. Our\nqualitative and quantitative results demonstrate that our proposed RCDMs\noutperform in challenging scenarios. The code and model will be available at\nhttps://github.com/muzishen/RCDMs.\n","authors":["Fei Shen","Hu Ye","Sibo Liu","Jun Zhang","Cong Wang","Xiao Han","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02477v1","updated":"2024-07-02T17:55:03Z","published":"2024-07-02T17:55:03Z","title":"Understanding Alignment in Multimodal LLMs: A Comprehensive Study","summary":" Preference alignment has become a crucial component in enhancing the\nperformance of Large Language Models (LLMs), yet its impact in Multimodal Large\nLanguage Models (MLLMs) remains comparatively underexplored. Similar to\nlanguage models, MLLMs for image understanding tasks encounter challenges like\nhallucination. In MLLMs, hallucination can occur not only by stating incorrect\nfacts but also by producing responses that are inconsistent with the image\ncontent. A primary objective of alignment for MLLMs is to encourage these\nmodels to align responses more closely with image information. Recently,\nmultiple works have introduced preference datasets for MLLMs and examined\ndifferent alignment methods, including Direct Preference Optimization (DPO) and\nProximal Policy Optimization (PPO). However, due to variations in datasets,\nbase model types, and alignment methods, it remains unclear which specific\nelements contribute most significantly to the reported improvements in these\nworks. In this paper, we independently analyze each aspect of preference\nalignment in MLLMs. We start by categorizing the alignment algorithms into two\ngroups, offline (such as DPO), and online (such as online-DPO), and show that\ncombining offline and online methods can improve the performance of the model\nin certain scenarios. We review a variety of published multimodal preference\ndatasets and discuss how the details of their construction impact model\nperformance. 
Based on these insights, we introduce a novel way of creating\nmultimodal preference data called Bias-Driven Hallucination Sampling (BDHS)\nthat needs neither additional annotation nor external models, and show that it\ncan achieve competitive performance to previously published alignment work for\nmultimodal models across a range of benchmarks.\n","authors":["Elmira Amirloo","Jean-Philippe Fauconnier","Christoph Roesmann","Christian Kerl","Rinu Boney","Yusu Qian","Zirui Wang","Afshin Dehghan","Yinfei Yang","Zhe Gan","Peter Grasch"],"pdf_url":"https://arxiv.org/pdf/2407.02477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14794v2","updated":"2024-07-02T17:53:43Z","published":"2024-06-20T23:51:32Z","title":"ImageFlowNet: Forecasting Multiscale Trajectories of Disease Progression\n with Irregularly-Sampled Longitudinal Medical Images","summary":" The forecasting of disease progression from images is a holy grail for\nclinical decision making. However, this task is complicated by the inherent\nhigh dimensionality, temporal sparsity and sampling irregularity in\nlongitudinal image acquisitions. Existing methods often rely on extracting\nhand-crafted features and performing time-series analysis in this vector space,\nleading to a loss of rich spatial information within the images. To overcome\nthese challenges, we introduce ImageFlowNet, a novel framework that learns\nlatent-space flow fields that evolve multiscale representations in joint\nembedding spaces using neural ODEs and SDEs to model disease progression in the\nimage domain. Notably, ImageFlowNet learns multiscale joint representation\nspaces by combining cohorts of patients together so that information can be\ntransferred between the patient samples. The dynamics then provide plausible\ntrajectories of progression, with the SDE providing alternative trajectories\nfrom the same starting point. We provide theoretical insights that support our\nformulation of ODEs, and motivate our regularizations involving high-level\nvisual features, latent space organization, and trajectory smoothness. We then\ndemonstrate ImageFlowNet's effectiveness through empirical evaluations on three\nlongitudinal medical image datasets depicting progression in retinal geographic\natrophy, multiple sclerosis, and glioblastoma.\n","authors":["Chen Liu","Ke Xu","Liangbo L. Shen","Guillaume Huguet","Zilong Wang","Alexander Tong","Danilo Bzdok","Jay Stewart","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2406.14794v2.pdf","comment":"Included reference to codebase. Added acknowledgements"},{"id":"http://arxiv.org/abs/2311.12560v2","updated":"2024-07-02T17:51:46Z","published":"2023-11-21T12:12:19Z","title":"Benchmarking bias: Expanding clinical AI model card to incorporate bias\n reporting of social and non-social factors","summary":" Clinical AI model reporting cards should be expanded to incorporate a broad\nbias reporting of both social and non-social factors. Non-social factors\nconsider the role of other factors, such as disease dependent, anatomic, or\ninstrument factors on AI model bias, which are essential to ensure safe\ndeployment.\n","authors":["Carolina A. M. 
Heming","Mohamed Abdalla","Shahram Mohanna","Monish Ahluwalia","Linglin Zhang","Hari Trivedi","MinJae Woo","Benjamin Fine","Judy Wawira Gichoya","Leo Anthony Celi","Laleh Seyyed-Kalantari"],"pdf_url":"https://arxiv.org/pdf/2311.12560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02455v1","updated":"2024-07-02T17:32:34Z","published":"2024-07-02T17:32:34Z","title":"SUPER: Seated Upper Body Pose Estimation using mmWave Radars","summary":" In industrial countries, adults spend a considerable amount of time sedentary\neach day at work, driving and during activities of daily living. Characterizing\nthe seated upper body human poses using mmWave radars is an important, yet\nunder-studied topic with many applications in human-machine interaction,\ntransportation and road safety. In this work, we devise SUPER, a framework for\nseated upper body human pose estimation that utilizes dual-mmWave radars in\nclose proximity. A novel masking algorithm is proposed to coherently fuse data\nfrom the radars to generate intensity and Doppler point clouds with\ncomplementary information for high-motion but small radar cross section areas\n(e.g., upper extremities) and low-motion but large RCS areas (e.g. torso). A\nlightweight neural network extracts both global and local features of upper\nbody and output pose parameters for the Skinned Multi-Person Linear (SMPL)\nmodel. Extensive leave-one-subject-out experiments on various motion sequences\nfrom multiple subjects show that SUPER outperforms a state-of-the-art baseline\nmethod by 30 -- 184%. We also demonstrate its utility in a simple downstream\ntask for hand-object interaction.\n","authors":["Bo Zhang","Zimeng Zhou","Boyu Jiang","Rong Zheng"],"pdf_url":"https://arxiv.org/pdf/2407.02455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02445v1","updated":"2024-07-02T17:21:47Z","published":"2024-07-02T17:21:47Z","title":"Meta 3D AssetGen: Text-to-Mesh Generation with High-Quality Geometry,\n Texture, and PBR Materials","summary":" We present Meta 3D AssetGen (AssetGen), a significant advancement in\ntext-to-3D generation which produces faithful, high-quality meshes with texture\nand material control. Compared to works that bake shading in the 3D object's\nappearance, AssetGen outputs physically-based rendering (PBR) materials,\nsupporting realistic relighting. AssetGen generates first several views of the\nobject with factored shaded and albedo appearance channels, and then\nreconstructs colours, metalness and roughness in 3D, using a deferred shading\nloss for efficient supervision. It also uses a sign-distance function to\nrepresent 3D shape more reliably and introduces a corresponding loss for direct\nshape supervision. This is implemented using fused kernels for high memory\nefficiency. After mesh extraction, a texture refinement transformer operating\nin UV space significantly improves sharpness and details. 
AssetGen achieves a 17%\nimprovement in Chamfer Distance and 40% in LPIPS over the best concurrent work\nfor few-view reconstruction, and a human preference of 72% over the best\nindustry competitors of comparable speed, including those that support PBR.\nProject page with generated assets: https://assetgen.github.io\n","authors":["Yawar Siddiqui","Tom Monnier","Filippos Kokkinos","Mahendra Kariya","Yanir Kleiman","Emilien Garreau","Oran Gafni","Natalia Neverova","Andrea Vedaldi","Roman Shapovalov","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2407.02445v1.pdf","comment":"Project Page: https://assetgen.github.io"},{"id":"http://arxiv.org/abs/2407.02439v1","updated":"2024-07-02T17:15:21Z","published":"2024-07-02T17:15:21Z","title":"Predicting Visual Attention in Graphic Design Documents","summary":" We present a model for predicting visual attention during the free viewing of\ngraphic design documents. While existing works on this topic have aimed at\npredicting static saliency of graphic designs, our work is the first attempt to\npredict both spatial attention and dynamic temporal order in which the document\nregions are fixated by gaze using a deep learning based model. We propose a\ntwo-stage model for predicting dynamic attention on such documents, with\nwebpages being our primary choice of document design for demonstration. In the\nfirst stage, we predict the saliency maps for each of the document components\n(e.g. logos, banners, texts, etc. for webpages) conditioned on the type of\ndocument layout. These component saliency maps are then jointly used to predict\nthe overall document saliency. In the second stage, we use these\nlayout-specific component saliency maps as the state representation for an\ninverse reinforcement learning model of fixation scanpath prediction during\ndocument viewing. To test our model, we collected a new dataset consisting of\neye movements from 41 people freely viewing 450 webpages (the largest dataset\nof its kind). Experimental results show that our model outperforms existing\nmodels in both saliency and scanpath prediction for webpages, and also\ngeneralizes very well to other graphic design documents such as comics,\nposters, mobile UIs, etc. and natural images.\n","authors":["Souradeep Chakraborty","Zijun Wei","Conor Kelton","Seoyoung Ahn","Aruna Balasubramanian","Gregory J. Zelinsky","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2407.02439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02437v1","updated":"2024-07-02T17:15:12Z","published":"2024-07-02T17:15:12Z","title":"Parameter Matching Attack: Enhancing Practical Applicability of\n Availability Attacks","summary":" The widespread use of personal data for training machine learning models\nraises significant privacy concerns, as individuals have limited control over\nhow their public data is subsequently utilized. Availability attacks have\nemerged as a means for data owners to safeguard their data by designing\nimperceptible perturbations that degrade model performance when incorporated\ninto training datasets. However, existing availability attacks exhibit\nlimitations in practical applicability, particularly when only a portion of the\ndata can be perturbed. To address this challenge, we propose a novel\navailability attack approach termed Parameter Matching Attack (PMA). PMA is the\nfirst availability attack that works when only a portion of data can be\nperturbed. 
PMA optimizes perturbations so that when the model is trained on a\nmixture of clean and perturbed data, the resulting model will approach a model\ndesigned to perform poorly. Experimental results across four datasets\ndemonstrate that PMA outperforms existing methods, achieving significant model\nperformance degradation when a part of the training data is perturbed. Our code\nis available in the supplementary.\n","authors":["Yu Zhe","Jun Sakuma"],"pdf_url":"https://arxiv.org/pdf/2407.02437v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2407.02430v1","updated":"2024-07-02T17:04:34Z","published":"2024-07-02T17:04:34Z","title":"Meta 3D TextureGen: Fast and Consistent Texture Generation for 3D\n Objects","summary":" The recent availability and adaptability of text-to-image models have sparked\na new era in many related domains that benefit from the learned text priors as\nwell as high-quality and fast generation capabilities, one of which is texture\ngeneration for 3D objects. Although recent texture generation methods achieve\nimpressive results by using text-to-image networks, the combination of global\nconsistency, quality, and speed, which is crucial for advancing texture\ngeneration to real-world applications, remains elusive. To that end, we\nintroduce Meta 3D TextureGen: a new feedforward method comprising two\nsequential networks aimed at generating high-quality and globally consistent\ntextures for arbitrary geometries of any complexity degree in less than 20\nseconds. Our method achieves state-of-the-art results in quality and speed by\nconditioning a text-to-image model on 3D semantics in 2D space and fusing them\ninto a complete and high-resolution UV texture map, as demonstrated by\nextensive qualitative and quantitative evaluations. In addition, we introduce a\ntexture enhancement network that is capable of up-scaling any texture by an\narbitrary ratio, producing 4k pixel resolution textures.\n","authors":["Raphael Bensadoun","Yanir Kleiman","Idan Azuri","Omri Harosh","Andrea Vedaldi","Natalia Neverova","Oran Gafni"],"pdf_url":"https://arxiv.org/pdf/2407.02430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10958v2","updated":"2024-07-02T16:57:06Z","published":"2023-10-17T03:11:30Z","title":"Enhancing Deep Neural Network Training Efficiency and Performance\n through Linear Prediction","summary":" Deep neural networks (DNN) have achieved remarkable success in various\nfields, including computer vision and natural language processing. However,\ntraining an effective DNN model still poses challenges. This paper aims to\npropose a method to optimize the training effectiveness of DNNs, with the goal\nof improving model performance. Firstly, based on the observation that the DNN\nparameters change according to certain laws during the training process, the potential of\nparameter prediction for improving model training efficiency and performance is\ndiscovered. Secondly, considering the magnitude of DNN model parameters,\nhardware limitations and characteristics of Stochastic Gradient Descent (SGD)\nfor noise tolerance, a Parameter Linear Prediction (PLP) method is exploited to\nperform DNN parameter prediction. Finally, validations are carried out on some\nrepresentative backbones. 
Experimental results show that, compared to normal\ntraining under the same training conditions and number of epochs, employing the\nproposed PLP method allows the optimal model to obtain, on average, about a 1%\naccuracy improvement and a 0.01 top-1/top-5 error reduction for VGG16, ResNet18,\nand GoogLeNet on the CIFAR-100 dataset, which shows the effectiveness of the\nproposed method on different DNN structures and validates its capacity to\nenhance DNN training efficiency and performance.\n","authors":["Hejie Ying","Mengmeng Song","Yaohong Tang","Shungen Xiao","Zimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.10958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02422v1","updated":"2024-07-02T16:49:01Z","published":"2024-07-02T16:49:01Z","title":"Close, But Not There: Boosting Geographic Distance Sensitivity in Visual\n Place Recognition","summary":" Visual Place Recognition (VPR) plays a critical role in many localization and\nmapping pipelines. It consists of retrieving the closest sample to a query\nimage, in a certain embedding space, from a database of geotagged references.\nThe image embedding is learned to effectively describe a place despite\nvariations in visual appearance, viewpoint, and geometric changes. In this\nwork, we formulate how limitations in the Geographic Distance Sensitivity of\ncurrent VPR embeddings result in a high probability of incorrectly sorting the\ntop-k retrievals, negatively impacting the recall. In order to address this\nissue in single-stage VPR, we propose a novel mining strategy, CliqueMining,\nthat selects positive and negative examples by sampling cliques from a graph of\nvisually similar images. Our approach boosts the sensitivity of VPR embeddings\nat small distance ranges, significantly improving the state of the art on\nrelevant benchmarks. In particular, we raise recall@1 from 75% to 82% in MSLS\nChallenge, and from 76% to 90% in Nordland. Models and code are available at\nhttps://github.com/serizba/cliquemining.\n","authors":["Sergio Izquierdo","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2407.02422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02418v1","updated":"2024-07-02T16:44:00Z","published":"2024-07-02T16:44:00Z","title":"AXIAL: Attention-based eXplainability for Interpretable Alzheimer's\n Localized Diagnosis using 2D CNNs on 3D MRI brain scans","summary":" This study presents an innovative method for Alzheimer's disease diagnosis\nusing 3D MRI designed to enhance the explainability of model decisions. Our\napproach adopts a soft attention mechanism, enabling 2D CNNs to extract\nvolumetric representations. At the same time, the importance of each slice in\ndecision-making is learned, allowing the generation of a voxel-level attention\nmap to produce an explainable MRI. To test our method and ensure the\nreproducibility of our results, we chose a standardized collection of MRI data\nfrom the Alzheimer's Disease Neuroimaging Initiative (ADNI). On this dataset,\nour method significantly outperforms state-of-the-art methods in (i)\ndistinguishing AD from cognitive normal (CN) with an accuracy of 0.856 and\nMatthew's correlation coefficient (MCC) of 0.712, representing improvements of\n2.4\\% and 5.3\\% respectively over the second-best, and (ii) the prognostic\ntask of discerning stable from progressive mild cognitive impairment (MCI) with\nan accuracy of 0.725 and MCC of 0.443, showing improvements of 10.2\\% and\n20.5\\% respectively over the second-best. 
We achieved this prognostic result by\nadopting a double transfer learning strategy, which enhanced sensitivity to\nmorphological changes and facilitated early-stage AD detection. With\nvoxel-level precision, our method identified which specific areas are being\npaid attention to, identifying these predominant brain regions: the\n\\emph{hippocampus}, the \\emph{amygdala}, the \\emph{parahippocampal}, and the\n\\emph{inferior lateral ventricles}. All these areas are clinically associated\nwith AD development. Furthermore, our approach consistently found the same\nAD-related areas across different cross-validation folds, proving its\nrobustness and precision in highlighting areas that align closely with known\npathological markers of the disease.\n","authors":["Gabriele Lozupone","Alessandro Bria","Francesco Fontanella","Claudio De Stefano"],"pdf_url":"https://arxiv.org/pdf/2407.02418v1.pdf","comment":"21 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.01456v2","updated":"2024-07-02T16:35:08Z","published":"2024-01-02T22:46:12Z","title":"ColorizeDiffusion: Adjustable Sketch Colorization with Reference Image\n and Text","summary":" Diffusion models have recently demonstrated their effectiveness in generating\nextremely high-quality images and are now utilized in a wide range of\napplications, including automatic sketch colorization. Although many methods\nhave been developed for guided sketch colorization, there has been limited\nexploration of the potential conflicts between image prompts and sketch inputs,\nwhich can lead to severe deterioration in the results. Therefore, this paper\nexhaustively investigates reference-based sketch colorization models that aim\nto colorize sketch images using reference color images. We specifically\ninvestigate two critical aspects of reference-based diffusion models: the\n\"distribution problem\", which is a major shortcoming compared to text-based\ncounterparts, and the capability in zero-shot sequential text-based\nmanipulation. We introduce two variations of an image-guided latent diffusion\nmodel utilizing different image tokens from the pre-trained CLIP image encoder\nand propose corresponding manipulation methods to adjust their results\nsequentially using weighted text inputs. We conduct comprehensive evaluations\nof our models through qualitative and quantitative experiments as well as a\nuser study.\n","authors":["Dingkun Yan","Liang Yuan","Erwin Wu","Yuma Nishioka","Issei Fujishiro","Suguru Saito"],"pdf_url":"https://arxiv.org/pdf/2401.01456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02411v1","updated":"2024-07-02T16:34:14Z","published":"2024-07-02T16:34:14Z","title":"Video Watermarking: Safeguarding Your Video from (Unauthorized)\n Annotations by Video-based LLMs","summary":" The advent of video-based Large Language Models (LLMs) has significantly\nenhanced video understanding. However, it has also raised some safety concerns\nregarding data protection, as videos can be more easily annotated, even without\nauthorization. This paper introduces Video Watermarking, a novel technique to\nprotect videos from unauthorized annotations by such video-based LLMs,\nespecially concerning the video content and description, in response to\nspecific queries. By imperceptibly embedding watermarks into key video frames\nwith multi-modal flow-based losses, our method preserves the viewing experience\nwhile preventing misuse by video-based LLMs. 
Extensive experiments show that\nVideo Watermarking significantly reduces the comprehensibility of videos with\nvarious video-based LLMs, demonstrating both stealth and robustness. In\nessence, our method provides a solution for securing video content, ensuring\nits integrity and confidentiality in the face of evolving video-based LLMs\ntechnologies.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2407.02411v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.13507"},{"id":"http://arxiv.org/abs/2407.02405v1","updated":"2024-07-02T16:24:57Z","published":"2024-07-02T16:24:57Z","title":"Tiny-PULP-Dronets: Squeezing Neural Networks for Faster and Lighter\n Inference on Multi-Tasking Autonomous Nano-Drones","summary":" Pocket-sized autonomous nano-drones can revolutionize many robotic use cases,\nsuch as visual inspection in narrow, constrained spaces, and ensure safer\nhuman-robot interaction due to their tiny form factor and weight -- i.e., tens\nof grams. This compelling vision is challenged by the high level of\nintelligence needed aboard, which clashes against the limited computational and\nstorage resources available on PULP (parallel-ultra-low-power) MCU class\nnavigation and mission controllers that can be hosted aboard. This work moves\nfrom PULP-Dronet, a State-of-the-Art convolutional neural network for\nautonomous navigation on nano-drones. We introduce Tiny-PULP-Dronet: a novel\nmethodology to squeeze by more than one order of magnitude model size (50x\nfewer parameters), and number of operations (27x less multiply-and-accumulate)\nrequired to run inference with similar flight performance as PULP-Dronet. This\nmassive reduction paves the way towards affordable multi-tasking on\nnano-drones, a fundamental requirement for achieving high-level intelligence.\n","authors":["Lorenzo Lamberti","Vlad Niculescu","Michał Barcis","Lorenzo Bellone","Enrico Natalizio","Luca Benini","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2407.02405v1.pdf","comment":"3 Figures, 1 table. Accepted for publication at IEEE Artificial\n Intelligence Circuits and Systems (AICAS), 2022"},{"id":"http://arxiv.org/abs/2407.02403v1","updated":"2024-07-02T16:21:44Z","published":"2024-07-02T16:21:44Z","title":"Face Reconstruction Transfer Attack as Out-of-Distribution\n Generalization","summary":" Understanding the vulnerability of face recognition systems to malicious\nattacks is of critical importance. Previous works have focused on\nreconstructing face images that can penetrate a targeted verification system.\nEven in the white-box scenario, however, naively reconstructed images\nmisrepresent the identity information, hence the attacks are easily neutralized\nonce the face system is updated or changed. In this paper, we aim to\nreconstruct face images which are capable of transferring face attacks on\nunseen encoders. We term this problem as Face Reconstruction Transfer Attack\n(FRTA) and show that it can be formulated as an out-of-distribution (OOD)\ngeneralization problem. Inspired by its OOD nature, we propose to solve FRTA by\nAveraged Latent Search and Unsupervised Validation with pseudo target (ALSUV).\nTo strengthen the reconstruction attack on OOD unseen encoders, ALSUV\nreconstructs the face by searching the latent of amortized generator StyleGAN2\nthrough multiple latent optimization, latent optimization trajectory averaging,\nand unsupervised validation with a pseudo target. 
We demonstrate the efficacy\nand generalization of our method on widely used face datasets, accompanying it\nwith extensive ablation studies and visual, qualitative, and quantitative\nanalyses. The source code will be released.\n","authors":["Yoon Gyo Jung","Jaewoo Park","Xingbo Dong","Hojin Park","Andrew Beng Jin Teoh","Octavia Camps"],"pdf_url":"https://arxiv.org/pdf/2407.02403v1.pdf","comment":"Accepted to ECCV2024"},{"id":"http://arxiv.org/abs/2407.02398v1","updated":"2024-07-02T16:15:37Z","published":"2024-07-02T16:15:37Z","title":"Consistency Flow Matching: Defining Straight Flows with Velocity\n Consistency","summary":" Flow matching (FM) is a general framework for defining probability paths via\nOrdinary Differential Equations (ODEs) to transform between noise and data\nsamples. Recent approaches attempt to straighten these flow trajectories to\ngenerate high-quality samples with fewer function evaluations, typically\nthrough iterative rectification methods or optimal transport solutions. In this\npaper, we introduce Consistency Flow Matching (Consistency-FM), a novel FM\nmethod that explicitly enforces self-consistency in the velocity field.\nConsistency-FM directly defines straight flows starting from different times to\nthe same endpoint, imposing constraints on their velocity values. Additionally,\nwe propose a multi-segment training approach for Consistency-FM to enhance\nexpressiveness, achieving a better trade-off between sampling quality and\nspeed. Preliminary experiments demonstrate that our Consistency-FM\nsignificantly improves training efficiency by converging 4.4x faster than\nconsistency models and 1.7x faster than rectified flow models while achieving\nbetter generation quality. Our code is available at:\nhttps://github.com/YangLing0818/consistency_flow_matching\n","authors":["Ling Yang","Zixiang Zhang","Zhilong Zhang","Xingchao Liu","Minkai Xu","Wentao Zhang","Chenlin Meng","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2407.02398v1.pdf","comment":"Code: https://github.com/YangLing0818/consistency_flow_matching"},{"id":"http://arxiv.org/abs/2407.02394v1","updated":"2024-07-02T16:12:37Z","published":"2024-07-02T16:12:37Z","title":"Similarity Distance-Based Label Assignment for Tiny Object Detection","summary":" Tiny object detection is becoming one of the most challenging tasks in\ncomputer vision because of the limited object size and lack of information. The\nlabel assignment strategy is a key factor affecting the accuracy of object\ndetection. Although there are some effective label assignment strategies for\ntiny objects, most of them focus on reducing the sensitivity to the bounding\nboxes to increase the number of positive samples and have some fixed\nhyperparameters that need to be set. However, more positive samples may not necessarily\nlead to better detection results; in fact, excessive positive samples may lead\nto more false positives. In this paper, we introduce a simple but effective\nstrategy named the Similarity Distance (SimD) to evaluate the similarity\nbetween bounding boxes. This proposed strategy not only considers both location\nand shape similarity but also learns hyperparameters adaptively, ensuring that\nit can adapt to different datasets and various object sizes in a dataset. Our\napproach can be simply applied in common anchor-based detectors in place of the\nIoU for label assignment and Non Maximum Suppression (NMS). 
Extensive\nexperiments on four mainstream tiny object detection datasets demonstrate the\nsuperior performance of our method; in particular, it surpasses the\nstate-of-the-art competitors on AI-TOD by 1.8 AP points overall and 4.1 AP points\non very tiny objects. Code is\navailable at: \\url{https://github.com/cszzshi/SimD}.\n","authors":["Shuohao Shi","Qiang Fang","Tong Zhao","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02394v1.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.02392v1","updated":"2024-07-02T16:10:55Z","published":"2024-07-02T16:10:55Z","title":"TokenPacker: Efficient Visual Projector for Multimodal LLM","summary":" The visual projector serves as an essential bridge between the visual encoder\nand the Large Language Model (LLM) in a Multimodal LLM (MLLM). Typically, MLLMs\nadopt a simple MLP to preserve all visual contexts via one-to-one\ntransformation. However, the visual tokens are redundant, and their number can\nincrease considerably when dealing with high-resolution images, impairing the\nefficiency of MLLMs significantly. Some recent works have introduced a resampler\nor abstractor to reduce the number of resulting visual tokens. Unfortunately,\nthey fail to capture finer details and undermine the visual reasoning\ncapabilities of MLLMs. In this work, we propose a novel visual projector, which\nadopts a coarse-to-fine scheme to inject enriched characteristics and\ngenerate condensed visual tokens. Specifically, we first interpolate the\nvisual features as a low-resolution point query, providing the overall visual\nrepresentation as the foundation. Then, we introduce a region-to-point\ninjection module that utilizes high-resolution, multi-level region-based cues\nas fine-grained reference keys and values, allowing them to be fully absorbed\nwithin the corresponding local context region. This step effectively updates\nthe coarse point query, transforming it into an enriched one for the subsequent\nLLM reasoning. Extensive experiments demonstrate that our approach compresses\nthe visual tokens by 75%~89%, while achieving comparable or even better\nperformance across diverse benchmarks with significantly higher efficiency. The\nsource code can be found at https://github.com/CircleRadon/TokenPacker.\n","authors":["Wentong Li","Yuqian Yuan","Jian Liu","Dongqi Tang","Song Wang","Jianke Zhu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02392v1.pdf","comment":"16 pages, Codes:https://github.com/CircleRadon/TokenPacker"},{"id":"http://arxiv.org/abs/2403.15603v2","updated":"2024-07-02T16:05:48Z","published":"2024-03-22T20:11:19Z","title":"Forward Learning for Gradient-based Black-box Saliency Map Generation","summary":" Gradient-based saliency maps are widely used to explain deep neural network\ndecisions. However, as models become deeper and more black-box, such as in\nclosed-source APIs like ChatGPT, computing gradients becomes challenging,\nhindering conventional explanation methods. In this work, we introduce a novel\nunified framework for estimating gradients in black-box settings and generating\nsaliency maps to interpret model decisions. We employ the likelihood ratio\nmethod to estimate output-to-input gradients and utilize them for saliency map\ngeneration. Additionally, we propose blockwise computation techniques to\nenhance estimation accuracy. Extensive experiments in black-box settings\nvalidate the effectiveness of our method, demonstrating accurate gradient\nestimation and explainability of generated saliency maps. 
Furthermore, we\nshowcase the scalability of our approach by applying it to explain GPT-Vision,\nrevealing the continued relevance of gradient-based explanation methods in the\nera of large, closed-source, and black-box models.\n","authors":["Zeliang Zhang","Mingqian Feng","Jinyang Jiang","Rongyi Zhu","Yijie Peng","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.15603v2.pdf","comment":"The evaluation is based on small datasets and limited models, of\n which bias leads to misleading conclusions"},{"id":"http://arxiv.org/abs/2309.14277v3","updated":"2024-07-02T16:02:39Z","published":"2023-09-25T16:40:56Z","title":"SINCERE: Supervised Information Noise-Contrastive Estimation REvisited","summary":" The information noise-contrastive estimation (InfoNCE) loss function provides\nthe basis of many self-supervised deep learning methods due to its strong\nempirical results and theoretical motivation. Previous work suggests a supervised\ncontrastive (SupCon) loss to extend InfoNCE to learn from available class\nlabels. This SupCon loss has been widely used due to reports of good empirical\nperformance. However, in this work we find that the prior SupCon loss\nformulation has questionable justification because it can encourage some images\nfrom the same class to repel one another in the learned embedding space. This\nproblematic intra-class repulsion gets worse as the number of images sharing\none class label increases. We propose the Supervised InfoNCE REvisited\n(SINCERE) loss as a theoretically-justified supervised extension of InfoNCE\nthat eliminates intra-class repulsion. Experiments show that SINCERE leads to\nbetter separation of embeddings from different classes and improves transfer\nlearning classification accuracy. We additionally utilize probabilistic\nmodeling to derive an information-theoretic bound that relates the SINCERE loss to\nthe symmetrized KL divergence between data-generating distributions for a\ntarget class and all other classes.\n","authors":["Patrick Feeney","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2309.14277v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02389v1","updated":"2024-07-02T16:02:25Z","published":"2024-07-02T16:02:25Z","title":"SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring\n Expression Segmentation","summary":" Referring Expression Segmentation (RES) aims to provide a segmentation mask\nof the target object in an image referred to by the text (i.e., referring\nexpression). Existing methods require large-scale mask annotations. Moreover,\nsuch approaches do not generalize well to unseen/zero-shot scenarios. To\naddress the aforementioned issues, we propose a weakly-supervised bootstrapping\narchitecture for RES with several new algorithmic innovations. To the best of\nour knowledge, ours is the first approach that considers only a fraction of\nboth mask and box annotations (shown in Figure 1 and Table 1) for training. To\nenable principled training of models in such low-annotation settings, improve\nimage-text region-level alignment, and further enhance spatial localization of\nthe target object in the image, we propose a Cross-modal Fusion with Attention\nConsistency module. For automatic pseudo-labeling of unlabeled samples, we\nintroduce a novel Mask Validity Filtering routine based on a spatially aware\nzero-shot proposal scoring approach. 
Extensive experiments show that with just\n30% of annotations, our model SafaRi achieves 59.31 and 48.26 mIoU on the\nRefCOCO+@testA and RefCOCO+testB datasets, respectively, compared to the 58.93 and\n48.19 mIoU obtained by the fully-supervised SOTA method SeqTR. SafaRi also\noutperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a\nfully-supervised setting and demonstrates strong generalization capabilities in\nunseen/zero-shot tasks.\n","authors":["Sayan Nag","Koustava Goswami","Srikrishna Karanam"],"pdf_url":"https://arxiv.org/pdf/2407.02389v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02387v1","updated":"2024-07-02T16:01:16Z","published":"2024-07-02T16:01:16Z","title":"Real HSI-MSI-PAN image dataset for the\n hyperspectral/multi-spectral/panchromatic image fusion and super-resolution\n fields","summary":" Nowadays, most of the hyperspectral image (HSI) fusion experiments are based\non simulated datasets to compare different fusion methods. However, most of the\nspectral response functions and spatial downsampling functions used to create\nthe simulated datasets are not entirely accurate, resulting in deviations in\nspatial and spectral features between the generated images for fusion and the\nreal images for fusion. This reduces the credibility of the fusion algorithm,\ncausing unfairness in the comparison between different algorithms and hindering\nthe development of the field of hyperspectral image fusion. Therefore, we\nrelease a real HSI/MSI/PAN image dataset to promote the development of the\nfield of hyperspectral image fusion. These three images are spatially\nregistered, meaning fusion can be performed between HSI and MSI, HSI and PAN\nimage, MSI and PAN image, as well as among HSI, MSI, and PAN image. This real\ndataset is available at https://aistudio.baidu.com/datasetdetail/281612.\nThe related code to process the data is available at\nhttps://github.com/rs-lsl/CSSNet.\n","authors":["Shuangliang Li"],"pdf_url":"https://arxiv.org/pdf/2407.02387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02386v1","updated":"2024-07-02T16:00:55Z","published":"2024-07-02T16:00:55Z","title":"OpenSlot: Mixed Open-set Recognition with Object-centric Learning","summary":" Existing open-set recognition (OSR) studies typically assume that each image\ncontains only one class label, and the unknown test set (negative) has a\ndisjoint label space from the known test set (positive), a scenario termed\nfull-label shift. This paper introduces the mixed OSR problem, where test\nimages contain multiple class semantics, with known and unknown classes\nco-occurring in negatives, leading to a more challenging super-label shift.\nAddressing the mixed OSR requires classification models to accurately\ndistinguish different class semantics within images and measure their\n\"knowness\". In this study, we propose the OpenSlot framework, built upon\nobject-centric learning. OpenSlot utilizes slot features to represent diverse\nclass semantics and produce class predictions. Through our proposed\nanti-noise-slot (ANS) technique, we mitigate the impact of noise (invalid and\nbackground) slots during classification training, effectively addressing the\nsemantic misalignment between class predictions and the ground truth. We\nconduct extensive experiments with OpenSlot on mixed & conventional OSR\nbenchmarks. 
Without elaborate designs, OpenSlot not only exceeds existing OSR\nstudies in detecting super-label shifts across single & multi-label mixed OSR\ntasks but also achieves state-of-the-art performance on conventional\nbenchmarks. Remarkably, our method can localize class objects without using\nbounding boxes during training. The competitive performance in open-set object\ndetection demonstrates OpenSlot's ability to explicitly explain label shifts\nand benefits in computational efficiency and generalization.\n","authors":["Xu Yin","Fei Pan","Guoyuan An","Yuchi Huo","Zixuan Xie","Sung-Eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2407.02386v1.pdf","comment":"This study is under IEEE TMM review"},{"id":"http://arxiv.org/abs/2404.01241v3","updated":"2024-07-02T15:53:03Z","published":"2024-04-01T17:00:18Z","title":"StructLDM: Structured Latent Diffusion for 3D Human Generation","summary":" Recent 3D human generative models have achieved remarkable progress by\nlearning 3D-aware GANs from 2D images. However, existing 3D human generative\nmethods model humans in a compact 1D latent space, ignoring the articulated\nstructure and semantics of human body topology. In this paper, we explore more\nexpressive and higher-dimensional latent space for 3D human modeling and\npropose StructLDM, a diffusion-based unconditional 3D human generative model,\nwhich is learned from 2D images. StructLDM solves the challenges imposed due to\nthe high-dimensional growth of latent space with three key designs: 1) A\nsemantic structured latent space defined on the dense surface manifold of a\nstatistical human body template. 2) A structured 3D-aware auto-decoder that\nfactorizes the global latent space into several semantic body parts\nparameterized by a set of conditional structured local NeRFs anchored to the\nbody template, which embeds the properties learned from the 2D training data\nand can be decoded to render view-consistent humans under different poses and\nclothing styles. 3) A structured latent diffusion model for generative human\nappearance sampling. Extensive experiments validate StructLDM's\nstate-of-the-art generation performance and illustrate the expressiveness of\nthe structured latent space over the well-adopted 1D latent space. Notably,\nStructLDM enables different levels of controllable 3D human generation and\nediting, including pose/view/shape control, and high-level tasks including\ncompositional generations, part-aware clothing editing, 3D virtual try-on, etc.\nOur project page is at: https://taohuumd.github.io/projects/StructLDM/.\n","authors":["Tao Hu","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2404.01241v3.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://taohuumd.github.io/projects/StructLDM/"},{"id":"http://arxiv.org/abs/2404.06025v2","updated":"2024-07-02T15:48:49Z","published":"2024-04-09T05:21:32Z","title":"Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs","summary":" Morphing attacks are an emerging threat to state-of-the-art Face Recognition\n(FR) systems, which aim to create a single image that contains the biometric\ninformation of multiple identities. Diffusion Morphs (DiM) are a recently\nproposed morphing attack that has achieved state-of-the-art performance for\nrepresentation-based morphing attacks. 
However, none of the existing research\non DiMs has leveraged the iterative nature of DiMs and left the DiM model as a\nblack box, treating it no differently than one would a Generative Adversarial\nNetwork (GAN) or Variational AutoEncoder (VAE). We propose a greedy strategy on\nthe iterative sampling process of DiM models which searches for an optimal step\nguided by an identity-based heuristic function. We compare our proposed\nalgorithm against ten other state-of-the-art morphing algorithms using the\nopen-source SYN-MAD 2022 competition dataset. We find that our proposed\nalgorithm is unreasonably effective, fooling all of the tested FR systems with\nan MMPMR of 100%, outperforming all other morphing algorithms compared.\n","authors":["Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06025v2.pdf","comment":"Accepted as a conference paper at IJCB 2024"},{"id":"http://arxiv.org/abs/2401.15741v7","updated":"2024-07-02T15:48:30Z","published":"2024-01-28T19:58:19Z","title":"SERNet-Former: Semantic Segmentation by Efficient Residual Network with\n Attention-Boosting Gates and Attention-Fusion Networks","summary":" Improving the efficiency of state-of-the-art methods in semantic segmentation\nrequires overcoming the increasing computational cost as well as issues such as\nfusing semantic information from global and local contexts. Based on the recent\nsuccess and problems that convolutional neural networks (CNNs) encounter in\nsemantic segmentation, this research proposes an encoder-decoder architecture\nwith a unique efficient residual network, Efficient-ResNet. Attention-boosting\ngates (AbGs) and attention-boosting modules (AbMs) are deployed by aiming to\nfuse the equivariant and feature-based semantic information with the equivalent\nsizes of the output of global context of the efficient residual network in the\nencoder. Respectively, the decoder network is developed with the additional\nattention-fusion networks (AfNs) inspired by AbM. AfNs are designed to improve\nthe efficiency in the one-to-one conversion of the semantic information by\ndeploying additional convolution layers in the decoder part. Our network is\ntested on the challenging CamVid and Cityscapes datasets, and the proposed\nmethods reveal significant improvements on the residual networks. To the best\nof our knowledge, the developed network, SERNet-Former, achieves\nstate-of-the-art results (84.62 % mean IoU) on the CamVid dataset and challenging\nresults (87.35 % mean IoU) on the Cityscapes validation dataset.\n","authors":["Serdar Erisen"],"pdf_url":"https://arxiv.org/pdf/2401.15741v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01530v2","updated":"2024-07-02T15:45:25Z","published":"2024-07-01T17:59:54Z","title":"xLSTM-UNet can be an Effective 2D & 3D Medical Image Segmentation\n Backbone with Vision-LSTM (ViL) better than its Mamba Counterpart","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViT) have been\npivotal in biomedical image segmentation, yet their ability to manage\nlong-range dependencies remains constrained by inherent locality and\ncomputational overhead. To overcome these challenges, in this technical report,\nwe first propose xLSTM-UNet, a UNet structured deep learning neural network\nthat leverages Vision-LSTM (xLSTM) as its backbone for medical image\nsegmentation. 
xLSTM was recently proposed as the successor of Long Short-Term\nMemory (LSTM) networks and has demonstrated superior performance compared to\nTransformers and State Space Models (SSMs) like Mamba in Natural Language\nProcessing (NLP) and image classification (as demonstrated in the Vision-LSTM, or\nViL, implementation). Here, the xLSTM-UNet we designed extends this success to the\nbiomedical image segmentation domain. By integrating the local feature\nextraction strengths of convolutional layers with the long-range dependency\ncapturing abilities of xLSTM, xLSTM-UNet offers a robust solution for\ncomprehensive image analysis. We validate the efficacy of xLSTM-UNet through\nexperiments. Our findings demonstrate that xLSTM-UNet consistently surpasses\nthe performance of leading CNN-based, Transformer-based, and Mamba-based\nsegmentation networks on multiple biomedical segmentation datasets, including\norgans in abdominal MRI, instruments in endoscopic images, and cells in\nmicroscopic images. With comprehensive experiments performed, this technical\nreport highlights the potential of xLSTM-based architectures in advancing\nbiomedical image analysis in both 2D and 3D. The code, models, and datasets are\npublicly available at http://tianrun-chen.github.io/xLSTM-UNet/\n","authors":["Tianrun Chen","Chaotao Ding","Lanyun Zhu","Tao Xu","Deyi Ji","Yan Wang","Ying Zang","Zejian Li"],"pdf_url":"https://arxiv.org/pdf/2407.01530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.08388v2","updated":"2024-07-02T15:41:55Z","published":"2022-01-20T06:09:56Z","title":"Steerable Pyramid Transform Enables Robust Left Ventricle Quantification","summary":" Predicting cardiac indices has long been a focal point in the medical imaging\ncommunity. While various deep learning models have demonstrated success in\nquantifying cardiac indices, they remain susceptible to mild input\nperturbations, e.g., spatial transformations, image distortions, and\nadversarial attacks. This vulnerability undermines confidence in using\nlearning-based automated systems for diagnosing cardiovascular diseases. In\nthis work, we describe a simple yet effective method to learn robust models for\nleft ventricle (LV) quantification, encompassing cavity and myocardium areas,\ndirectional dimensions, and regional wall thicknesses. Our success hinges on\nemploying the biologically inspired steerable pyramid transform (SPT) for fixed\nfront-end processing, which offers three main benefits. First, the basis\nfunctions of SPT align with the anatomical structure of LV and the geometric\nfeatures of the measured indices. Second, SPT facilitates weight sharing across\ndifferent orientations as a form of parameter regularization and naturally\ncaptures the scale variations of LV. Third, the residual highpass subband can\nbe conveniently discarded, promoting robust feature learning. 
Extensive\nexperiments on the Cardiac-Dig benchmark show that our SPT-augmented model not\nonly achieves reasonable prediction accuracy compared to state-of-the-art\nmethods, but also exhibits significantly improved robustness against input\nperturbations.\n","authors":["Xiangyang Zhu","Kede Ma","Wufeng Xue"],"pdf_url":"https://arxiv.org/pdf/2201.08388v2.pdf","comment":"Code is available at https://github.com/yangyangyang127/RobustLV"},{"id":"http://arxiv.org/abs/2407.02371v1","updated":"2024-07-02T15:40:29Z","published":"2024-07-02T15:40:29Z","title":"OpenVid-1M: A Large-Scale High-Quality Dataset for Text-to-video\n Generation","summary":" Text-to-video (T2V) generation has recently garnered significant attention\nthanks to the large multi-modality model Sora. However, T2V generation still\nfaces two important challenges: 1) Lacking a precise open sourced high-quality\ndataset. The previous popular video datasets, e.g. WebVid-10M and Panda-70M,\nare either with low quality or too large for most research institutions.\nTherefore, it is challenging but crucial to collect a precise high-quality\ntext-video pairs for T2V generation. 2) Ignoring to fully utilize textual\ninformation. Recent T2V methods have focused on vision transformers, using a\nsimple cross attention module for video generation, which falls short of\nthoroughly extracting semantic information from text prompt. To address these\nissues, we introduce OpenVid-1M, a precise high-quality dataset with expressive\ncaptions. This open-scenario dataset contains over 1 million text-video pairs,\nfacilitating research on T2V generation. Furthermore, we curate 433K 1080p\nvideos from OpenVid-1M to create OpenVidHD-0.4M, advancing high-definition\nvideo generation. Additionally, we propose a novel Multi-modal Video Diffusion\nTransformer (MVDiT) capable of mining both structure information from visual\ntokens and semantic information from text tokens. Extensive experiments and\nablation studies verify the superiority of OpenVid-1M over previous datasets\nand the effectiveness of our MVDiT.\n","authors":["Kepan Nan","Rui Xie","Penghao Zhou","Tiehan Fan","Zhenheng Yang","Zhijie Chen","Xiang Li","Jian Yang","Ying Tai"],"pdf_url":"https://arxiv.org/pdf/2407.02371v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.01392v2","updated":"2024-07-02T15:39:29Z","published":"2024-07-01T15:43:25Z","title":"Diffusion Forcing: Next-token Prediction Meets Full-Sequence Diffusion","summary":" This paper presents Diffusion Forcing, a new training paradigm where a\ndiffusion model is trained to denoise a set of tokens with independent\nper-token noise levels. We apply Diffusion Forcing to sequence generative\nmodeling by training a causal next-token prediction model to generate one or\nseveral future tokens without fully diffusing past ones. Our approach is shown\nto combine the strengths of next-token prediction models, such as\nvariable-length generation, with the strengths of full-sequence diffusion\nmodels, such as the ability to guide sampling to desirable trajectories. Our\nmethod offers a range of additional capabilities, such as (1) rolling-out\nsequences of continuous tokens, such as video, with lengths past the training\nhorizon, where baselines diverge and (2) new sampling and guiding schemes that\nuniquely profit from Diffusion Forcing's variable-horizon and causal\narchitecture, and which lead to marked performance gains in decision-making and\nplanning tasks. 
In addition to its empirical success, our method is proven to\noptimize a variational lower bound on the likelihoods of all subsequences of\ntokens drawn from the true joint distribution. Project website:\nhttps://boyuan.space/diffusion-forcing/\n","authors":["Boyuan Chen","Diego Marti Monso","Yilun Du","Max Simchowitz","Russ Tedrake","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2407.01392v2.pdf","comment":"Project website: https://boyuan.space/diffusion-forcing/"},{"id":"http://arxiv.org/abs/2407.02370v1","updated":"2024-07-02T15:39:08Z","published":"2024-07-02T15:39:08Z","title":"Investigating Event-Based Cameras for Video Frame Interpolation in\n Sports","summary":" Slow-motion replays provide a thrilling perspective on pivotal moments within\nsports games, offering a fresh and captivating visual experience. However,\ncapturing slow-motion footage typically demands high-tech, expensive cameras\nand infrastructures. Deep learning Video Frame Interpolation (VFI) techniques\nhave emerged as a promising avenue, capable of generating high-speed footage\nfrom regular camera feeds. Moreover, the utilization of event-based cameras has\nrecently gathered attention as they provide valuable motion information between\nframes, further enhancing the VFI performances. In this work, we present a\nfirst investigation of event-based VFI models for generating sports slow-motion\nvideos. Particularly, we design and implement a bi-camera recording setup,\nincluding an RGB and an event-based camera to capture sports videos, to\ntemporally align and spatially register both cameras. Our experimental\nvalidation demonstrates that TimeLens, an off-the-shelf event-based VFI model,\ncan effectively generate slow-motion footage for sports videos. This first\ninvestigation underscores the practical utility of event-based cameras in\nproducing sports slow-motion content and lays the groundwork for future\nresearch endeavors in this domain.\n","authors":["Antoine Deckyvere","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2407.02370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19392v2","updated":"2024-07-02T15:30:03Z","published":"2024-06-27T17:59:45Z","title":"ReXTime: A Benchmark Suite for Reasoning-Across-Time in Videos","summary":" We introduce ReXTime, a benchmark designed to rigorously test AI models'\nability to perform temporal reasoning within video events. Specifically,\nReXTime focuses on reasoning across time, i.e. human-like understanding when\nthe question and its corresponding answer occur in different video segments.\nThis form of reasoning, requiring advanced understanding of cause-and-effect\nrelationships across video segments, poses significant challenges to even the\nfrontier multimodal large language models. To facilitate this evaluation, we\ndevelop an automated pipeline for generating temporal reasoning question-answer\npairs, significantly reducing the need for labor-intensive manual annotations.\nOur benchmark includes 921 carefully vetted validation samples and 2,143 test\nsamples, each manually curated for accuracy and relevance. 
Evaluation results\nshow that while frontier large language models outperform academic models, they\nstill lag behind human performance by a significant 14.3% accuracy gap.\nAdditionally, our pipeline creates a training dataset of 9,695 machine\ngenerated samples without manual effort, which empirical studies suggest can\nenhance the across-time reasoning via fine-tuning.\n","authors":["Jr-Jen Chen","Yu-Chien Liao","Hsi-Che Lin","Yu-Chu Yu","Yen-Chun Chen","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19392v2.pdf","comment":"Project page: https://rextime.github.io/"},{"id":"http://arxiv.org/abs/2407.02361v1","updated":"2024-07-02T15:27:33Z","published":"2024-07-02T15:27:33Z","title":"GCF: Graph Convolutional Networks for Facial Expression Recognition","summary":" Facial Expression Recognition (FER) is vital for understanding interpersonal\ncommunication. However, existing classification methods often face challenges\nsuch as vulnerability to noise, imbalanced datasets, overfitting, and\ngeneralization issues. In this paper, we propose GCF, a novel approach that\nutilizes Graph Convolutional Networks for FER. GCF integrates Convolutional\nNeural Networks (CNNs) for feature extraction, using either custom\narchitectures or pretrained models. The extracted visual features are then\nrepresented on a graph, enhancing local CNN features with global features via a\nGraph Convolutional Neural Network layer. We evaluate GCF on benchmark datasets\nincluding CK+, JAFFE, and FERG. The results show that GCF significantly\nimproves performance over state-of-the-art methods. For example, GCF enhances\nthe accuracy of ResNet18 from 92% to 98% on CK+, from 66% to 89% on JAFFE, and\nfrom 94% to 100% on FERG. Similarly, GCF improves the accuracy of VGG16 from\n89% to 97% on CK+, from 72% to 92% on JAFFE, and from 96% to 99.49% on FERG. We\nprovide a comprehensive analysis of our approach, demonstrating its\neffectiveness in capturing nuanced facial expressions. By integrating graph\nconvolutions with CNNs, GCF significantly advances FER, offering improved\naccuracy and robustness in real-world applications.\n","authors":["Hozaifa Kassab","Mohamed Bahaa","Ali Hamdi"],"pdf_url":"https://arxiv.org/pdf/2407.02361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07541v3","updated":"2024-07-02T15:26:28Z","published":"2023-12-12T18:59:40Z","title":"SMERF: Streamable Memory Efficient Radiance Fields for Real-Time\n Large-Scene Exploration","summary":" Recent techniques for real-time view synthesis have rapidly advanced in\nfidelity and speed, and modern methods are capable of rendering\nnear-photorealistic scenes at interactive frame rates. At the same time, a\ntension has arisen between explicit scene representations amenable to\nrasterization and neural fields built on ray marching, with state-of-the-art\ninstances of the latter surpassing the former in quality while being\nprohibitively expensive for real-time applications. In this work, we introduce\nSMERF, a view synthesis approach that achieves state-of-the-art accuracy among\nreal-time methods on large scenes with footprints up to 300 m$^2$ at a\nvolumetric resolution of 3.5 mm$^3$. Our method is built upon two primary\ncontributions: a hierarchical model partitioning scheme, which increases model\ncapacity while constraining compute and memory consumption, and a distillation\ntraining strategy that simultaneously yields high fidelity and internal\nconsistency. 
Our approach enables full six degrees of freedom (6DOF) navigation\nwithin a web browser and renders in real-time on commodity smartphones and\nlaptops. Extensive experiments show that our method exceeds the current\nstate-of-the-art in real-time novel view synthesis by 0.78 dB on standard\nbenchmarks and 1.78 dB on large scenes, renders frames three orders of\nmagnitude faster than state-of-the-art radiance field models, and achieves\nreal-time performance across a wide variety of commodity devices, including\nsmartphones. We encourage readers to explore these models interactively at our\nproject website: https://smerf-3d.github.io.\n","authors":["Daniel Duckworth","Peter Hedman","Christian Reiser","Peter Zhizhin","Jean-François Thibert","Mario Lučić","Richard Szeliski","Jonathan T. Barron"],"pdf_url":"https://arxiv.org/pdf/2312.07541v3.pdf","comment":"Camera Ready. Project website: https://smerf-3d.github.io"},{"id":"http://arxiv.org/abs/2407.02356v1","updated":"2024-07-02T15:21:11Z","published":"2024-07-02T15:21:11Z","title":"Enable the Right to be Forgotten with Federated Client Unlearning in\n Medical Imaging","summary":" The right to be forgotten, as stated in most data regulations, poses an\nunderexplored challenge in federated learning (FL), leading to the development\nof federated unlearning (FU). However, current FU approaches often face\ntrade-offs between efficiency, model performance, forgetting efficacy, and\nprivacy preservation. In this paper, we delve into the paradigm of Federated\nClient Unlearning (FCU) to guarantee a client the right to erase the\ncontribution or the influence, introducing the first FU framework in medical\nimaging. In the unlearning process of a client, the proposed model-contrastive\nunlearning marks a pioneering step towards feature-level unlearning, and\nfrequency-guided memory preservation ensures smooth forgetting of local\nknowledge while maintaining the generalizability of the trained global model,\nthus avoiding performance compromises and guaranteeing rapid post-training. We\nevaluated our FCU framework on two public medical image datasets, including\nIntracranial hemorrhage diagnosis and skin lesion diagnosis, demonstrating that\nour framework outperformed other state-of-the-art FU frameworks, with an\nexpected speed-up of 10-15 times compared with retraining from scratch. The\ncode and the organized datasets can be found at:\nhttps://github.com/dzp2095/FCU.\n","authors":["Zhipeng Deng","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02350v1","updated":"2024-07-02T15:16:06Z","published":"2024-07-02T15:16:06Z","title":"Conceptual Codebook Learning for Vision-Language Models","summary":" In this paper, we propose Conceptual Codebook Learning (CoCoLe), a novel\nfine-tuning method for vision-language models (VLMs) to address the challenge\nof improving the generalization capability of VLMs while fine-tuning them on\ndownstream tasks in a few-shot setting. We recognize that visual concepts, such\nas textures, shapes, and colors are naturally transferable across domains and\nplay a crucial role in generalization tasks. Motivated by this interesting\nfinding, we learn a conceptual codebook consisting of visual concepts as keys\nand conceptual prompts as values, which serves as a link between the image\nencoder's outputs and the text encoder's inputs. 
Specifically, for a given\nimage, we leverage the codebook to identify the most relevant conceptual\nprompts associated with the class embeddings to perform the classification.\nAdditionally, we incorporate a handcrafted concept cache as a regularization to\nalleviate the overfitting issues in low-shot scenarios. We observe that this\nconceptual codebook learning method is able to achieve enhanced alignment\nbetween visual and linguistic modalities. Extensive experimental results\ndemonstrate that our CoCoLe method remarkably outperforms the existing\nstate-of-the-art methods across various evaluation settings, including\nbase-to-new generalization, cross-dataset evaluation, and domain generalization\ntasks. Detailed ablation studies further confirm the efficacy of each component\nin CoCoLe.\n","authors":["Yi Zhang","Ke Yu","Siqi Wu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2407.02350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20436v3","updated":"2024-07-02T15:10:28Z","published":"2023-10-31T13:15:49Z","title":"SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and\n Benchmark","summary":" We present SignAvatars, the first large-scale, multi-prompt 3D sign language\n(SL) motion dataset designed to bridge the communication gap for Deaf and\nhard-of-hearing individuals. While there has been an exponentially growing\nnumber of research regarding digital communication, the majority of existing\ncommunication technologies primarily cater to spoken or written languages,\ninstead of SL, the essential communication method for Deaf and hard-of-hearing\ncommunities. Existing SL datasets, dictionaries, and sign language production\n(SLP) methods are typically limited to 2D as annotating 3D models and avatars\nfor SL is usually an entirely manual and labor-intensive process conducted by\nSL experts, often resulting in unnatural avatars. In response to these\nchallenges, we compile and curate the SignAvatars dataset, which comprises\n70,000 videos from 153 signers, totaling 8.34 million frames, covering both\nisolated signs and continuous, co-articulated signs, with multiple prompts\nincluding HamNoSys, spoken language, and words. To yield 3D holistic\nannotations, including meshes and biomechanically-valid poses of body, hands,\nand face, as well as 2D and 3D keypoints, we introduce an automated annotation\npipeline operating on our large corpus of SL videos. SignAvatars facilitates\nvarious tasks such as 3D sign language recognition (SLR) and the novel 3D SL\nproduction (SLP) from diverse inputs like text scripts, individual words, and\nHamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further\npropose a unified benchmark of 3D SL holistic motion production. 
We believe\nthat this work is a significant step forward towards bringing the digital world\nto the Deaf and hard-of-hearing communities as well as people interacting with\nthem.\n","authors":["Zhengdi Yu","Shaoli Huang","Yongkang Cheng","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.20436v3.pdf","comment":"ECCV2024 14 pages; Project page available at\n https://signavatars.github.io/"},{"id":"http://arxiv.org/abs/2407.02335v1","updated":"2024-07-02T15:05:19Z","published":"2024-07-02T15:05:19Z","title":"CALICO: Confident Active Learning with Integrated Calibration","summary":" The growing use of deep learning in safety-critical applications, such as\nmedical imaging, has raised concerns about limited labeled data, where this\ndemand is amplified as model complexity increases, posing hurdles for domain\nexperts to annotate data. In response to this, active learning (AL) is used to\nefficiently train models with limited annotation costs. In the context of deep\nneural networks (DNNs), AL often uses confidence or probability outputs as a\nscore for selecting the most informative samples. However, modern DNNs exhibit\nunreliable confidence outputs, making calibration essential. We propose an AL\nframework that self-calibrates the confidence used for sample selection during\nthe training process, referred to as Confident Active Learning with Integrated\nCalibratiOn (CALICO). CALICO incorporates the joint training of a classifier\nand an energy-based model, instead of the standard softmax-based classifier.\nThis approach allows for simultaneous estimation of the input data distribution\nand the class probabilities during training, improving calibration without\nneeding an additional labeled dataset. Experimental results showcase improved\nclassification performance compared to a softmax-based classifier with fewer\nlabeled samples. Furthermore, the calibration stability of the model is\nobserved to depend on the prior class distribution of the data.\n","authors":["Lorenzo S. Querol","Hajime Nagahara","Hideaki Hayashi"],"pdf_url":"https://arxiv.org/pdf/2407.02335v1.pdf","comment":"Accepted to ICANN2024"},{"id":"http://arxiv.org/abs/2407.02333v1","updated":"2024-07-02T15:01:55Z","published":"2024-07-02T15:01:55Z","title":"Why do LLaVA Vision-Language Models Reply to Images in English?","summary":" We uncover a surprising multilingual bias occurring in a popular class of\nmultimodal vision-language models (VLMs). Including an image in the query to a\nLLaVA-style VLM significantly increases the likelihood of the model returning\nan English response, regardless of the language of the query. This paper\ninvestigates the causes of this loss with a two-pronged approach that combines\nextensive ablation of the design space with a mechanistic analysis of the\nmodels' internal representations of image and text inputs. Both approaches\nindicate that the issue stems in the language modelling component of the LLaVA\nmodel. Statistically, we find that switching the language backbone for a\nbilingual language model has the strongest effect on reducing this error.\nMechanistically, we provide compelling evidence that visual inputs are not\nmapped to a similar space as text ones, and that intervening on intermediary\nattention layers can reduce this bias. 
Our findings provide important insights\nto researchers and engineers seeking to understand the crossover between\nmultimodal and multilingual spaces, and contribute to the goal of developing\ncapable and inclusive VLMs for non-English contexts.\n","authors":["Musashi Hinck","Carolin Holtermann","Matthew Lyle Olson","Florian Schneider","Sungduk Yu","Anahita Bhiwandiwalla","Anne Lauscher","Shaoyen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2407.02333v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2407.02329v1","updated":"2024-07-02T14:59:37Z","published":"2024-07-02T14:59:37Z","title":"MIGC++: Advanced Multi-Instance Generation Controller for Image\n Synthesis","summary":" We introduce the Multi-Instance Generation (MIG) task, which focuses on\ngenerating multiple instances within a single image, each accurately placed at\npredefined positions with attributes such as category, color, and shape,\nstrictly following user specifications. MIG faces three main challenges:\navoiding attribute leakage between instances, supporting diverse instance\ndescriptions, and maintaining consistency in iterative generation. To address\nattribute leakage, we propose the Multi-Instance Generation Controller (MIGC).\nMIGC generates multiple instances through a divide-and-conquer strategy,\nbreaking down multi-instance shading into single-instance tasks with singular\nattributes, later integrated. To provide more types of instance descriptions,\nwe developed MIGC++. MIGC++ allows attribute control through text \\& images and\nposition control through boxes \\& masks. Lastly, we introduced the\nConsistent-MIG algorithm to enhance the iterative MIG ability of MIGC and\nMIGC++. This algorithm ensures consistency in unmodified regions during the\naddition, deletion, or modification of instances, and preserves the identity of\ninstances when their attributes are changed. We introduce the COCO-MIG and\nMultimodal-MIG benchmarks to evaluate these methods. Extensive experiments on\nthese benchmarks, along with the COCO-Position benchmark and DrawBench,\ndemonstrate that our methods substantially outperform existing techniques,\nmaintaining precise control over aspects including position, attribute, and\nquantity. Project page: https://github.com/limuloo/MIGC.\n","authors":["Dewei Zhou","You Li","Fan Ma","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02315v1","updated":"2024-07-02T14:48:18Z","published":"2024-07-02T14:48:18Z","title":"VFIMamba: Video Frame Interpolation with State Space Models","summary":" Inter-frame modeling is pivotal in generating intermediate frames for video\nframe interpolation (VFI). Current approaches predominantly rely on convolution\nor attention-based models, which often either lack sufficient receptive fields\nor entail significant computational overheads. Recently, Selective State Space\nModels (S6) have emerged, tailored specifically for long sequence modeling,\noffering both linear complexity and data-dependent modeling capabilities. In\nthis paper, we propose VFIMamba, a novel frame interpolation method for\nefficient and dynamic inter-frame modeling by harnessing the S6 model. Our\napproach introduces the Mixed-SSM Block (MSB), which initially rearranges\ntokens from adjacent frames in an interleaved fashion and subsequently applies\nmulti-directional S6 modeling. 
This design facilitates the efficient\ntransmission of information across frames while upholding linear complexity.\nFurthermore, we introduce a novel curriculum learning strategy that\nprogressively cultivates proficiency in modeling inter-frame dynamics across\nvarying motion magnitudes, fully unleashing the potential of the S6 model.\nExperimental findings showcase that our method attains state-of-the-art\nperformance across diverse benchmarks, particularly excelling in\nhigh-resolution scenarios. In particular, on the X-TEST dataset, VFIMamba\ndemonstrates a noteworthy improvement of 0.80 dB for 4K frames and 0.96 dB for\n2K frames.\n","authors":["Guozhen Zhang","Chunxu Liu","Yutao Cui","Xiaotong Zhao","Kai Ma","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02309v1","updated":"2024-07-02T14:44:01Z","published":"2024-07-02T14:44:01Z","title":"Semantically Guided Representation Learning For Action Anticipation","summary":" Action anticipation is the task of forecasting future activity from a\npartially observed sequence of events. However, this task is exposed to\nintrinsic future uncertainty and the difficulty of reasoning upon\ninterconnected actions. Unlike previous works that focus on extrapolating\nbetter visual and temporal information, we concentrate on learning action\nrepresentations that are aware of their semantic interconnectivity based on\nprototypical action patterns and contextual co-occurrences. To this end, we\npropose the novel Semantically Guided Representation Learning (S-GEAR)\nframework. S-GEAR learns visual action prototypes and leverages language models\nto structure their relationship, inducing semanticity. To gather insights on\nS-GEAR's effectiveness, we test it on four action anticipation benchmarks,\nobtaining improved results compared to previous works: +3.5, +2.7, and +3.5\nabsolute points on Top-1 Accuracy on Epic-Kitchen 55, EGTEA Gaze+ and 50\nSalads, respectively, and +0.8 on Top-5 Recall on Epic-Kitchens 100. We further\nobserve that S-GEAR effectively transfers the geometric associations between\nactions from language to visual prototypes. Finally, S-GEAR opens new research\nfrontiers in anticipation tasks by demonstrating the intricate impact of action\nsemantic interconnectivity.\n","authors":["Anxhelo Diko","Danilo Avola","Bardh Prenkaj","Federico Fontana","Luigi Cinque"],"pdf_url":"https://arxiv.org/pdf/2407.02309v1.pdf","comment":"Accepted as a full paper at ECCV'24 with Paper ID #4140"},{"id":"http://arxiv.org/abs/2407.02286v1","updated":"2024-07-02T14:19:51Z","published":"2024-07-02T14:19:51Z","title":"Rethinking Data Augmentation for Robust LiDAR Semantic Segmentation in\n Adverse Weather","summary":" Existing LiDAR semantic segmentation methods often struggle with performance\ndeclines in adverse weather conditions. Previous research has addressed this\nissue by simulating adverse weather or employing universal data augmentation\nduring training. However, these methods lack a detailed analysis and\nunderstanding of how adverse weather negatively affects LiDAR semantic\nsegmentation performance. Motivated by this issue, we identified key factors of\nadverse weather and conducted a toy experiment to pinpoint the main causes of\nperformance degradation: (1) Geometric perturbation due to refraction caused by\nfog or droplets in the air and (2) Point drop due to energy absorption and\nocclusions. Based on these findings, we propose new strategic data augmentation\ntechniques. 
First, we introduced a Selective Jittering (SJ) that jitters points\nin the random range of depth (or angle) to mimic geometric perturbation.\nAdditionally, we developed a Learnable Point Drop (LPD) to learn vulnerable\nerase patterns with Deep Q-Learning Network to approximate the point drop\nphenomenon from adverse weather conditions. Without precise weather simulation,\nthese techniques strengthen the LiDAR semantic segmentation model by exposing\nit to vulnerable conditions identified by our data-centric analysis.\nExperimental results confirmed the suitability of the proposed data\naugmentation methods for enhancing robustness against adverse weather\nconditions. Our method attains a remarkable 39.5 mIoU on the\nSemanticKITTI-to-SemanticSTF benchmark, surpassing the previous\nstate-of-the-art by over 5.4%p, tripling the improvement over the baseline\ncompared to previous methods achieved.\n","authors":["Junsung Park","Kyungmin Kim","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2407.02286v1.pdf","comment":"19 pages, 6 figures, accpeted in ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02283v1","updated":"2024-07-02T14:12:21Z","published":"2024-07-02T14:12:21Z","title":"A Refreshed Similarity-based Upsampler for Direct High-Ratio Feature\n Upsampling","summary":" Feature upsampling is a fundamental and indispensable ingredient of almost\nall current network structures for image segmentation tasks. Recently, a\npopular similarity-based feature upsampling pipeline has been proposed, which\nutilizes a high-resolution feature as guidance to help upsample the\nlow-resolution deep feature based on their local similarity. Albeit achieving\npromising performance, this pipeline has specific limitations: 1) HR query and\nLR key features are not well aligned; 2) the similarity between query-key\nfeatures is computed based on the fixed inner product form; 3) neighbor\nselection is coarsely operated on LR features, resulting in mosaic artifacts.\nThese shortcomings make the existing methods along this pipeline primarily\napplicable to hierarchical network architectures with iterative features as\nguidance and they are not readily extended to a broader range of structures,\nespecially for a direct high-ratio upsampling. Against the issues, we\nmeticulously optimize every methodological design. Specifically, we firstly\npropose an explicitly controllable query-key feature alignment from both\nsemantic-aware and detail-aware perspectives, and then construct a\nparameterized paired central difference convolution block for flexibly\ncalculating the similarity between the well-aligned query-key features.\nBesides, we develop a fine-grained neighbor selection strategy on HR features,\nwhich is simple yet effective for alleviating mosaic artifacts. Based on these\ncareful designs, we systematically construct a refreshed similarity-based\nfeature upsampling framework named ReSFU. 
Extensive experiments substantiate\nthat our proposed ReSFU is finely applicable to various types of architectures\nin a direct high-ratio upsampling manner, and consistently achieves\nsatisfactory performance on different segmentation applications, showing\nsuperior generality and ease of deployment.\n","authors":["Minghao Zhou","Hong Wang","Yefeng Zheng","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2407.02283v1.pdf","comment":"Codes are available at https://github.com/zmhhmz/ReSFU"},{"id":"http://arxiv.org/abs/2407.02280v1","updated":"2024-07-02T14:08:55Z","published":"2024-07-02T14:08:55Z","title":"FedIA: Federated Medical Image Segmentation with Heterogeneous\n Annotation Completeness","summary":" Federated learning has emerged as a compelling paradigm for medical image\nsegmentation, particularly in light of increasing privacy concerns. However,\nmost of the existing research relies on relatively stringent assumptions\nregarding the uniformity and completeness of annotations across clients.\nContrary to this, this paper highlights a prevalent challenge in medical\npractice: incomplete annotations. Such annotations can introduce incorrectly\nlabeled pixels, potentially undermining the performance of neural networks in\nsupervised learning. To tackle this issue, we introduce a novel solution, named\nFedIA. Our insight is to conceptualize incomplete annotations as noisy data\n(\\textit{i.e.}, low-quality data), with a focus on mitigating their adverse\neffects. We begin by evaluating the completeness of annotations at the client\nlevel using a designed indicator. Subsequently, we enhance the influence of\nclients with more comprehensive annotations and implement corrections for\nincomplete ones, thereby ensuring that models are trained on accurate data. Our\nmethod's effectiveness is validated through its superior performance on two\nextensively used medical image segmentation datasets, outperforming existing\nsolutions. The code is available at https://github.com/HUSTxyy/FedIA.\n","authors":["Yangyang Xiang","Nannan Wu","Li Yu","Xin Yang","Kwang-Ting Cheng","Zengqiang Yan"],"pdf_url":"https://arxiv.org/pdf/2407.02280v1.pdf","comment":"Early accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.02272v1","updated":"2024-07-02T14:01:59Z","published":"2024-07-02T14:01:59Z","title":"Aligning Human Motion Generation with Human Perceptions","summary":" Human motion generation is a critical task with a wide range of applications.\nAchieving high realism in generated motions requires naturalness, smoothness,\nand plausibility. Despite rapid advancements in the field, current generation\nmethods often fall short of these goals. Furthermore, existing evaluation\nmetrics typically rely on ground-truth-based errors, simple heuristics, or\ndistribution distances, which do not align well with human perceptions of\nmotion quality. In this work, we propose a data-driven approach to bridge this\ngap by introducing a large-scale human perceptual evaluation dataset,\nMotionPercept, and a human motion critic model, MotionCritic, that capture\nhuman perceptual preferences. Our critic model offers a more accurate metric\nfor assessing motion quality and could be readily integrated into the motion\ngeneration pipeline to enhance generation quality. Extensive experiments\ndemonstrate the effectiveness of our approach in both evaluating and improving\nthe quality of generated human motions by aligning with human perceptions. 
Code\nand data are publicly available at https://motioncritic.github.io/.\n","authors":["Haoru Wang","Wentao Zhu","Luyi Miao","Yishu Xu","Feng Gao","Qi Tian","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02272v1.pdf","comment":"Project page: https://motioncritic.github.io/"},{"id":"http://arxiv.org/abs/2401.09759v2","updated":"2024-07-02T13:43:59Z","published":"2024-01-18T07:19:10Z","title":"SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech\n Recognition","summary":" Audio-visual speech recognition (AVSR) is a multimodal extension of automatic\nspeech recognition (ASR), using video as a complement to audio. In AVSR,\nconsiderable efforts have been directed at datasets for facial features such as\nlip-readings, while they often fall short in evaluating the image comprehension\ncapabilities in broader contexts. In this paper, we construct SlideAVSR, an\nAVSR dataset using scientific paper explanation videos. SlideAVSR provides a\nnew benchmark where models transcribe speech utterances with texts on the\nslides on the presentation recordings. As technical terminologies that are\nfrequent in paper explanations are notoriously challenging to transcribe\nwithout reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR\nproblems. As a simple yet effective baseline, we propose DocWhisper, an AVSR\nmodel that can refer to textual information from slides, and confirm its\neffectiveness on SlideAVSR.\n","authors":["Hao Wang","Shuhei Kurita","Shuichiro Shimizu","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2401.09759v2.pdf","comment":"3rd Workshop on Advances in Language and Vision Research (ALVR 2024)"},{"id":"http://arxiv.org/abs/2407.02264v1","updated":"2024-07-02T13:40:56Z","published":"2024-07-02T13:40:56Z","title":"SOAF: Scene Occlusion-aware Neural Acoustic Field","summary":" This paper tackles the problem of novel view audio-visual synthesis along an\narbitrary trajectory in an indoor scene, given the audio-video recordings from\nother known trajectories of the scene. Existing methods often overlook the\neffect of room geometry, particularly wall occlusion to sound propagation,\nmaking them less accurate in multi-room environments. In this work, we propose\na new approach called Scene Occlusion-aware Acoustic Field (SOAF) for accurate\nsound generation. Our approach derives a prior for sound energy field using\ndistance-aware parametric sound-propagation modelling and then transforms it\nbased on scene transmittance learned from the input video. We extract features\nfrom the local acoustic field centred around the receiver using a Fibonacci\nSphere to generate binaural audio for novel views with a direction-aware\nattention mechanism. Extensive experiments on the real dataset~\\emph{RWAVS} and\nthe synthetic dataset~\\emph{SoundSpaces} demonstrate that our method\noutperforms previous state-of-the-art techniques in audio generation. Project\npage: https://github.com/huiyu-gao/SOAF/.\n","authors":["Huiyu Gao","Jiahao Ma","David Ahmedt-Aristizabal","Chuong Nguyen","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02261v1","updated":"2024-07-02T13:38:06Z","published":"2024-07-02T13:38:06Z","title":"Federated Distillation for Medical Image Classification: Towards\n Trustworthy Computer-Aided Diagnosis","summary":" Medical image classification plays a crucial role in computer-aided clinical\ndiagnosis. 
While deep learning techniques have significantly enhanced\nefficiency and reduced costs, the privacy-sensitive nature of medical imaging\ndata complicates centralized storage and model training. Furthermore,\nlow-resource healthcare organizations face challenges related to communication\noverhead and efficiency due to increasing data and model scales. This paper\nproposes a novel privacy-preserving medical image classification framework\nbased on federated learning to address these issues, named FedMIC. The\nframework enables healthcare organizations to learn from both global and local\nknowledge, enhancing local representation of private data despite statistical\nheterogeneity. It provides customized models for organizations with diverse\ndata distributions while minimizing communication overhead and improving\nefficiency without compromising performance. Our FedMIC enhances robustness and\npractical applicability under resource-constrained conditions. We demonstrate\nFedMIC's effectiveness using four public medical image datasets for classical\nmedical image classification tasks.\n","authors":["Sufen Ren","Yule Hu","Shengchao Chen","Guanjun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02261v1.pdf","comment":"work in progress. arXiv admin note: text overlap with\n arXiv:2401.01493"},{"id":"http://arxiv.org/abs/2407.02253v1","updated":"2024-07-02T13:18:15Z","published":"2024-07-02T13:18:15Z","title":"Parameter-Selective Continual Test-Time Adaptation","summary":" Continual Test-Time Adaptation (CTTA) aims to adapt a pretrained model to\never-changing environments during the test time under continuous domain shifts.\nMost existing CTTA approaches are based on the Mean Teacher (MT) structure,\nwhich contains a student and a teacher model, where the student is updated\nusing the pseudo-labels from the teacher model, and the teacher is then updated\nby exponential moving average strategy. However, these methods update the MT\nmodel indiscriminately on all parameters of the model. That is, some critical\nparameters involving sharing knowledge across different domains may be erased,\nintensifying error accumulation and catastrophic forgetting. In this paper, we\nintroduce Parameter-Selective Mean Teacher (PSMT) method, which is capable of\neffectively updating the critical parameters within the MT network under domain\nshifts. First, we introduce a selective distillation mechanism in the student\nmodel, which utilizes past knowledge to regularize novel knowledge, thereby\nmitigating the impact of error accumulation. Second, to avoid catastrophic\nforgetting, in the teacher model, we create a mask through Fisher information\nto selectively update parameters via exponential moving average, with\npreservation measures applied to crucial parameters. Extensive experimental\nresults verify that PSMT outperforms state-of-the-art methods across multiple\nbenchmark datasets. Our code is available at\n\\url{https://github.com/JiaxuTian/PSMT}.\n","authors":["Jiaxu Tian","Fan Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.02253v1.pdf","comment":"17pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.02252v1","updated":"2024-07-02T13:17:49Z","published":"2024-07-02T13:17:49Z","title":"GlyphDraw2: Automatic Generation of Complex Glyph Posters with Diffusion\n Models and Large Language Models","summary":" Posters play a crucial role in marketing and advertising, contributing\nsignificantly to industrial design by enhancing visual communication and brand\nvisibility. 
With recent advances in controllable text-to-image diffusion\nmodels, more concise research is now focusing on rendering text within\nsynthetic images. Despite improvements in text rendering accuracy, the field of\nend-to-end poster generation remains underexplored. This complex task involves\nstriking a balance between text rendering accuracy and automated layout to\nproduce high-resolution images with variable aspect ratios. To tackle this\nchallenge, we propose an end-to-end text rendering framework employing a triple\ncross-attention mechanism rooted in align learning, designed to create precise\nposter text within detailed contextual backgrounds. Additionally, we introduce\na high-resolution dataset that exceeds 1024 pixels in image resolution. Our\napproach leverages the SDXL architecture. Extensive experiments validate the\nability of our method to generate poster images featuring intricate and\ncontextually rich backgrounds. Codes will be available at\nhttps://github.com/OPPO-Mente-Lab/GlyphDraw2.\n","authors":["Jian Ma","Yonglin Deng","Chen Chen","Haonan Lu","Zhenyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02248v1","updated":"2024-07-02T13:12:52Z","published":"2024-07-02T13:12:52Z","title":"EvolBA: Evolutionary Boundary Attack under Hard-label Black Box\n condition","summary":" Research has shown that deep neural networks (DNNs) have vulnerabilities that\ncan lead to the misrecognition of Adversarial Examples (AEs) with specifically\ndesigned perturbations. Various adversarial attack methods have been proposed\nto detect vulnerabilities under hard-label black box (HL-BB) conditions in the\nabsence of loss gradients and confidence scores.However, these methods fall\ninto local solutions because they search only local regions of the search\nspace. Therefore, this study proposes an adversarial attack method named EvolBA\nto generate AEs using Covariance Matrix Adaptation Evolution Strategy (CMA-ES)\nunder the HL-BB condition, where only a class label predicted by the target DNN\nmodel is available. Inspired by formula-driven supervised learning, the\nproposed method introduces domain-independent operators for the initialization\nprocess and a jump that enhances search exploration. Experimental results\nconfirmed that the proposed method could determine AEs with smaller\nperturbations than previous methods in images where the previous methods have\ndifficulty.\n","authors":["Ayane Tajima","Satoshi Ono"],"pdf_url":"https://arxiv.org/pdf/2407.02248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01449v2","updated":"2024-07-02T13:02:58Z","published":"2024-06-27T15:45:29Z","title":"ColPali: Efficient Document Retrieval with Vision Language Models","summary":" Documents are visually rich structures that convey information through text,\nas well as tables, figures, page layouts, or fonts. While modern document\nretrieval systems exhibit strong performance on query-to-text matching, they\nstruggle to exploit visual cues efficiently, hindering their performance on\npractical document retrieval applications such as Retrieval Augmented\nGeneration. To benchmark current systems on visually rich document retrieval,\nwe introduce the Visual Document Retrieval Benchmark ViDoRe, composed of\nvarious page-level retrieving tasks spanning multiple domains, languages, and\nsettings. 
The inherent shortcomings of modern systems motivate the introduction\nof a new retrieval model architecture, ColPali, which leverages the document\nunderstanding capabilities of recent Vision Language Models to produce\nhigh-quality contextualized embeddings solely from images of document pages.\nCombined with a late interaction matching mechanism, ColPali largely\noutperforms modern document retrieval pipelines while being drastically faster\nand end-to-end trainable.\n","authors":["Manuel Faysse","Hugues Sibille","Tony Wu","Bilel Omrani","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2407.01449v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.02241v1","updated":"2024-07-02T13:02:51Z","published":"2024-07-02T13:02:51Z","title":"Sign Language Recognition Based On Facial Expression and Hand Skeleton","summary":" Sign language is a visual language used by the deaf and dumb community to\ncommunicate. However, for most recognition methods based on monocular cameras,\nthe recognition accuracy is low and the robustness is poor. Even if the effect\nis good on some data, it may perform poorly in other data with different\ninterference due to the inability to extract effective features. To solve these\nproblems, we propose a sign language recognition network that integrates\nskeleton features of hands and facial expression. Among this, we propose a hand\nskeleton feature extraction based on coordinate transformation to describe the\nshape of the hand more accurately. Moreover, by incorporating facial expression\ninformation, the accuracy and robustness of sign language recognition are\nfinally improved, which was verified on A Dataset for Argentinian Sign Language\nand SEU's Chinese Sign Language Recognition Database (SEUCSLRD).\n","authors":["Zhiyu Long","Xingyou Liu","Jiaqi Qiao","Zhi Li"],"pdf_url":"https://arxiv.org/pdf/2407.02241v1.pdf","comment":"2023 38th Youth Academic Annual Conference of Chinese Association of\n Automation (YAC)"},{"id":"http://arxiv.org/abs/2407.02229v1","updated":"2024-07-02T12:54:32Z","published":"2024-07-02T12:54:32Z","title":"LaMoD: Latent Motion Diffusion Model For Myocardial Strain Generation","summary":" Motion and deformation analysis of cardiac magnetic resonance (CMR) imaging\nvideos is crucial for assessing myocardial strain of patients with abnormal\nheart functions. Recent advances in deep learning-based image registration\nalgorithms have shown promising results in predicting motion fields from\nroutinely acquired CMR sequences. However, their accuracy often diminishes in\nregions with subtle appearance change, with errors propagating over time.\nAdvanced imaging techniques, such as displacement encoding with stimulated\nechoes (DENSE) CMR, offer highly accurate and reproducible motion data but\nrequire additional image acquisition, which poses challenges in busy clinical\nflows. In this paper, we introduce a novel Latent Motion Diffusion model\n(LaMoD) to predict highly accurate DENSE motions from standard CMR videos. More\nspecifically, our method first employs an encoder from a pre-trained\nregistration network that learns latent motion features (also considered as\ndeformation-based shape features) from image sequences. Supervised by the\nground-truth motion provided by DENSE, LaMoD then leverages a probabilistic\nlatent diffusion model to reconstruct accurate motion from these extracted\nfeatures. 
Experimental results demonstrate that our proposed method, LaMoD,\nsignificantly improves the accuracy of motion analysis in standard CMR images,\nhence improving myocardial strain analysis in clinical settings for cardiac\npatients. Our code will be made publicly available upon acceptance.\n","authors":["Jiarui Xing","Nivetha Jayakumar","Nian Wu","Yu Wang","Frederick H. Epstein","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02228v1","updated":"2024-07-02T12:52:18Z","published":"2024-07-02T12:52:18Z","title":"MTMamba: Enhancing Multi-Task Dense Scene Understanding by Mamba-Based\n Decoders","summary":" Multi-task dense scene understanding, which learns a model for multiple dense\nprediction tasks, has a wide range of application scenarios. Modeling\nlong-range dependency and enhancing cross-task interactions are crucial to\nmulti-task dense prediction. In this paper, we propose MTMamba, a novel\nMamba-based architecture for multi-task scene understanding. It contains two\ntypes of core blocks: the self-task Mamba (STM) block and the cross-task Mamba (CTM)\nblock. STM handles long-range dependency by leveraging Mamba, while CTM\nexplicitly models task interactions to facilitate information exchange across\ntasks. Experiments on the NYUDv2 and PASCAL-Context datasets demonstrate the\nsuperior performance of MTMamba over Transformer-based and CNN-based methods.\nNotably, on the PASCAL-Context dataset, MTMamba achieves improvements of +2.08,\n+5.01, and +4.90 over the previous best method in the tasks of semantic\nsegmentation, human parsing, and object boundary detection, respectively. The\ncode is available at \url{https://github.com/EnVision-Research/MTMamba}.\n","authors":["Baijiong Lin","Weisen Jiang","Pengguang Chen","Yu Zhang","Shu Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02222v1","updated":"2024-07-02T12:41:51Z","published":"2024-07-02T12:41:51Z","title":"Detecting Driver Fatigue With Eye Blink Behavior","summary":" Traffic accidents, causing millions of deaths and billions of dollars in\neconomic losses each year globally, have become a significant issue. One of the\nmain causes of these accidents is drivers being sleepy or fatigued. Recently,\nvarious studies have focused on detecting drivers' sleep/wake states using\ncamera-based solutions that do not require physical contact with the driver,\nthereby enhancing ease of use. In this study, besides the eye blink frequency,\na driver-adaptive eye blink behavior feature set has been evaluated to detect\nfatigue status. The results show that eye blink behavior\ncarries useful information for fatigue detection. The developed image-based\nsystem provides a solution that can work adaptively to the physical\ncharacteristics of the drivers and their positions in the vehicle.\n","authors":["Ali Akin","Habil Kalkan"],"pdf_url":"https://arxiv.org/pdf/2407.02222v1.pdf","comment":"9 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.00438v2","updated":"2024-07-02T12:40:04Z","published":"2024-06-29T13:15:05Z","title":"AI Age Discrepancy: A Novel Parameter for Frailty Assessment in Kidney\n Tumor Patients","summary":" Kidney cancer is a global health concern, and accurate assessment of patient\nfrailty is crucial for optimizing surgical outcomes. 
This paper introduces AI\nAge Discrepancy, a novel metric derived from machine learning analysis of\npreoperative abdominal CT scans, as a potential indicator of frailty and\npostoperative risk in kidney cancer patients. This retrospective study of 599\npatients from the 2023 Kidney Tumor Segmentation (KiTS) challenge dataset found\nthat a higher AI Age Discrepancy is significantly associated with longer\nhospital stays and lower overall survival rates, independent of established\nfactors. This suggests that AI Age Discrepancy may provide valuable insights\ninto patient frailty and could thus inform clinical decision-making in kidney\ncancer treatment.\n","authors":["Rikhil Seshadri","Jayant Siva","Angelica Bartholomew","Clara Goebel","Gabriel Wallerstein-King","Beatriz López Morato","Nicholas Heller","Jason Scovell","Rebecca Campbell","Andrew Wood","Michal Ozery-Flato","Vesna Barros","Maria Gabrani","Michal Rosen-Zvi","Resha Tejpaul","Vidhyalakshmi Ramesh","Nikolaos Papanikolopoulos","Subodh Regmi","Ryan Ward","Robert Abouassaly","Steven C. Campbell","Erick Remer","Christopher Weight"],"pdf_url":"https://arxiv.org/pdf/2407.00438v2.pdf","comment":"10 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.04303v2","updated":"2024-07-02T12:39:46Z","published":"2024-06-06T17:49:21Z","title":"Vision-LSTM: xLSTM as Generic Vision Backbone","summary":" Transformers are widely used as generic backbones in computer vision, despite\ninitially introduced for natural language processing. Recently, the Long\nShort-Term Memory (LSTM) has been extended to a scalable and performant\narchitecture - the xLSTM - which overcomes long-standing LSTM limitations via\nexponential gating and parallelizable matrix memory structure. In this report,\nwe introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to\ncomputer vision. ViL comprises a stack of xLSTM blocks where odd blocks process\nthe sequence of patch tokens from top to bottom while even blocks go from\nbottom to top. Experiments show that ViL holds promise to be further deployed\nas new generic backbone for computer vision architectures.\n","authors":["Benedikt Alkin","Maximilian Beck","Korbinian Pöppel","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2406.04303v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02218v1","updated":"2024-07-02T12:34:17Z","published":"2024-07-02T12:34:17Z","title":"Multi-Modal Video Dialog State Tracking in the Wild","summary":" We present MST-MIXER - a novel video dialog model operating over a generic\nmulti-modal state tracking scheme. Current models that claim to perform\nmulti-modal state tracking fall short of two major aspects: (1) They either\ntrack only one modality (mostly the visual input) or (2) they target synthetic\ndatasets that do not reflect the complexity of real-world in the wild\nscenarios. Our model addresses these two limitations in an attempt to close\nthis crucial research gap. Specifically, MST-MIXER first tracks the most\nimportant constituents of each input modality. Then, it predicts the missing\nunderlying structure of the selected constituents of each modality by learning\nlocal latent graphs using a novel multi-modal graph structure learning method.\nSubsequently, the learned local graphs and features are parsed together to form\na global graph operating on the mix of all modalities which further refines its\nstructure and node embeddings. 
Finally, the fine-grained graph node features\nare used to enhance the hidden states of the backbone Vision-Language Model\n(VLM). MST-MIXER achieves new state-of-the-art results on five challenging\nbenchmarks.\n","authors":["Adnen Abdessaied","Lei Shi","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2407.02218v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2304.04059v2","updated":"2024-07-02T12:28:08Z","published":"2023-04-08T16:12:36Z","title":"Universal Semi-Supervised Learning for Medical Image Classification","summary":" Semi-supervised learning (SSL) has attracted much attention since it reduces\nthe expensive costs of collecting adequate well-labeled training data,\nespecially for deep learning methods. However, traditional SSL is built upon an\nassumption that labeled and unlabeled data should be from the same distribution\n\\textit{e.g.,} classes and domains. However, in practical scenarios, unlabeled\ndata would be from unseen classes or unseen domains, and it is still\nchallenging to exploit them by existing SSL methods. Therefore, in this paper,\nwe proposed a unified framework to leverage these unseen unlabeled data for\nopen-scenario semi-supervised medical image classification. We first design a\nnovel scoring mechanism, called dual-path outliers estimation, to identify\nsamples from unseen classes. Meanwhile, to extract unseen-domain samples, we\nthen apply an effective variational autoencoder (VAE) pre-training. After that,\nwe conduct domain adaptation to fully exploit the value of the detected\nunseen-domain samples to boost semi-supervised training. We evaluated our\nproposed framework on dermatology and ophthalmology tasks. Extensive\nexperiments demonstrate our model can achieve superior classification\nperformance in various medical SSL scenarios. The code implementations are\naccessible at: https://github.com/PyJulie/USSL4MIC.\n","authors":["Lie Ju","Yicheng Wu","Wei Feng","Zhen Yu","Lin Wang","Zhuoting Zhu","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2304.04059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11848v3","updated":"2024-07-02T12:16:31Z","published":"2024-03-18T15:00:38Z","title":"GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object\n Detection","summary":" Integrating LiDAR and camera information into Bird's-Eye-View (BEV)\nrepresentation has emerged as a crucial aspect of 3D object detection in\nautonomous driving. However, existing methods are susceptible to the inaccurate\ncalibration relationship between LiDAR and the camera sensor. Such inaccuracies\nresult in errors in depth estimation for the camera branch, ultimately causing\nmisalignment between LiDAR and camera BEV features. In this work, we propose a\nrobust fusion framework called Graph BEV. Addressing errors caused by\ninaccurate point cloud projection, we introduce a Local Align module that\nemploys neighbor-aware depth features via Graph matching. Additionally, we\npropose a Global Align module to rectify the misalignment between LiDAR and\ncamera BEV features. Our Graph BEV framework achieves state-of-the-art\nperformance, with an mAP of 70.1\\%, surpassing BEV Fusion by 1.6\\% on the\nnuscenes validation set. 
Importantly, our Graph BEV outperforms BEV Fusion by\n8.3\\% under conditions with misalignment noise.\n","authors":["Ziying Song","Lei Yang","Shaoqing Xu","Lin Liu","Dongyang Xu","Caiyan Jia","Feiyang Jia","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.09858v5","updated":"2024-07-02T12:11:39Z","published":"2021-07-21T02:59:59Z","title":"Weighted Intersection over Union (wIoU) for Evaluating Image\n Segmentation","summary":" In recent years, many semantic segmentation methods have been proposed to\npredict label of pixels in the scene. In general, we measure area prediction\nerrors or boundary prediction errors for comparing methods. However, there is\nno intuitive evaluation metric that evaluates both aspects. In this work, we\npropose a new evaluation measure called weighted Intersection over Union (wIoU)\nfor semantic segmentation. First, it builds a weight map generated from a\nboundary distance map, allowing weighted evaluation for each pixel based on a\nboundary importance factor. The proposed wIoU can evaluate both contour and\nregion by setting a boundary importance factor. We validated the effectiveness\nof wIoU on a dataset of 33 scenes and demonstrated its flexibility. Using the\nproposed metric, we expect more flexible and intuitive evaluation in semantic\nsegmentation field are possible.\n","authors":["Yeong-Jun Cho"],"pdf_url":"https://arxiv.org/pdf/2107.09858v5.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.12574v3","updated":"2024-07-02T12:08:55Z","published":"2023-10-19T08:33:23Z","title":"A reproducible 3D convolutional neural network with dual attention\n module (3D-DAM) for Alzheimer's disease classification","summary":" Alzheimer's disease is one of the most common types of neurodegenerative\ndisease, characterized by the accumulation of amyloid-beta plaque and tau\ntangles. Recently, deep learning approaches have shown promise in Alzheimer's\ndisease diagnosis. In this study, we propose a reproducible model that utilizes\na 3D convolutional neural network with a dual attention module for Alzheimer's\ndisease classification. We trained the model in the ADNI database and verified\nthe generalizability of our method in two independent datasets (AIBL and\nOASIS1). Our method achieved state-of-the-art classification performance, with\nan accuracy of 91.94% for MCI progression classification and 96.30% for\nAlzheimer's disease classification on the ADNI dataset. Furthermore, the model\ndemonstrated good generalizability, achieving an accuracy of 86.37% on the AIBL\ndataset and 83.42% on the OASIS1 dataset. These results indicate that our\nproposed approach has competitive performance and generalizability when\ncompared to recent studies in the field.\n","authors":["Gia Minh Hoang","Youngjoo Lee","Jae Gwan Kim"],"pdf_url":"https://arxiv.org/pdf/2310.12574v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02197v1","updated":"2024-07-02T11:56:14Z","published":"2024-07-02T11:56:14Z","title":"Research on Reliable and Safe Occupancy Grid Prediction in Underground\n Parking Lots","summary":" Against the backdrop of advancing science and technology, autonomous vehicle\ntechnology has emerged as a focal point of intense scrutiny within the academic\ncommunity. Nevertheless, the challenge persists in guaranteeing the safety and\nreliability of this technology when navigating intricate scenarios. 
While a\nsubstantial portion of autonomous driving research is dedicated to testing in\nopen-air environments, such as urban roads and highways, where the myriad\nvariables at play are meticulously examined, enclosed indoor spaces like\nunderground parking lots have, to a significant extent, been overlooked in the\nscholarly discourse. This discrepancy highlights a gap in understanding the\nunique challenges these confined settings pose for autonomous navigation\nsystems.\n This study tackles indoor autonomous driving, particularly in overlooked\nspaces like underground parking lots. Using CARLA's simulation platform, a\nrealistic parking model is created for data gathering. An occupancy grid\nnetwork then processes this data to predict vehicle paths and obstacles,\nenhancing the system's perception in complex indoor environments. Ultimately,\nthis strategy improves safety in autonomous parking operations. The paper\nmeticulously evaluates the model's predictive capabilities, validating its\nefficacy in the context of underground parking. Our findings confirm that the\nproposed strategy successfully enhances autonomous vehicle performance in these\ncomplex indoor settings. It equips autonomous systems with improved adaptation\nto underground lots, reinforcing safety measures and dependability. This work\npaves the way for future advancements and applications by addressing the\nresearch shortfall concerning indoor parking environments, serving as a pivotal\nreference point.\n","authors":["JiaQi Luo"],"pdf_url":"https://arxiv.org/pdf/2407.02197v1.pdf","comment":"15 pages, 19 figures"},{"id":"http://arxiv.org/abs/2407.02188v1","updated":"2024-07-02T11:46:07Z","published":"2024-07-02T11:46:07Z","title":"Structure-Aware Consensus Network on Graphs with Few Labeled Nodes","summary":" Graph node classification with few labeled nodes presents significant\nchallenges due to limited supervision. Conventional methods often exploit the\ngraph in a transductive learning manner. They fail to effectively utilize the\nabundant unlabeled data and the structural information inherent in graphs. To\naddress these issues, we introduce a Structure-Aware Consensus Network (SACN)\nfrom three perspectives. Firstly, SACN leverages a novel structure-aware\nconsensus learning strategy between two strongly augmented views. The proposed\nstrategy can fully exploit the potentially useful information of the unlabeled\nnodes and the structural information of the entire graph. Secondly, SACN\nuniquely integrates the graph's structural information to achieve\nstrong-to-strong consensus learning, improving the utilization of unlabeled\ndata while maintaining multiview learning. Thirdly, unlike two-branch graph\nneural network-based methods, SACN is designed for multiview feature learning\nwithin a single-branch architecture. Furthermore, a class-aware pseudolabel\nselection strategy helps address class imbalance and achieve effective\nweak-to-strong supervision. 
Extensive experiments on three benchmark datasets\ndemonstrate SACN's superior performance in node classification tasks,\nparticularly at very low label rates, outperforming state-of-the-art methods\nwhile maintaining computational simplicity. The source code is available at\nhttps://github.com/kunzhan/SACN\n","authors":["Shuaike Xu","Xiaolin Zhang","Peng Zhang","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2407.02188v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2407.02187v1","updated":"2024-07-02T11:45:56Z","published":"2024-07-02T11:45:56Z","title":"Virtually Objective Quantification of in vitro Wound Healing Scratch\n Assays with the Segment Anything Model","summary":" The in vitro scratch assay is a widely used assay in cell biology to assess\nthe rate of wound closure related to a variety of therapeutic interventions.\nWhile manual measurement is subjective and vulnerable to intra- and\ninterobserver variability, computer-based tools are theoretically objective,\nbut in practice often contain parameters which are manually adjusted\n(individually per image or data set) and thereby provide a source for\nsubjectivity. Modern deep learning approaches typically require large annotated\ntraining data which complicates instant applicability. In this paper, we make\nuse of the segment anything model, a deep foundation model based on interactive\npoint-prompts, which enables class-agnostic segmentation without tuning the\nnetwork's parameters based on domain specific training data. The proposed\nmethod clearly outperformed a semi-objective baseline method that required\nmanual inspection and, if necessary, adjustment of parameters per image. Even\nthough the point prompts of the proposed approach are theoretically also a\nsource for subjectivity, results attested very low intra- and interobserver\nvariability, even compared to manual segmentation of domain experts.\n","authors":["Katja Löwenstein","Johanna Rehrl","Anja Schuster","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2407.02187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02182v1","updated":"2024-07-02T11:41:12Z","published":"2024-07-02T11:41:12Z","title":"Occlusion-Aware Seamless Segmentation","summary":" Panoramic images can broaden the Field of View (FoV), occlusion-aware\nprediction can deepen the understanding of the scene, and domain adaptation can\ntransfer across viewing domains. In this work, we introduce a novel task,\nOcclusion-Aware Seamless Segmentation (OASS), which simultaneously tackles all\nthese three challenges. For benchmarking OASS, we establish a new\nhuman-annotated dataset for Blending Panoramic Amodal Seamless Segmentation,\ni.e., BlendPASS. Besides, we propose the first solution UnmaskFormer, aiming at\nunmasking the narrow FoV, occlusions, and domain gaps all at once.\nSpecifically, UnmaskFormer includes the crucial designs of Unmasking Attention\n(UA) and Amodal-oriented Mix (AoMix). Our method achieves state-of-the-art\nperformance on the BlendPASS dataset, reaching a remarkable mAPQ of 26.58% and\nmIoU of 43.66%. On public panoramic semantic segmentation datasets, i.e.,\nSynPASS and DensePASS, our method outperforms previous methods and obtains\n45.34% and 48.08% in mIoU, respectively. 
The fresh BlendPASS dataset and our\nsource code will be made publicly available at\nhttps://github.com/yihong-97/OASS.\n","authors":["Yihong Cao","Jiaming Zhang","Hao Shi","Kunyu Peng","Yuhongxuan Zhang","Hui Zhang","Rainer Stiefelhagen","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02182v1.pdf","comment":"Accepted to ECCV 2024. The fresh dataset and the source code will be\n made publicly available at https://github.com/yihong-97/OASS"},{"id":"http://arxiv.org/abs/2407.02174v1","updated":"2024-07-02T11:28:22Z","published":"2024-07-02T11:28:22Z","title":"BeNeRF: Neural Radiance Fields from a Single Blurry Image and Event\n Stream","summary":" Neural implicit representation of visual scenes has attracted a lot of\nattention in recent research of computer vision and graphics. Most prior\nmethods focus on how to reconstruct 3D scene representation from a set of\nimages. In this work, we demonstrate the possibility to recover the neural\nradiance fields (NeRF) from a single blurry image and its corresponding event\nstream. We model the camera motion with a cubic B-Spline in SE(3) space. Both\nthe blurry image and the brightness change within a time interval, can then be\nsynthesized from the 3D scene representation given the 6-DoF poses interpolated\nfrom the cubic B-Spline. Our method can jointly learn both the implicit neural\nscene representation and recover the camera motion by minimizing the\ndifferences between the synthesized data and the real measurements without\npre-computed camera poses from COLMAP. We evaluate the proposed method with\nboth synthetic and real datasets. The experimental results demonstrate that we\nare able to render view-consistent latent sharp images from the learned NeRF\nand bring a blurry image alive in high quality. Code and data are available at\nhttps://github.com/WU-CVGL/BeNeRF.\n","authors":["Wenpu Li","Pian Wan","Peng Wang","Jinhang Li","Yi Zhou","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02174v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02172v1","updated":"2024-07-02T11:26:37Z","published":"2024-07-02T11:26:37Z","title":"RETINA: a hardware-in-the-loop optical facility with reduced optical\n aberrations","summary":" The increasing interest in spacecraft autonomy and the complex tasks to be\naccomplished by the spacecraft raise the need for a trustworthy approach to\nperform Verification & Validation of Guidance, Navigation, and Control\nalgorithms. In the context of autonomous operations, vision-based navigation\nalgorithms have established themselves as effective solutions to determine the\nspacecraft state in orbit with low-cost and versatile sensors. Nevertheless,\ndetailed testing must be performed on ground to understand the algorithm's\nrobustness and performance on flight hardware. Given the impossibility of\ntesting directly on orbit these algorithms, a dedicated simulation framework\nmust be developed to emulate the orbital environment in a laboratory setup.\nThis paper presents the design of a low-aberration optical facility called\nRETINA to perform this task. RETINA is designed to accommodate cameras with\ndifferent characteristics (e.g., sensor size and focal length) while ensuring\nthe correct stimulation of the camera detector. A preliminary design is\nperformed to identify the range of possible components to be used in the\nfacility according to the facility requirements. 
Then, a detailed optical\ndesign is performed in Zemax OpticStudio to optimize the number and\ncharacteristics of the lenses composing the facility's optical systems. The\nfinal design is compared against the preliminary design to show the superiority\nof the optical performance achieved with this approach. This work presents also\na calibration procedure to estimate the misalignment and the centering errors\nin the facility. These estimated parameters are used in a dedicated\ncompensation algorithm, enabling the stimulation of the camera at tens of\narcseconds of precision. Finally, two different applications are presented to\nshow the versatility of RETINA in accommodating different cameras and in\nsimulating different mission scenarios.\n","authors":["Paolo Panicucci","Fabio Ornati","Francesco Topputo"],"pdf_url":"https://arxiv.org/pdf/2407.02172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14953v2","updated":"2024-07-02T11:22:36Z","published":"2024-06-21T08:04:12Z","title":"Deep Imbalanced Regression to Estimate Vascular Age from PPG Data: a\n Novel Digital Biomarker for Cardiovascular Health","summary":" Photoplethysmography (PPG) is emerging as a crucial tool for monitoring human\nhemodynamics, with recent studies highlighting its potential in assessing\nvascular aging through deep learning. However, real-world age distributions are\noften imbalanced, posing significant challenges for deep learning models. In\nthis paper, we introduce a novel, simple, and effective loss function named the\nDist Loss to address deep imbalanced regression tasks. We trained a\none-dimensional convolutional neural network (Net1D) incorporating the Dist\nLoss on the extensive UK Biobank dataset (n=502,389) to estimate vascular age\nfrom PPG signals and validate its efficacy in characterizing cardiovascular\nhealth. The model's performance was validated on a 40% held-out test set,\nachieving state-of-the-art results, especially in regions with small sample\nsizes. Furthermore, we divided the population into three subgroups based on the\ndifference between predicted vascular age and chronological age: less than -10\nyears, between -10 and 10 years, and greater than 10 years. We analyzed the\nrelationship between predicted vascular age and several cardiovascular events\nover a follow-up period of up to 10 years, including death, coronary heart\ndisease, and heart failure. Our results indicate that the predicted vascular\nage has significant potential to reflect an individual's cardiovascular health\nstatus. Our code will be available at https://github.com/Ngk03/AI-vascular-age.\n","authors":["Guangkun Nie","Qinghao Zhao","Gongzheng Tang","Jun Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2406.14953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02165v1","updated":"2024-07-02T11:17:48Z","published":"2024-07-02T11:17:48Z","title":"WildAvatar: Web-scale In-the-wild Video Dataset for 3D Avatar Creation","summary":" Existing human datasets for avatar creation are typically limited to\nlaboratory environments, wherein high-quality annotations (e.g., SMPL\nestimation from 3D scans or multi-view images) can be ideally provided.\nHowever, their annotating requirements are impractical for real-world images or\nvideos, posing challenges toward real-world applications on current avatar\ncreation methods. To this end, we propose the WildAvatar dataset, a web-scale\nin-the-wild human avatar creation dataset extracted from YouTube, with\n$10,000+$ different human subjects and scenes. 
WildAvatar is at least\n$10\\times$ richer than previous datasets for 3D human avatar creation. We\nevaluate several state-of-the-art avatar creation methods on our dataset,\nhighlighting the unexplored challenges in real-world applications on avatar\ncreation. We also demonstrate the potential for generalizability of avatar\ncreation methods, when provided with data at scale. We will publicly release\nour data source links and annotations, to push forward 3D human avatar creation\nand other related fields for real-world applications.\n","authors":["Zihao Huang","ShouKang Hu","Guangcong Wang","Tianqi Liu","Yuhang Zang","Zhiguo Cao","Wei Li","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02159v1","updated":"2024-07-02T11:08:51Z","published":"2024-07-02T11:08:51Z","title":"SparseSSP: 3D Subcellular Structure Prediction from Sparse-View\n Transmitted Light Images","summary":" Traditional fluorescence staining is phototoxic to live cells, slow, and\nexpensive; thus, the subcellular structure prediction (SSP) from transmitted\nlight (TL) images is emerging as a label-free, faster, low-cost alternative.\nHowever, existing approaches utilize 3D networks for one-to-one voxel level\ndense prediction, which necessitates a frequent and time-consuming Z-axis\nimaging process. Moreover, 3D convolutions inevitably lead to significant\ncomputation and GPU memory overhead. Therefore, we propose an efficient\nframework, SparseSSP, predicting fluorescent intensities within the target\nvoxel grid in an efficient paradigm instead of relying entirely on 3D\ntopologies. In particular, SparseSSP makes two pivotal improvements to prior\nworks. First, SparseSSP introduces a one-to-many voxel mapping paradigm, which\npermits the sparse TL slices to reconstruct the subcellular structure.\nSecondly, we propose a hybrid dimensions topology, which folds the Z-axis\ninformation into channel features, enabling the 2D network layers to tackle SSP\nunder low computational cost. We conduct extensive experiments to validate the\neffectiveness and advantages of SparseSSP on diverse sparse imaging ratios, and\nour approach achieves a leading performance compared to pure 3D topologies.\nSparseSSP reduces imaging frequencies compared to previous dense-view SSP\n(i.e., the number of imaging is reduced up to 87.5% at most), which is\nsignificant in visualizing rapid biological dynamics on low-cost devices and\nsamples.\n","authors":["Jintu Zheng","YI Ding","Qizhe Liu","Yi Cao","Ying Hu","Zenan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02159v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2303.04968v4","updated":"2024-07-02T11:07:25Z","published":"2023-03-09T01:03:35Z","title":"Reconstruction of Cardiac Cine MRI Using Motion-Guided Deformable\n Alignment and Multi-Resolution Fusion","summary":" Cardiac cine magnetic resonance imaging (MRI) is one of the important means\nto assess cardiac functions and vascular abnormalities. Mitigating artifacts\narising during image reconstruction and accelerating cardiac cine MRI\nacquisition to obtain high-quality images is important. A novel end-to-end deep\nlearning network is developed to improve cardiac cine MRI reconstruction.\nFirst, a U-Net is adopted to obtain the initial reconstructed images in\nk-space. 
Further to remove the motion artifacts, the motion-guided deformable\nalignment (MGDA) module with second-order bidirectional propagation is\nintroduced to align the adjacent cine MRI frames by maximizing spatial-temporal\ninformation to alleviate motion artifacts. Finally, the multi-resolution fusion\n(MRF) module is designed to correct the blur and artifacts generated from\nalignment operation and obtain the last high-quality reconstructed cardiac\nimages. At an 8$\\times$ acceleration rate, the numerical measurements on the\nACDC dataset are structural similarity index (SSIM) of 78.40%$\\pm$.57%, peak\nsignal-to-noise ratio (PSNR) of 30.46$\\pm$1.22dB, and normalized mean squared\nerror (NMSE) of 0.0468$\\pm$0.0075. On the ACMRI dataset, the results are SSIM\nof 87.65%$\\pm$4.20%, PSNR of 30.04$\\pm$1.18dB, and NMSE of 0.0473$\\pm$0.0072.\nThe proposed method exhibits high-quality results with richer details and fewer\nartifacts for cardiac cine MRI reconstruction on different accelerations.\n","authors":["Xiaoxiang Han","Yang Chen","Qiaohong Liu","Yiman Liu","Keyan Chen","Yuanjie Lin","Weikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.04968v4.pdf","comment":"28 pages, 5 tables, 11 figures"},{"id":"http://arxiv.org/abs/2407.02158v1","updated":"2024-07-02T11:02:19Z","published":"2024-07-02T11:02:19Z","title":"UltraPixel: Advancing Ultra-High-Resolution Image Synthesis to New Peaks","summary":" Ultra-high-resolution image generation poses great challenges, such as\nincreased semantic planning complexity and detail synthesis difficulties,\nalongside substantial training resource demands. We present UltraPixel, a novel\narchitecture utilizing cascade diffusion models to generate high-quality images\nat multiple resolutions (\\textit{e.g.}, 1K to 6K) within a single model, while\nmaintaining computational efficiency. UltraPixel leverages semantics-rich\nrepresentations of lower-resolution images in the later denoising stage to\nguide the whole generation of highly detailed high-resolution images,\nsignificantly reducing complexity. Furthermore, we introduce implicit neural\nrepresentations for continuous upsampling and scale-aware normalization layers\nadaptable to various resolutions. Notably, both low- and high-resolution\nprocesses are performed in the most compact space, sharing the majority of\nparameters with less than 3$\\%$ additional parameters for high-resolution\noutputs, largely enhancing training and inference efficiency. Our model\nachieves fast training with reduced data requirements, producing\nphoto-realistic high-resolution images and demonstrating state-of-the-art\nperformance in extensive experiments.\n","authors":["Jingjing Ren","Wenbo Li","Haoyu Chen","Renjing Pei","Bin Shao","Yong Guo","Long Peng","Fenglong Song","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2407.02158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02157v1","updated":"2024-07-02T10:55:43Z","published":"2024-07-02T10:55:43Z","title":"FineCLIPER: Multi-modal Fine-grained CLIP for Dynamic Facial Expression\n Recognition with AdaptERs","summary":" Dynamic Facial Expression Recognition (DFER) is crucial for understanding\nhuman behavior. However, current methods exhibit limited performance mainly due\nto the scarcity of high-quality data, the insufficient utilization of facial\ndynamics, and the ambiguity of expression semantics, etc. 
To this end, we\npropose a novel framework, named Multi-modal Fine-grained CLIP for Dynamic\nFacial Expression Recognition with AdaptERs (FineCLIPER), incorporating the\nfollowing novel designs: 1) To better distinguish between similar facial\nexpressions, we extend the class labels to textual descriptions from both\npositive and negative aspects, and obtain supervision by calculating the\ncross-modal similarity based on the CLIP model; 2) Our FineCLIPER adopts a\nhierarchical manner to effectively mine useful cues from DFE videos.\nSpecifically, besides directly embedding video frames as input (low semantic\nlevel), we propose to extract the face segmentation masks and landmarks based\non each frame (middle semantic level) and utilize the Multi-modal Large\nLanguage Model (MLLM) to further generate detailed descriptions of facial\nchanges across frames with designed prompts (high semantic level).\nAdditionally, we also adopt Parameter-Efficient Fine-Tuning (PEFT) to enable\nefficient adaptation of large pre-trained models (i.e., CLIP) for this task.\nOur FineCLIPER achieves SOTA performance on the DFEW, FERV39k, and MAFW\ndatasets in both supervised and zero-shot settings with few tunable parameters.\nAnalysis and ablation studies further validate its effectiveness.\n","authors":["Haodong Chen","Haojian Huang","Junhao Dong","Mingzhe Zheng","Dian Shao"],"pdf_url":"https://arxiv.org/pdf/2407.02157v1.pdf","comment":"Project Page: https://haroldchen19.github.io/FineCLIPER-Page/"},{"id":"http://arxiv.org/abs/2303.16884v2","updated":"2024-07-02T10:48:32Z","published":"2023-03-29T17:53:20Z","title":"Instant Photorealistic Neural Radiance Fields Stylization","summary":" We present Instant Neural Radiance Fields Stylization, a novel approach for\nmulti-view image stylization for the 3D scene. Our approach models a neural\nradiance field based on neural graphics primitives, which use a hash\ntable-based position encoder for position embedding. We split the position\nencoder into two parts, the content and style sub-branches, and train the\nnetwork for normal novel view image synthesis with the content and style\ntargets. In the inference stage, we execute AdaIN to the output features of the\nposition encoder, with content and style voxel grid features as reference. With\nthe adjusted features, the stylization of novel view images could be obtained.\nOur method extends the style target from style images to image sets of scenes\nand does not require additional network training for stylization. Given a set\nof images of 3D scenes and a style target(a style image or another set of 3D\nscenes), our method can generate stylized novel views with a consistent\nappearance at various view angles in less than 10 minutes on modern GPU\nhardware. Extensive experimental results demonstrate the validity and\nsuperiority of our method.\n","authors":["Shaoxu Li","Ye Pan"],"pdf_url":"https://arxiv.org/pdf/2303.16884v2.pdf","comment":"Accepted by ICASSP2024. Code:\n https://github.com/lishaoxu1994/Instant-NeRF-Stylization"},{"id":"http://arxiv.org/abs/2407.02150v1","updated":"2024-07-02T10:47:50Z","published":"2024-07-02T10:47:50Z","title":"VRBiom: A New Periocular Dataset for Biometric Applications of HMD","summary":" With advancements in hardware, high-quality HMD devices are being developed\nby numerous companies, driving increased consumer interest in AR, VR, and MR\napplications. In this work, we present a new dataset, called VRBiom, of\nperiocular videos acquired using a Virtual Reality headset. 
The VRBiom,\ntargeted at biometric applications, consists of 900 short videos acquired from\n25 individuals recorded in the NIR spectrum. These 10s long videos have been\ncaptured using the internal tracking cameras of Meta Quest Pro at 72 FPS. To\nencompass real-world variations, the dataset includes recordings under three\ngaze conditions: steady, moving, and partially closed eyes. We have also\nensured an equal split of recordings without and with glasses to facilitate the\nanalysis of eye-wear. These videos, characterized by non-frontal views of the\neye and relatively low spatial resolutions (400 x 400), can be instrumental in\nadvancing state-of-the-art research across various biometric applications. The\nVRBiom dataset can be utilized to evaluate, train, or adapt models for\nbiometric use-cases such as iris and/or periocular recognition and associated\nsub-tasks such as detection and semantic segmentation.\n In addition to data from real individuals, we have included around 1100 PA\nconstructed from 92 PA instruments. These PAIs fall into six categories\nconstructed through combinations of print attacks (real and synthetic\nidentities), fake 3D eyeballs, plastic eyes, and various types of masks and\nmannequins. These PA videos, combined with genuine (bona-fide) data, can be\nutilized to address concerns related to spoofing, which is a significant threat\nif these devices are to be used for authentication.\n The VRBiom dataset is publicly available for research purposes related to\nbiometric applications only.\n","authors":["Ketan Kotwal","Ibrahim Ulucan","Gokhan Ozbulak","Janani Selliah","Sebastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2407.02150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02123v1","updated":"2024-07-02T10:14:00Z","published":"2024-07-02T10:14:00Z","title":"Hybrid Feature Collaborative Reconstruction Network for Few-Shot\n Fine-Grained Image Classification","summary":" Our research focuses on few-shot fine-grained image classification, which\nfaces two major challenges: appearance similarity of fine-grained objects and\nlimited number of samples. To preserve the appearance details of images,\ntraditional feature reconstruction networks usually enhance the representation\nability of key features by spatial feature reconstruction and minimizing the\nreconstruction error. However, we find that relying solely on a single type of\nfeature is insufficient for accurately capturing inter-class differences of\nfine-grained objects in scenarios with limited samples. In contrast, the\nintroduction of channel features provides additional information dimensions,\naiding in better understanding and distinguishing the inter-class differences\nof fine-grained objects. Therefore, in this paper, we design a new Hybrid\nFeature Collaborative Reconstruction Network (HFCR-Net) for few-shot\nfine-grained image classification, which includes a Hybrid Feature Fusion\nProcess (HFFP) and a Hybrid Feature Reconstruction Process (HFRP). In HFRP, we\nfuse the channel features and the spatial features. Through dynamic weight\nadjustment, we aggregate the spatial dependencies between arbitrary two\npositions and the correlations between different channels of each image to\nincrease the inter-class differences. Additionally, we introduce the\nreconstruction of channel dimension in HFRP. 
Through the collaborative\nreconstruction of channel dimension and spatial dimension, the inter-class\ndifferences are further increased in the process of support-to-query\nreconstruction, while the intra-class differences are reduced in the process of\nquery-to-support reconstruction. Ultimately, our extensive experiments on three\nwidely used fine-grained datasets demonstrate the effectiveness and superiority\nof our approach.\n","authors":["Shulei Qiu","Wanqi Yang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05646v3","updated":"2024-07-02T09:56:26Z","published":"2024-01-11T03:47:13Z","title":"Masked Attribute Description Embedding for Cloth-Changing Person\n Re-identification","summary":" Cloth-changing person re-identification (CC-ReID) aims to match persons who\nchange clothes over long periods. The key challenge in CC-ReID is to extract\nclothing-independent features, such as face, hairstyle, body shape, and gait.\nCurrent research mainly focuses on modeling body shape using multi-modal\nbiological features (such as silhouettes and sketches). However, it does not\nfully leverage the personal description information hidden in the original RGB\nimage. Considering that there are certain attribute descriptions which remain\nunchanged after the changing of cloth, we propose a Masked Attribute\nDescription Embedding (MADE) method that unifies personal visual appearance and\nattribute description for CC-ReID. Specifically, handling variable\nclothing-sensitive information, such as color and type, is challenging for\neffective modeling. To address this, we mask the clothing and color information\nin the personal attribute description extracted through an attribute detection\nmodel. The masked attribute description is then connected and embedded into\nTransformer blocks at various levels, fusing it with the low-level to\nhigh-level features of the image. This approach compels the model to discard\nclothing information. Experiments are conducted on several CC-ReID benchmarks,\nincluding PRCC, LTCC, Celeb-reID-light, and LaST. Results demonstrate that MADE\neffectively utilizes attribute description, enhancing cloth-changing person\nre-identification performance, and compares favorably with state-of-the-art\nmethods. The code is available at https://github.com/moon-wh/MADE.\n","authors":["Chunlei Peng","Boyu Wang","Decheng Liu","Nannan Wang","Ruimin Hu","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2401.05646v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07487v2","updated":"2024-07-02T09:55:50Z","published":"2024-06-11T17:27:23Z","title":"GLAD: Towards Better Reconstruction with Global and Local Adaptive\n Diffusion Models for Unsupervised Anomaly Detection","summary":" Diffusion models have shown superior performance on unsupervised anomaly\ndetection tasks. Since trained with normal data only, diffusion models tend to\nreconstruct normal counterparts of test images with certain noises added.\nHowever, these methods treat all potential anomalies equally, which may cause\ntwo main problems. From the global perspective, the difficulty of\nreconstructing images with different anomalies is uneven. Therefore, instead of\nutilizing the same setting for all samples, we propose to predict a particular\ndenoising step for each sample by evaluating the difference between image\ncontents and the priors extracted from diffusion models. 
From the local\nperspective, reconstructing abnormal regions differs from normal areas even in\nthe same image. Theoretically, the diffusion model predicts a noise for each\nstep, typically following a standard Gaussian distribution. However, due to the\ndifference between the anomaly and its potential normal counterpart, the\npredicted noise in abnormal regions will inevitably deviate from the standard\nGaussian distribution. To this end, we propose introducing synthetic abnormal\nsamples in training to encourage the diffusion models to break through the\nlimitation of standard Gaussian distribution, and a spatial-adaptive feature\nfusion scheme is utilized during inference. With the above modifications, we\npropose a global and local adaptive diffusion model (abbreviated to GLAD) for\nunsupervised anomaly detection, which introduces appealing flexibility and\nachieves anomaly-free reconstruction while retaining as much normal information\nas possible. Extensive experiments are conducted on three commonly used anomaly\ndetection datasets (MVTec-AD, MPDD, and VisA) and a printed circuit board\ndataset (PCB-Bank) we integrated, showing the effectiveness of the proposed\nmethod.\n","authors":["Hang Yao","Ming Liu","Haolin Wang","Zhicun Yin","Zifei Yan","Xiaopeng Hong","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2406.07487v2.pdf","comment":"Accepted by ECCV 2024, code and models:\n https://github.com/hyao1/GLAD. Due to the limitation \"The abstract field\n cannot be longer than 1,920 characters\", the abstract here is shorter than\n that in the PDF file"},{"id":"http://arxiv.org/abs/2407.01247v2","updated":"2024-07-02T09:55:26Z","published":"2024-07-01T12:49:55Z","title":"Multi-level Reliable Guidance for Unpaired Multi-view Clustering","summary":" In this paper, we address the challenging problem of unpaired multi-view\nclustering (UMC), aiming to perform effective joint clustering using unpaired\nobserved samples across multiple views. Commonly, traditional incomplete\nmulti-view clustering (IMC) methods often depend on paired samples to capture\ncomplementary information between views. However, the strategy becomes\nimpractical in UMC due to the absence of paired samples. Although some\nresearchers have attempted to tackle the issue by preserving consistent cluster\nstructures across views, they frequently neglect the confidence of these\ncluster structures, especially for boundary samples and uncertain cluster\nstructures during the initial training. Therefore, we propose a method called\nMulti-level Reliable Guidance for UMC (MRG-UMC), which leverages multi-level\nclustering to aid in learning a trustworthy cluster structure across\ninner-view, cross-view, and common-view, respectively. Specifically, within\neach view, multi-level clustering fosters a trustworthy cluster structure\nacross different levels and reduces clustering error. In cross-view learning,\nreliable view guidance enhances the confidence of the cluster structures in\nother views. Similarly, within the multi-level framework, the incorporation of\na common view aids in aligning different views, thereby reducing the clustering\nerror and uncertainty of cluster structure. 
Finally, as evidenced by extensive\nexperiments, our method for UMC demonstrates significant efficiency\nimprovements compared to 20 state-of-the-art methods.\n","authors":["Like Xin","Wanqi Yang","Lei Wang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02109v1","updated":"2024-07-02T09:51:56Z","published":"2024-07-02T09:51:56Z","title":"HRSAM: Efficiently Segment Anything in High-Resolution Images","summary":" The Segment Anything Model (SAM) has significantly advanced interactive\nsegmentation but struggles with high-resolution images crucial for\nhigh-precision segmentation. This is primarily due to the quadratic space\ncomplexity of SAM-implemented attention and the length extrapolation issue in\ncommon global attention. This study proposes HRSAM that integrates Flash\nAttention and incorporates Plain, Shifted and newly proposed Cycle-scan Window\n(PSCWin) attention to address these issues. The shifted window attention is\nredesigned with padding to maintain consistent window sizes, enabling effective\nlength extrapolation. The cycle-scan window attention adopts the recently\ndeveloped State Space Models (SSMs) to ensure global information exchange with\nminimal computational overhead. Such window-based attention allows HRSAM to\nperform effective attention computations on scaled input images while\nmaintaining low latency. Moreover, we further propose HRSAM++ that additionally\nemploys a multi-scale strategy to enhance HRSAM's performance. The experiments\non the high-precision segmentation datasets HQSeg44K and DAVIS show that\nhigh-resolution inputs enable the SAM-distilled HRSAM models to outperform the\nteacher model while maintaining lower latency. Compared to the SOTAs, HRSAM\nachieves a 1.56 improvement in interactive segmentation's NoC95 metric with\nonly 31% of the latency. HRSAM++ further enhances the performance, achieving a\n1.63 improvement in NoC95 with just 38% of the latency.\n","authors":["You Huang","Wenbin Lai","Jiayi Ji","Liujuan Cao","Shengchuan Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.02109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02104v1","updated":"2024-07-02T09:43:47Z","published":"2024-07-02T09:43:47Z","title":"Joint-Dataset Learning and Cross-Consistent Regularization for\n Text-to-Motion Retrieval","summary":" Pose-estimation methods enable extracting human motion from common videos in\nthe structured form of 3D skeleton sequences. Despite great application\nopportunities, effective content-based access to such spatio-temporal motion\ndata is a challenging problem. In this paper, we focus on the recently\nintroduced text-motion retrieval tasks, which aim to search for database\nmotions that are the most relevant to a specified natural-language textual\ndescription (text-to-motion) and vice-versa (motion-to-text). Despite recent\nefforts to explore these promising avenues, a primary challenge remains the\ninsufficient data available to train robust text-motion models effectively. To\naddress this issue, we propose to investigate joint-dataset learning - where we\ntrain on multiple text-motion datasets simultaneously - together with the\nintroduction of a Cross-Consistent Contrastive Loss function (CCCL), which\nregularizes the learned text-motion common space by imposing uni-modal\nconstraints that augment the representation ability of the trained network. 
To\nlearn a proper motion representation, we also introduce a transformer-based\nmotion encoder, called MoT++, which employs spatio-temporal attention to\nprocess sequences of skeleton data. We demonstrate the benefits of the proposed\napproaches on the widely-used KIT Motion-Language and HumanML3D datasets. We\nperform detailed experimentation on joint-dataset learning and cross-dataset\nscenarios, showing the effectiveness of each introduced module in a carefully\nconducted ablation study and, in turn, pointing out the limitations of\nstate-of-the-art methods.\n","authors":["Nicola Messina","Jan Sedmidubsky","Fabrizio Falchi","Tomáš Rebok"],"pdf_url":"https://arxiv.org/pdf/2407.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18461v2","updated":"2024-07-02T09:36:26Z","published":"2024-03-27T11:19:34Z","title":"DiffStyler: Diffusion-based Localized Image Style Transfer","summary":" Image style transfer aims to imbue digital imagery with the distinctive\nattributes of style targets, such as colors, brushstrokes, and shapes, whilst\nconcurrently preserving the semantic integrity of the content. Despite the\nadvancements in arbitrary style transfer methods, a prevalent challenge remains\nthe delicate equilibrium between content semantics and style attributes. Recent\ndevelopments in large-scale text-to-image diffusion models have heralded\nunprecedented synthesis capabilities, albeit at the expense of relying on\nextensive and often imprecise textual descriptions to delineate artistic\nstyles. Addressing these limitations, this paper introduces DiffStyler, a novel\napproach that facilitates efficient and precise arbitrary image style transfer.\nAt the core of DiffStyler lies the utilization of a text-to-image Stable Diffusion model-based\nLoRA to encapsulate the essence of style targets. This approach, coupled with\nstrategic cross-LoRA feature and attention injection, guides the style transfer\nprocess. The foundation of our methodology is rooted in the observation that\nLoRA maintains the spatial feature consistency of UNet, a discovery that\nfurther inspired the development of a mask-wise style transfer technique. This\ntechnique employs masks extracted through a pre-trained FastSAM model,\nutilizing mask prompts to facilitate feature fusion during the denoising\nprocess, thereby enabling localized style transfer that preserves the original\nimage's unaffected regions. Moreover, our approach accommodates multiple style\ntargets through the use of corresponding masks. Through extensive\nexperimentation, we demonstrate that DiffStyler surpasses previous methods in\nachieving a more harmonious balance between content preservation and style\nintegration.\n","authors":["Shaoxu Li"],"pdf_url":"https://arxiv.org/pdf/2403.18461v2.pdf","comment":"https://github.com/lishaoxu1994/DiffStyler"},{"id":"http://arxiv.org/abs/2407.02098v1","updated":"2024-07-02T09:33:32Z","published":"2024-07-02T09:33:32Z","title":"DM3D: Distortion-Minimized Weight Pruning for Lossless 3D Object\n Detection","summary":" Applying deep neural networks to 3D point cloud processing has attracted\nincreasing attention due to its advanced performance in many areas, such as\nAR/VR, autonomous driving, and robotics. However, as neural network models and\n3D point clouds expand in size, it becomes a crucial challenge to reduce the\ncomputational and memory overhead to meet latency and energy constraints in\nreal-world applications. 
Although existing approaches have proposed to reduce\nboth computational cost and memory footprint, most of them only address the\nspatial redundancy in inputs, i.e. removing the redundancy of background points\nin 3D data. In this paper, we propose a novel post-training weight pruning\nscheme for 3D object detection that is (1) orthogonal to all existing point\ncloud sparsifying methods, which determines redundant parameters in the\npretrained model that lead to minimal distortion in both locality and\nconfidence (detection distortion); and (2) a universal plug-and-play pruning\nframework that works with arbitrary 3D detection model. This framework aims to\nminimize detection distortion of network output to maximally maintain detection\nprecision, by identifying layer-wise sparsity based on second-order Taylor\napproximation of the distortion. Albeit utilizing second-order information, we\nintroduced a lightweight scheme to efficiently acquire Hessian information, and\nsubsequently perform dynamic programming to solve the layer-wise sparsity.\nExtensive experiments on KITTI, Nuscenes and ONCE datasets demonstrate that our\napproach is able to maintain and even boost the detection precision on pruned\nmodel under noticeable computation reduction (FLOPs). Noticeably, we achieve\nover 3.89x, 3.72x FLOPs reduction on CenterPoint and PVRCNN model,\nrespectively, without mAP decrease, significantly improving the\nstate-of-the-art.\n","authors":["Kaixin Xu","Qingtian Feng","Hao Chen","Zhe Wang","Xue Geng","Xulei Yang","Min Wu","Xiaoli Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2407.02098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01210v5","updated":"2024-07-02T09:31:04Z","published":"2023-10-02T13:55:06Z","title":"Towards Robust Cardiac Segmentation using Graph Convolutional Networks","summary":" Fully automatic cardiac segmentation can be a fast and reproducible method to\nextract clinical measurements from an echocardiography examination. The U-Net\narchitecture is the current state-of-the-art deep learning architecture for\nmedical segmentation and can segment cardiac structures in real-time with\naverage errors comparable to inter-observer variability. However, this\narchitecture still generates large outliers that are often anatomically\nincorrect. This work uses the concept of graph convolutional neural networks\nthat predict the contour points of the structures of interest instead of\nlabeling each pixel. We propose a graph architecture that uses two\nconvolutional rings based on cardiac anatomy and show that this eliminates\nanatomical incorrect multi-structure segmentations on the publicly available\nCAMUS dataset. Additionally, this work contributes with an ablation study on\nthe graph convolutional architecture and an evaluation of clinical measurements\non the clinical HUNT4 dataset. Finally, we propose to use the inter-model\nagreement of the U-Net and the graph network as a predictor of both the input\nand segmentation quality. We show this predictor can detect out-of-distribution\nand unsuitable input images in real-time. 
Source code is available online:\nhttps://github.com/gillesvntnu/GCN_multistructure\n","authors":["Gilles Van De Vyver","Sarina Thomas","Guy Ben-Yosef","Sindre Hellum Olaisen","Håvard Dalen","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2310.01210v5.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.02078v1","updated":"2024-07-02T09:12:54Z","published":"2024-07-02T09:12:54Z","title":"MARLIN: A Cloud Integrated Robotic Solution to Support Intralogistics in\n Retail","summary":" In this paper, we present the service robot MARLIN and its integration with\nthe K4R platform, a cloud system for complex AI applications in retail. At its\ncore, this platform contains so-called semantic digital twins, a semantically\nannotated representation of the retail store. MARLIN continuously exchanges\ndata with the K4R platform, improving the robot's capabilities in perception,\nautonomous navigation, and task planning. We exploit these capabilities in a\nretail intralogistics scenario, specifically by assisting store employees in\nstocking shelves. We demonstrate that MARLIN is able to update the digital\nrepresentation of the retail store by detecting and classifying obstacles,\nautonomously planning and executing replenishment missions, adapting to\nunforeseen changes in the environment, and interacting with store employees.\nExperiments are conducted in simulation, in a laboratory environment, and in a\nreal store. We also describe and evaluate a novel algorithm for autonomous\nnavigation of articulated tractor-trailer systems. The algorithm outperforms\nthe manufacturer's proprietary navigation approach and improves MARLIN's\nnavigation capabilities in confined spaces.\n","authors":["Dennis Mronga","Andreas Bresser","Fabian Maas","Adrian Danzglock","Simon Stelter","Alina Hawkin","Hoang Giang Nguyen","Michael Beetz","Frank Kirchner"],"pdf_url":"https://arxiv.org/pdf/2407.02078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02077v1","updated":"2024-07-02T09:11:17Z","published":"2024-07-02T09:11:17Z","title":"Hierarchical Temporal Context Learning for Camera-based Semantic Scene\n Completion","summary":" Camera-based 3D semantic scene completion (SSC) is pivotal for predicting\ncomplicated 3D layouts with limited 2D image observations. The existing\nmainstream solutions generally leverage temporal information by roughly\nstacking history frames to supplement the current frame, such straightforward\ntemporal modeling inevitably diminishes valid clues and increases learning\ndifficulty. To address this problem, we present HTCL, a novel Hierarchical\nTemporal Context Learning paradigm for improving camera-based semantic scene\ncompletion. The primary innovation of this work involves decomposing temporal\ncontext learning into two hierarchical steps: (a) cross-frame affinity\nmeasurement and (b) affinity-based dynamic refinement. Firstly, to separate\ncritical relevant context from redundant information, we introduce the pattern\naffinity with scale-aware isolation and multiple independent learners for\nfine-grained contextual correspondence modeling. Subsequently, to dynamically\ncompensate for incomplete observations, we adaptively refine the feature\nsampling locations based on initially identified locations with high affinity\nand their neighboring relevant regions. 
Our method ranks $1^{st}$ on the\nSemanticKITTI benchmark and even surpasses LiDAR-based methods in terms of mIoU\non the OpenOccupancy benchmark. Our code is available on\nhttps://github.com/Arlo0o/HTCL.\n","authors":["Bohan Li","Jiajun Deng","Wenyao Zhang","Zhujin Liang","Dalong Du","Xin Jin","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2407.02077v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2312.00592v3","updated":"2024-07-02T09:09:19Z","published":"2023-12-01T13:56:28Z","title":"Tracking Object Positions in Reinforcement Learning: A Metric for\n Keypoint Detection (extended version)","summary":" Reinforcement learning (RL) for robot control typically requires a detailed\nrepresentation of the environment state, including information about\ntask-relevant objects not directly measurable. Keypoint detectors, such as\nspatial autoencoders (SAEs), are a common approach to extracting a\nlow-dimensional representation from high-dimensional image data. SAEs aim at\nspatial features such as object positions, which are often useful\nrepresentations in robotic RL. However, whether an SAE is actually able to\ntrack objects in the scene and thus yields a spatial state representation well\nsuited for RL tasks has rarely been examined due to a lack of established\nmetrics. In this paper, we propose to assess the performance of an SAE instance\nby measuring how well keypoints track ground truth objects in images. We\npresent a computationally lightweight metric and use it to evaluate common\nbaseline SAE architectures on image data from a simulated robot task. We find\nthat common SAEs differ substantially in their spatial extraction capability.\nFurthermore, we validate that SAEs that perform well in our metric achieve\nsuperior performance when used in downstream RL. Thus, our metric is an\neffective and lightweight indicator of RL performance before executing\nexpensive RL training. Building on these insights, we identify three key\nmodifications of SAE architectures to improve tracking performance.\n","authors":["Emma Cramer","Jonas Reiher","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2312.00592v3.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.02075v1","updated":"2024-07-02T09:08:06Z","published":"2024-07-02T09:08:06Z","title":"Label Anything: Multi-Class Few-Shot Semantic Segmentation with Visual\n Prompts","summary":" We present Label Anything, an innovative neural network architecture designed\nfor few-shot semantic segmentation (FSS) that demonstrates remarkable\ngeneralizability across multiple classes with minimal examples required per\nclass. Diverging from traditional FSS methods that predominantly rely on masks\nfor annotating support images, Label Anything introduces varied visual prompts\n-- points, bounding boxes, and masks -- thereby enhancing the framework's\nversatility and adaptability. Unique to our approach, Label Anything is\nengineered for end-to-end training across multi-class FSS scenarios,\nefficiently learning from diverse support set configurations without\nretraining. This approach enables a \"universal\" application to various FSS\nchallenges, ranging from $1$-way $1$-shot to complex $N$-way $K$-shot\nconfigurations while remaining agnostic to the specific number of class\nexamples. This innovative training strategy reduces computational requirements\nand substantially improves the model's adaptability and generalization across\ndiverse segmentation tasks. 
Our comprehensive experimental validation,\nparticularly achieving state-of-the-art results on the COCO-$20^i$ benchmark,\nunderscores Label Anything's robust generalization and flexibility. The source\ncode is publicly available at: https://github.com/pasqualedem/LabelAnything.\n","authors":["Pasquale De Marinis","Nicola Fanelli","Raffaele Scaringi","Emanuele Colonna","Giuseppe Fiameni","Gennaro Vessio","Giovanna Castellano"],"pdf_url":"https://arxiv.org/pdf/2407.02075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02068v1","updated":"2024-07-02T08:58:19Z","published":"2024-07-02T08:58:19Z","title":"LPViT: Low-Power Semi-structured Pruning for Vision Transformers","summary":" Vision transformers have emerged as a promising alternative to convolutional\nneural networks for various image analysis tasks, offering comparable or\nsuperior performance. However, one significant drawback of ViTs is their\nresource-intensive nature, leading to increased memory footprint, computation\ncomplexity, and power consumption. To democratize this high-performance\ntechnology and make it more environmentally friendly, it is essential to\ncompress ViT models, reducing their resource requirements while maintaining\nhigh performance. In this paper, we introduce a new block-structured pruning to\naddress the resource-intensive issue for ViTs, offering a balanced trade-off\nbetween accuracy and hardware acceleration. Unlike unstructured pruning or\nchannel-wise structured pruning, block pruning leverages the block-wise\nstructure of linear layers, resulting in more efficient matrix multiplications.\nTo optimize this pruning scheme, our paper proposes a novel hardware-aware\nlearning objective that simultaneously maximizes speedup and minimizes power\nconsumption during inference, tailored to the block sparsity structure. This\nobjective eliminates the need for empirical look-up tables and focuses solely\non reducing parametrized layer connections. Moreover, our paper provides a\nlightweight algorithm to achieve post-training pruning for ViTs, utilizing\nsecond-order Taylor approximation and empirical optimization to solve the\nproposed hardware-aware objective. Extensive experiments on ImageNet are\nconducted across various ViT architectures, including DeiT-B and DeiT-S,\ndemonstrating competitive performance with other pruning methods and achieving\na remarkable balance between accuracy preservation and power savings.\nEspecially, we achieve up to 3.93x and 1.79x speedups on dedicated hardware and\nGPUs respectively for DeiT-B, and also observe an inference power reduction by\n1.4x on real-world GPUs.\n","authors":["Kaixin Xu","Zhe Wang","Chunyun Chen","Xue Geng","Jie Lin","Xulei Yang","Min Wu","Xiaoli Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2407.02068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v4","updated":"2024-07-02T08:53:18Z","published":"2023-11-30T14:15:39Z","title":"Fixed-length Dense Descriptor for Efficient Fingerprint Matching","summary":" In fingerprint matching, fixed-length descriptors generally offer greater\nefficiency compared to minutiae set, but the recognition accuracy is not as\ngood as that of the latter. Although much progress has been made in deep\nlearning based fixed-length descriptors recently, they often fall short when\ndealing with incomplete or partial fingerprints, diverse fingerprint poses, and\nsignificant background noise. 
In this paper, we propose a three-dimensional\nrepresentation called Fixed-length Dense Descriptor (FDD) for efficient\nfingerprint matching. FDD features great spatial properties, enabling it to\ncapture the spatial relationships of the original fingerprints, thereby\nenhancing interpretability and robustness. Our experiments on various\nfingerprint datasets reveal that FDD outperforms other fixed-length\ndescriptors, especially in matching fingerprints of different areas,\ncross-modal fingerprint matching, and fingerprint matching with background\nnoise.\n","authors":["Zhiyu Pan","Yongjie Duan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v4.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.01299v2","updated":"2024-07-02T08:39:21Z","published":"2024-07-01T13:54:59Z","title":"Preserving Full Degradation Details for Blind Image Super-Resolution","summary":" The performance of image super-resolution relies heavily on the accuracy of\ndegradation information, especially under blind settings. Due to absence of\ntrue degradation models in real-world scenarios, previous methods learn\ndistinct representations by distinguishing different degradations in a batch.\nHowever, the most significant degradation differences may provide shortcuts for\nthe learning of representations such that subtle difference may be discarded.\nIn this paper, we propose an alternative to learn degradation representations\nthrough reproducing degraded low-resolution (LR) images. By guiding the\ndegrader to reconstruct input LR images, full degradation information can be\nencoded into the representations. In addition, we develop an energy distance\nloss to facilitate the learning of the degradation representations by\nintroducing a bounded constraint. Experiments show that our representations can\nextract accurate and highly robust degradation information. Moreover,\nevaluations on both synthetic and real images demonstrate that our ReDSR\nachieves state-of-the-art performance for the blind SR tasks.\n","authors":["Hongda Liu","Longguang Wang","Ye Zhang","Kaiwen Xue","Shunbo Zhou","Yulan Guo"],"pdf_url":"https://arxiv.org/pdf/2407.01299v2.pdf","comment":"18 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.12800v2","updated":"2024-07-02T08:36:58Z","published":"2024-03-19T15:01:18Z","title":"Learning Neural Volumetric Pose Features for Camera Localization","summary":" We introduce a novel neural volumetric pose feature, termed PoseMap, designed\nto enhance camera localization by encapsulating the information between images\nand the associated camera poses. Our framework leverages an Absolute Pose\nRegression (APR) architecture, together with an augmented NeRF module. This\nintegration not only facilitates the generation of novel views to enrich the\ntraining dataset but also enables the learning of effective pose features.\nAdditionally, we extend our architecture for self-supervised online alignment,\nallowing our method to be used and fine-tuned for unlabelled images within a\nunified framework. Experiments demonstrate that our method achieves 14.28% and\n20.51% performance gain on average in indoor and outdoor benchmark scenes,\noutperforming existing APR methods with state-of-the-art accuracy.\n","authors":["Jingyu Lin","Jiaqi Gu","Bojian Wu","Lubin Fan","Renjie Chen","Ligang Liu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2403.12800v2.pdf","comment":"14 pages, 9 figures. Accepted at ECCV 2024. 
Project page is coming\n soon"},{"id":"http://arxiv.org/abs/2402.03307v3","updated":"2024-07-02T08:33:07Z","published":"2024-02-05T18:59:04Z","title":"4D-Rotor Gaussian Splatting: Towards Efficient Novel View Synthesis for\n Dynamic Scenes","summary":" We consider the problem of novel-view synthesis (NVS) for dynamic scenes.\nRecent neural approaches have accomplished exceptional NVS results for static\n3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior\nefforts often encode dynamics by learning a canonical space plus implicit or\nexplicit deformation fields, which struggle in challenging scenarios like\nsudden movements or generating high-fidelity renderings. In this paper, we\nintroduce 4D Gaussian Splatting (4DRotorGS), a novel method that represents\ndynamic scenes with anisotropic 4D XYZT Gaussians, inspired by the success of\n3D Gaussian Splatting in static scenes. We model dynamics at each timestamp by\ntemporally slicing the 4D Gaussians, which naturally compose dynamic 3D\nGaussians and can be seamlessly projected into images. As an explicit\nspatial-temporal representation, 4DRotorGS demonstrates powerful capabilities\nfor modeling complicated dynamics and fine details--especially for scenes with\nabrupt motions. We further implement our temporal slicing and splatting\ntechniques in a highly optimized CUDA acceleration framework, achieving\nreal-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and\n583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions\nshowcase the superior efficiency and effectiveness of 4DRotorGS, which\nconsistently outperforms existing methods both quantitatively and\nqualitatively.\n","authors":["Yuanxing Duan","Fangyin Wei","Qiyu Dai","Yuhang He","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03307v3.pdf","comment":"Proc. SIGGRAPH, 2024"},{"id":"http://arxiv.org/abs/2407.02047v1","updated":"2024-07-02T08:19:48Z","published":"2024-07-02T08:19:48Z","title":"CountFormer: Multi-View Crowd Counting Transformer","summary":" Multi-view counting (MVC) methods have shown their superiority over\nsingle-view counterparts, particularly in situations characterized by heavy\nocclusion and severe perspective distortions. However, hand-crafted heuristic\nfeatures and identical camera layout requirements in conventional MVC methods\nlimit their applicability and scalability in real-world scenarios. In this work,\nwe propose a concise 3D MVC framework called \\textbf{CountFormer} to elevate\nmulti-view image-level features to a scene-level volume representation and\nestimate the 3D density map based on the volume features. By incorporating a\ncamera encoding strategy, CountFormer successfully embeds camera parameters\ninto the volume query and image-level features, enabling it to handle various\ncamera layouts with significant differences. Furthermore, we introduce a feature\nlifting module capitalizing on the attention mechanism to transform image-level\nfeatures into a 3D volume representation for each camera view. Subsequently,\nthe multi-view volume aggregation module attentively aggregates various\nmulti-view volumes to create a comprehensive scene-level volume representation,\nallowing CountFormer to handle images captured by arbitrary dynamic camera\nlayouts. 
The proposed method performs favorably against the state-of-the-art\napproaches across various widely used datasets, demonstrating its greater\nsuitability for real-world deployment compared to conventional MVC frameworks.\n","authors":["Hong Mo","Xiong Zhang","Jianchao Tan","Cheng Yang","Qiong Gu","Bo Hang","Wenqi Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02047v1.pdf","comment":"Accepted By ECCV2024"},{"id":"http://arxiv.org/abs/2407.02040v1","updated":"2024-07-02T08:12:14Z","published":"2024-07-02T08:12:14Z","title":"ScaleDreamer: Scalable Text-to-3D Synthesis with Asynchronous Score\n Distillation","summary":" By leveraging the text-to-image diffusion priors, score distillation can\nsynthesize 3D contents without paired text-3D training data. Instead of\nspending hours of online optimization per text prompt, recent studies have been\nfocused on learning a text-to-3D generative network for amortizing multiple\ntext-3D relations, which can synthesize 3D contents in seconds. However,\nexisting score distillation methods are hard to scale up to a large amount of\ntext prompts due to the difficulties in aligning pretrained diffusion prior\nwith the distribution of rendered images from various text prompts. Current\nstate-of-the-arts such as Variational Score Distillation finetune the\npretrained diffusion model to minimize the noise prediction error so as to\nalign the distributions, which are however unstable to train and will impair\nthe model's comprehension capability to numerous text prompts. Based on the\nobservation that the diffusion models tend to have lower noise prediction\nerrors at earlier timesteps, we propose Asynchronous Score Distillation (ASD),\nwhich minimizes the noise prediction error by shifting the diffusion timestep\nto earlier ones. ASD is stable to train and can scale up to 100k prompts. It\nreduces the noise prediction error without changing the weights of pre-trained\ndiffusion model, thus keeping its strong comprehension capability to prompts.\nWe conduct extensive experiments across different 2D diffusion models,\nincluding Stable Diffusion and MVDream, and text-to-3D generators, including\nHyper-iNGP, 3DConv-Net and Triplane-Transformer. The results demonstrate ASD's\neffectiveness in stable 3D generator training, high-quality 3D content\nsynthesis, and its superior prompt-consistency, especially under large prompt\ncorpus.\n","authors":["Zhiyuan Ma","Yuxiang Wei","Yabin Zhang","Xiangyu Zhu","Zhen Lei","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02040v1.pdf","comment":"Accepted by ECCV 2024. Code available at\n https://github.com/theEricMa/ScaleDreamer"},{"id":"http://arxiv.org/abs/2407.02038v1","updated":"2024-07-02T08:10:37Z","published":"2024-07-02T08:10:37Z","title":"Camera-LiDAR Cross-modality Gait Recognition","summary":" Gait recognition is a crucial biometric identification technique.\nCamera-based gait recognition has been widely applied in both research and\nindustrial fields. LiDAR-based gait recognition has also begun to evolve most\nrecently, due to the provision of 3D structural information. However, in\ncertain applications, cameras fail to recognize persons, such as in low-light\nenvironments and long-distance recognition scenarios, where LiDARs work well.\nOn the other hand, the deployment cost and complexity of LiDAR systems limit\nits wider application. Therefore, it is essential to consider cross-modality\ngait recognition between cameras and LiDARs for a broader range of\napplications. 
In this work, we propose the first cross-modality gait\nrecognition framework between Camera and LiDAR, namely CL-Gait. It employs a\ntwo-stream network for feature embedding of both modalities. This poses a\nchallenging recognition task due to the inherent matching between 3D and 2D\ndata, exhibiting significant modality discrepancy. To align the feature spaces\nof the two modalities, i.e., camera silhouettes and LiDAR points, we propose a\ncontrastive pre-training strategy to mitigate modality discrepancy. To make up\nfor the absence of paired camera-LiDAR data for pre-training, we also introduce\na strategy for generating data on a large scale. This strategy utilizes\nmonocular depth estimated from single RGB images and virtual cameras to\ngenerate pseudo point clouds for contrastive pre-training. Extensive\nexperiments show that the cross-modality gait recognition is very challenging\nbut still contains potential and feasibility with our proposed model and\npre-training strategy. To the best of our knowledge, this is the first work to\naddress cross-modality gait recognition.\n","authors":["Wenxuan Guo","Yingping Liang","Zhiyu Pan","Ziheng Xi","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02034v1","updated":"2024-07-02T08:06:58Z","published":"2024-07-02T08:06:58Z","title":"TrAME: Trajectory-Anchored Multi-View Editing for Text-Guided 3D\n Gaussian Splatting Manipulation","summary":" Despite significant strides in the field of 3D scene editing, current methods\nencounter substantial challenge, particularly in preserving 3D consistency in\nmulti-view editing process. To tackle this challenge, we propose a progressive\n3D editing strategy that ensures multi-view consistency via a\nTrajectory-Anchored Scheme (TAS) with a dual-branch editing mechanism.\nSpecifically, TAS facilitates a tightly coupled iterative process between 2D\nview editing and 3D updating, preventing error accumulation yielded from\ntext-to-image process. Additionally, we explore the relationship between\noptimization-based methods and reconstruction-based methods, offering a unified\nperspective for selecting superior design choice, supporting the rationale\nbehind the designed TAS. We further present a tuning-free View-Consistent\nAttention Control (VCAC) module that leverages cross-view semantic and\ngeometric reference from the source branch to yield aligned views from the\ntarget branch during the editing of 2D views. To validate the effectiveness of\nour method, we analyze 2D examples to demonstrate the improved consistency with\nthe VCAC module. Further extensive quantitative and qualitative results in\ntext-guided 3D scene editing indicate that our method achieves superior editing\nquality compared to state-of-the-art methods. We will make the complete\ncodebase publicly available following the conclusion of the double-blind review\nprocess.\n","authors":["Chaofan Luo","Donglin Di","Yongjia Ma","Zhou Xue","Chen Wei","Xun Yang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09101v2","updated":"2024-07-02T08:06:29Z","published":"2024-01-17T10:06:12Z","title":"PIN-SLAM: LiDAR SLAM Using a Point-Based Implicit Neural Representation\n for Achieving Global Map Consistency","summary":" Accurate and robust localization and mapping are essential components for\nmost autonomous robots. 
In this paper, we propose a SLAM system for building\nglobally consistent maps, called PIN-SLAM, that is based on an elastic and\ncompact point-based implicit neural map representation. Taking range\nmeasurements as input, our approach alternates between incremental learning of\nthe local implicit signed distance field and the pose estimation given the\ncurrent local map using a correspondence-free, point-to-implicit model\nregistration. Our implicit map is based on sparse optimizable neural points,\nwhich are inherently elastic and deformable with the global pose adjustment\nwhen closing a loop. Loops are also detected using the neural point features.\nExtensive experiments validate that PIN-SLAM is robust to various environments\nand versatile to different range sensors such as LiDAR and RGB-D cameras.\nPIN-SLAM achieves pose estimation accuracy better or on par with the\nstate-of-the-art LiDAR odometry or SLAM systems and outperforms the recent\nneural implicit SLAM approaches while maintaining a more consistent, and highly\ncompact implicit map that can be reconstructed as accurate and complete meshes.\nFinally, thanks to the voxel hashing for efficient neural points indexing and\nthe fast implicit map-based registration without closest point association,\nPIN-SLAM can run at the sensor frame rate on a moderate GPU. Codes will be\navailable at: https://github.com/PRBonn/PIN_SLAM.\n","authors":["Yue Pan","Xingguang Zhong","Louis Wiesmann","Thorbjörn Posewsky","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2401.09101v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2407.01525v2","updated":"2024-07-02T07:37:56Z","published":"2024-07-01T17:59:35Z","title":"Empowering 3D Visual Grounding with Reasoning Capabilities","summary":" Although great progress has been made in 3D visual grounding, current models\nstill rely on explicit textual descriptions for grounding and lack the ability\nto reason human intentions from implicit instructions. We propose a new task\ncalled 3D reasoning grounding and introduce a new benchmark ScanReason which\nprovides over 10K question-answer-location pairs from five reasoning types that\nrequire the synerization of reasoning and grounding. We further design our\napproach, ReGround3D, composed of the visual-centric reasoning module empowered\nby Multi-modal Large Language Model (MLLM) and the 3D grounding module to\nobtain accurate object locations by looking back to the enhanced geometry and\nfine-grained details from the 3D scenes. A chain-of-grounding mechanism is\nproposed to further boost the performance with interleaved reasoning and\ngrounding steps during inference. Extensive experiments on the proposed\nbenchmark validate the effectiveness of our proposed approach.\n","authors":["Chenming Zhu","Tai Wang","Wenwei Zhang","Kai Chen","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2407.01525v2.pdf","comment":"Accepted by ECCV 2024. A comprehensive and hierarchical 3D reasoning\n grounding benchmark in the era of foundation models. 
Project page:\n https://zcmax.github.io/projects/ScanReason"},{"id":"http://arxiv.org/abs/2407.02014v1","updated":"2024-07-02T07:35:21Z","published":"2024-07-02T07:35:21Z","title":"Multi-Grained Contrast for Data-Efficient Unsupervised Representation\n Learning","summary":" The existing contrastive learning methods mainly focus on single-grained\nrepresentation learning, e.g., part-level, object-level or scene-level ones,\nthus inevitably neglecting the transferability of representations on other\ngranularity levels. In this paper, we aim to learn multi-grained\nrepresentations, which can effectively describe the image on various\ngranularity levels, thus improving generalization on extensive downstream\ntasks. To this end, we propose a novel Multi-Grained Contrast method (MGC) for\nunsupervised representation learning. Specifically, we construct delicate\nmulti-grained correspondences between positive views and then conduct\nmulti-grained contrast by the correspondences to learn more general\nunsupervised representations.\n Without pretrained on large-scale dataset, our method significantly\noutperforms the existing state-of-the-art methods on extensive downstream\ntasks, including object detection, instance segmentation, scene parsing,\nsemantic segmentation and keypoint detection. Moreover, experimental results\nsupport the data-efficient property and excellent representation\ntransferability of our method. The source code and trained weights are\navailable at \\url{https://github.com/visresearch/mgc}.\n","authors":["Chengchao Shen","Jianzhong Chen","Jianxin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10020v2","updated":"2024-07-02T07:29:04Z","published":"2024-05-16T12:02:02Z","title":"Natural Language Can Help Bridge the Sim2Real Gap","summary":" The main challenge in learning image-conditioned robotic policies is\nacquiring a visual representation conducive to low-level control. Due to the\nhigh dimensionality of the image space, learning a good visual representation\nrequires a considerable amount of visual data. However, when learning in the\nreal world, data is expensive. Sim2Real is a promising paradigm for overcoming\ndata scarcity in the real-world target domain by using a simulator to collect\nlarge amounts of cheap data closely related to the target task. However, it is\ndifficult to transfer an image-conditioned policy from sim to real when the\ndomains are very visually dissimilar. To bridge the sim2real visual gap, we\npropose using natural language descriptions of images as a unifying signal\nacross domains that captures the underlying task-relevant semantics. Our key\ninsight is that if two image observations from different domains are labeled\nwith similar language, the policy should predict similar action distributions\nfor both images. We demonstrate that training the image encoder to predict the\nlanguage description or the distance between descriptions of a sim or real\nimage serves as a useful, data-efficient pretraining step that helps learn a\ndomain-invariant image representation. We can then use this image encoder as\nthe backbone of an IL policy trained simultaneously on a large amount of\nsimulated and a handful of real demonstrations. Our approach outperforms widely\nused prior sim2real methods and strong vision-language pretraining baselines\nlike CLIP and R3M by 25 to 40%. 
See additional videos and materials at\nhttps://robin-lab.cs.utexas.edu/lang4sim2real/.\n","authors":["Albert Yu","Adeline Foote","Raymond Mooney","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2405.10020v2.pdf","comment":"To appear in RSS 2024. Project website at\n https://robin-lab.cs.utexas.edu/lang4sim2real/"},{"id":"http://arxiv.org/abs/2407.02004v1","updated":"2024-07-02T07:22:28Z","published":"2024-07-02T07:22:28Z","title":"SAVE: Segment Audio-Visual Easy way using Segment Anything Model","summary":" The primary aim of Audio-Visual Segmentation (AVS) is to precisely identify\nand locate auditory elements within visual scenes by accurately predicting\nsegmentation masks at the pixel level. Achieving this involves comprehensively\nconsidering data and model aspects to address this task effectively. This study\npresents a lightweight approach, SAVE, which efficiently adapts the pre-trained\nsegment anything model (SAM) to the AVS task. By incorporating an image encoder\nadapter into the transformer blocks to better capture the distinct dataset\ninformation and proposing a residual audio encoder adapter to encode the audio\nfeatures as a sparse prompt, our proposed model achieves effective audio-visual\nfusion and interaction during the encoding stage. Our proposed method\naccelerates the training and inference speed by reducing the input resolution\nfrom 1024 to 256 pixels while achieving higher performance compared with the\nprevious SOTA. Extensive experimentation validates our approach, demonstrating\nthat our proposed model outperforms other SOTA methods significantly. Moreover,\nleveraging the pre-trained model on synthetic data enhances performance on real\nAVSBench data, achieving 84.59 mIoU on the S4 (V1S) subset and 70.28 mIoU on\nthe MS3 (V1M) set with only 256 pixels for input images. This increases up to\n86.16 mIoU on the S4 (V1S) and 70.83 mIoU on the MS3 (V1M) with inputs of 1024\npixels.\n","authors":["Khanh-Binh Nguyen","Chae Jung Park"],"pdf_url":"https://arxiv.org/pdf/2407.02004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01996v1","updated":"2024-07-02T07:10:10Z","published":"2024-07-02T07:10:10Z","title":"ViG-Bias: Visually Grounded Bias Discovery and Mitigation","summary":" The proliferation of machine learning models in critical decision making\nprocesses has underscored the need for bias discovery and mitigation\nstrategies. Identifying the reasons behind a biased system is not\nstraightforward, since in many occasions they are associated with hidden\nspurious correlations which are not easy to spot. Standard approaches rely on\nbias audits performed by analyzing model performance in pre-defined subgroups\nof data samples, usually characterized by common attributes like gender or\nethnicity when it comes to people, or other specific attributes defining\nsemantically coherent groups of images. However, it is not always possible to\nknow a-priori the specific attributes defining the failure modes of visual\nrecognition systems. Recent approaches propose to discover these groups by\nleveraging large vision language models, which enable the extraction of\ncross-modal embeddings and the generation of textual descriptions to\ncharacterize the subgroups where a certain model is underperforming. In this\nwork, we argue that incorporating visual explanations (e.g. heatmaps generated\nvia GradCAM or other approaches) can boost the performance of such bias\ndiscovery and mitigation frameworks. 
To this end, we introduce Visually\nGrounded Bias Discovery and Mitigation (ViG-Bias), a simple yet effective\ntechnique which can be integrated to a variety of existing frameworks to\nimprove both, discovery and mitigation performance. Our comprehensive\nevaluation shows that incorporating visual explanations enhances existing\ntechniques like DOMINO, FACTS and Bias-to-Text, across several challenging\ndatasets, including CelebA, Waterbirds, and NICO++.\n","authors":["Marani Badr-Eddine","Hanini Mohamed","Malayarukil Nihitha","Christodoulidis Stergios","Vakalopoulou Maria","Ferrante Enzo"],"pdf_url":"https://arxiv.org/pdf/2407.01996v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2405.05714v2","updated":"2024-07-02T07:06:15Z","published":"2024-05-08T12:13:40Z","title":"Estimating Noisy Class Posterior with Part-level Labels for Noisy Label\n Learning","summary":" In noisy label learning, estimating noisy class posteriors plays a\nfundamental role for developing consistent classifiers, as it forms the basis\nfor estimating clean class posteriors and the transition matrix. Existing\nmethods typically learn noisy class posteriors by training a classification\nmodel with noisy labels. However, when labels are incorrect, these models may\nbe misled to overemphasize the feature parts that do not reflect the instance\ncharacteristics, resulting in significant errors in estimating noisy class\nposteriors. To address this issue, this paper proposes to augment the\nsupervised information with part-level labels, encouraging the model to focus\non and integrate richer information from various parts. Specifically, our\nmethod first partitions features into distinct parts by cropping instances,\nyielding part-level labels associated with these various parts. Subsequently,\nwe introduce a novel single-to-multiple transition matrix to model the\nrelationship between the noisy and part-level labels, which incorporates\npart-level labels into a classifier-consistent framework. Utilizing this\nframework with part-level labels, we can learn the noisy class posteriors more\nprecisely by guiding the model to integrate information from various parts,\nultimately improving the classification performance. Our method is\ntheoretically sound, while experiments show that it is empirically effective in\nsynthetic and real-world noisy benchmarks.\n","authors":["Rui Zhao","Bin Shi","Jianfei Ruan","Tianze Pan","Bo Dong"],"pdf_url":"https://arxiv.org/pdf/2405.05714v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2407.01987v1","updated":"2024-07-02T06:59:19Z","published":"2024-07-02T06:59:19Z","title":"AHMsys: An Automated HVAC Modeling System for BIM Project","summary":" This paper presents a novel system, named AHMsys, designed to automate the\nprocess of generating 3D Heating, Ventilation, and Air Conditioning (HVAC)\nmodels from 2D Computer-Aided Design (CAD) drawings, a key component of\nBuilding Information Modeling (BIM). By automatically preprocessing and\nextracting essential HVAC object information then creating detailed 3D models,\nour proposed AHMsys significantly reduced the 20 percent work schedule of the\nBIM process in Akila. 
This advancement highlights the essential impact of\nintegrating AI technologies in managing the lifecycle of a digital\nrepresentation of the building.\n","authors":["Long Hoang Dang","Duy-Hung Nguyen","Thai Quang Le","Thinh Truong Nguyen","Clark Mei","Vu Hoang"],"pdf_url":"https://arxiv.org/pdf/2407.01987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01983v1","updated":"2024-07-02T06:41:39Z","published":"2024-07-02T06:41:39Z","title":"SADL: An Effective In-Context Learning Method for Compositional Visual\n QA","summary":" Large vision-language models (LVLMs) offer a novel capability for performing\nin-context learning (ICL) in Visual QA. When prompted with a few demonstrations\nof image-question-answer triplets, LVLMs have demonstrated the ability to\ndiscern underlying patterns and transfer this latent knowledge to answer new\nquestions about unseen images without the need for expensive supervised\nfine-tuning. However, designing effective vision-language prompts, especially\nfor compositional questions, remains poorly understood. Adapting language-only\nICL techniques may not necessarily work because we need to bridge the\nvisual-linguistic semantic gap: Symbolic concepts must be grounded in visual\ncontent, which does not share the syntactic linguistic structures. This paper\nintroduces SADL, a new visual-linguistic prompting framework for the task. SADL\nrevolves around three key components: SAmpling, Deliberation, and\nPseudo-Labeling of image-question pairs. Given an image-question query, we\nsample image-question pairs from the training data that are in semantic\nproximity to the query. To address the compositional nature of questions, the\ndeliberation step decomposes complex questions into a sequence of subquestions.\nFinally, the sequence is progressively annotated one subquestion at a time to\ngenerate a sequence of pseudo-labels. We investigate the behaviors of SADL\nunder OpenFlamingo on large-scale Visual QA datasets, namely GQA, GQA-OOD,\nCLEVR, and CRIC. The evaluation demonstrates the critical roles of sampling in\nthe neighborhood of the image, the decomposition of complex questions, and the\naccurate pairing of the subquestions and labels. These findings do not always\nalign with those found in language-only ICL, suggesting fresh insights in\nvision-language settings.\n","authors":["Long Hoang Dang","Thao Minh Le","Vuong Le","Tu Minh Phuong","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2407.01983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13608v3","updated":"2024-07-02T06:35:51Z","published":"2023-05-23T02:16:14Z","title":"VDD: Varied Drone Dataset for Semantic Segmentation","summary":" Semantic segmentation of drone images is critical for various aerial vision\ntasks as it provides essential semantic details to understand scenes on the\nground. Ensuring high accuracy of semantic segmentation models for drones\nrequires access to diverse, large-scale, and high-resolution datasets, which\nare often scarce in the field of aerial image processing. While existing\ndatasets typically focus on urban scenes and are relatively small, our Varied\nDrone Dataset (VDD) addresses these limitations by offering a large-scale,\ndensely labeled collection of 400 high-resolution images spanning 7 classes.\nThis dataset features various scenes in urban, industrial, rural, and natural\nareas, captured from different camera angles and under diverse lighting\nconditions. 
We also make new annotations to UDD and UAVid, integrating them\nunder VDD annotation standards, to create the Integrated Drone Dataset (IDD).\nWe train seven state-of-the-art models on drone datasets as baselines. It's\nexpected that our dataset will generate considerable interest in drone image\nsegmentation and serve as a foundation for other drone vision tasks. Datasets\nare publicly available at \\href{https://github.com/RussRobin/VDD}{our website}.\n","authors":["Wenxiao Cai","Ke Jin","Jinyan Hou","Cong Guo","Letian Wu","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2305.13608v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01003v2","updated":"2024-07-02T06:11:43Z","published":"2024-07-01T06:35:53Z","title":"Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained\n Models for Medical Images","summary":" Foundation models pre-trained on large-scale data have been widely witnessed\nto achieve success in various natural imaging downstream tasks.\nParameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models\nto new domains by updating only a small portion of parameters in order to\nreduce computational overhead. However, the effectiveness of these PEFT\nmethods, especially in cross-domain few-shot scenarios, e.g., medical image\nanalysis, has not been fully explored. In this work, we facilitate the study of\nthe performance of PEFT when adapting foundation models to medical image\nclassification tasks. Furthermore, to alleviate the limitations of prompt\nintroducing ways and approximation capabilities on Transformer architectures of\nmainstream prompt tuning methods, we propose the Embedded Prompt Tuning (EPT)\nmethod by embedding prompt tokens into the expanded channels. We also find that\nthere are anomalies in the feature space distribution of foundation models\nduring the pre-training process, and prompt tuning can help mitigate this negative\nimpact. To explain this phenomenon, we also introduce a novel perspective to\nunderstand prompt tuning: Prompt tuning is a distribution calibrator. And we\nsupport it by analyzing patch-wise scaling and feature separation operations\ncontained in EPT. Our experiments show that EPT outperforms several\nstate-of-the-art fine-tuning methods by a significant margin on few-shot\nmedical image classification tasks, and completes the fine-tuning process\nwithin highly competitive time, indicating EPT is an effective PEFT method. The\nsource code is available at github.com/zuwenqiang/EPT.\n","authors":["Wenqiang Zu","Shenghao Xie","Qing Zhao","Guoqi Li","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2407.01003v2.pdf","comment":"16 pages, 7 figures. arXiv admin note: text overlap with\n arXiv:2306.09579, arXiv:2203.12119 by other authors"},{"id":"http://arxiv.org/abs/2405.12752v2","updated":"2024-07-02T06:05:24Z","published":"2024-05-21T13:04:10Z","title":"C3L: Content Correlated Vision-Language Instruction Tuning Data\n Generation via Contrastive Learning","summary":" Vision-Language Instruction Tuning (VLIT) is a critical training phase for\nLarge Vision-Language Models (LVLMs). With the improving capabilities of\nopen-source LVLMs, researchers have increasingly turned to generating VLIT data\nby using open-source LVLMs and achieved significant progress. 
However, such\ndata generation approaches are bottlenecked by the following challenges: 1)\nSince multi-modal models tend to be influenced by prior language knowledge,\ndirectly using LVLMs to generate VLIT data would inevitably lead to low content\nrelevance between generated data and images. 2) To improve the ability of the\nmodels to generate VLIT data, previous methods have incorporated an additional\ntraining phase to boost the generative capacity. This process hurts the\ngeneralization of the models to unseen inputs (i.e., \"exposure bias\" problem).\nIn this paper, we propose a new Content Correlated VLIT data generation via\nContrastive Learning (C3L). Specifically, we design a new content relevance\nmodule which enhances the content relevance between VLIT data and images by\ncomputing Image Instruction Correspondence Scores S(I2C). Moreover, a\ncontrastive learning module is introduced to further boost the VLIT data\ngeneration capability of the LVLMs. A large number of automatic measures on\nfour benchmarks show the effectiveness of our method.\n","authors":["Ji Ma","Wei Suo","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.12752v2.pdf","comment":"Accepted by IJCAI-24"},{"id":"http://arxiv.org/abs/2407.01971v1","updated":"2024-07-02T06:02:05Z","published":"2024-07-02T06:02:05Z","title":"Pseudo-Labeling by Multi-Policy Viewfinder Network for Image Cropping","summary":" Automatic image cropping models predict reframing boxes to enhance image\naesthetics. Yet, the scarcity of labeled data hinders the progress of this\ntask. To overcome this limitation, we explore the possibility of utilizing both\nlabeled and unlabeled data together to expand the scale of training data for\nimage cropping models. This idea can be implemented in a pseudo-labeling way:\nproducing pseudo labels for unlabeled data by a teacher model and training a\nstudent model with these pseudo labels. However, the student may learn from\nteacher's mistakes. To address this issue, we propose the multi-policy\nviewfinder network (MPV-Net) that offers diverse refining policies to rectify\nthe mistakes in original pseudo labels from the teacher. The most reliable\npolicy is selected to generate trusted pseudo labels. The reliability of\npolicies is evaluated via the robustness against box jittering. The efficacy of\nour method can be evaluated by the improvement compared to the supervised\nbaseline which only uses labeled data. Notably, our MPV-Net outperforms\noff-the-shelf pseudo-labeling methods, yielding the most substantial\nimprovement over the supervised baseline. Furthermore, our approach achieves\nstate-of-the-art results on both the FCDB and FLMS datasets, signifying the\nsuperiority of our approach.\n","authors":["Zhiyu Pan","Kewei Wang","Yizheng Wu","Liwen Xiao","Jiahao Cui","Zhicheng Wang","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2407.01971v1.pdf","comment":"18 pages, 8figures"},{"id":"http://arxiv.org/abs/2407.01967v1","updated":"2024-07-02T05:51:04Z","published":"2024-07-02T05:51:04Z","title":"Unleash the Power of Local Representations for Few-Shot Classification","summary":" Generalizing to novel classes unseen during training is a key challenge of\nfew-shot classification. Recent metric-based methods try to address this by\nlocal representations. 
However, they are unable to take full advantage of them\ndue to (i) improper supervision for pretraining the feature extractor, and (ii)\nlack of adaptability in the metric for handling various possible compositions\nof local feature sets. In this work, we unleash the power of local\nrepresentations in improving novel-class generalization. For the feature\nextractor, we design a novel pretraining paradigm that learns randomly cropped\npatches by soft labels. It utilizes the class-level diversity of patches while\ndiminishing the impact of their semantic misalignments to hard labels. To align\nnetwork output with soft labels, we also propose a UniCon KL-Divergence that\nemphasizes the equal contribution of each base class in describing \"non-base\"\npatches. For the metric, we formulate measuring local feature sets as an\nentropy-regularized optimal transport problem to introduce the ability to\nhandle sets consisting of homogeneous elements. Furthermore, we design a\nModulate Module to endow the metric with the necessary adaptability. Our method\nachieves new state-of-the-art performance on three popular benchmarks.\nMoreover, it exceeds state-of-the-art transductive and cross-modal methods in\nthe fine-grained scenario.\n","authors":["Shi Tang","Guiming Luo","Xinchen Ye","Zhiyi Xia"],"pdf_url":"https://arxiv.org/pdf/2407.01967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01017v2","updated":"2024-07-02T05:47:02Z","published":"2024-07-01T07:05:44Z","title":"Coding for Intelligence from the Perspective of Category","summary":" Coding, which targets compressing and reconstructing data, and intelligence,\noften regarded at an abstract computational level as being centered around\nmodel learning and prediction, interweave recently to give birth to a series of\nsignificant progress. The recent trends demonstrate the potential homogeneity\nof these two fields, especially when deep-learning models aid these two\ncategories for better probability modeling. For better understanding and\ndescribing from a unified perspective, inspired by the basic generally\nrecognized principles in cognitive psychology, we formulate a novel problem of\nCoding for Intelligence from the category theory view. Based on the three\naxioms: existence of ideal coding, existence of practical coding, and\ncompactness promoting generalization, we derive a general framework to\nunderstand existing methodologies, namely that, coding captures the intrinsic\nrelationships of objects as much as possible, while ignoring information\nirrelevant to downstream tasks. This framework helps identify the challenges\nand essential elements in solving the specific derived Minimal Description\nLength (MDL) optimization problem from a broader range, providing opportunities\nto build a more intelligent system for handling multiple tasks/applications\nwith coding ideas/tools. Centering on those elements, we systematically review\nrecent processes of towards optimizing the MDL problem in more comprehensive\nways from data, model, and task perspectives, and reveal their impacts on the\npotential CfI technical routes. After that, we also present new technique paths\nto fulfill CfI and provide potential solutions with preliminary experimental\nevidence. 
Last, further directions and remaining issues are discussed as well.\nThe discussion shows our theory can reveal many phenomena and insights about\nlarge foundation models, which mutually corroborate with recent practices in\nfeature learning.\n","authors":["Wenhan Yang","Zixuan Hu","Lilang Lin","Jiaying Liu","Ling-Yu Duan"],"pdf_url":"https://arxiv.org/pdf/2407.01017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01960v1","updated":"2024-07-02T05:31:59Z","published":"2024-07-02T05:31:59Z","title":"Zero-shot Video Restoration and Enhancement Using Pre-Trained Image\n Diffusion Model","summary":" Diffusion-based zero-shot image restoration and enhancement models have\nachieved great success in various image restoration and enhancement tasks\nwithout training. However, directly applying them to video restoration and\nenhancement results in severe temporal flickering artifacts. In this paper, we\npropose the first framework for zero-shot video restoration and enhancement\nbased on a pre-trained image diffusion model. By replacing the self-attention\nlayer with the proposed cross-previous-frame attention layer, the pre-trained\nimage diffusion model can take advantage of the temporal correlation between\nneighboring frames. We further propose temporal consistency guidance,\nspatial-temporal noise sharing, and an early stopping sampling strategy for\nbetter temporally consistent sampling. Our method is a plug-and-play module\nthat can be inserted into any diffusion-based zero-shot image restoration or\nenhancement methods to further improve their performance. Experimental results\ndemonstrate the superiority of our proposed method in producing temporally\nconsistent videos with better fidelity.\n","authors":["Cong Cao","Huanjing Yue","Xin Liu","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01960v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2407.01959v1","updated":"2024-07-02T05:31:34Z","published":"2024-07-02T05:31:34Z","title":"FlowTrack: Point-level Flow Network for 3D Single Object Tracking","summary":" 3D single object tracking (SOT) is a crucial task in fields of mobile\nrobotics and autonomous driving. Traditional motion-based approaches achieve\ntarget tracking by estimating the relative movement of target between two\nconsecutive frames. However, they usually overlook local motion information of\nthe target and fail to exploit historical frame information effectively. To\novercome the above limitations, we propose a point-level flow method with\nmulti-frame information for 3D SOT task, called FlowTrack. Specifically, by\nestimating the flow for each point in the target, our method could capture the\nlocal motion details of target, thereby improving the tracking performance. At\nthe same time, to handle scenes with sparse points, we present a learnable\ntarget feature as the bridge to efficiently integrate target information from\npast frames. Moreover, we design a novel Instance Flow Head to transform dense\npoint-level flow into instance-level motion, effectively aggregating local\nmotion information to obtain global target motion. Finally, our method achieves\ncompetitive performance with improvements of 5.9% on the KITTI dataset and 2.9%\non NuScenes. 
The code will be made publicly available soon.\n","authors":["Shuo Li","Yubo Cui","Zhiheng Li","Zheng Fang"],"pdf_url":"https://arxiv.org/pdf/2407.01959v1.pdf","comment":"Accepted by IROS2024"},{"id":"http://arxiv.org/abs/2403.07576v4","updated":"2024-07-02T05:28:03Z","published":"2024-03-12T12:05:43Z","title":"Fine-grained Prompt Tuning: A Parameter and Memory Efficient Transfer\n Learning Method for High-resolution Medical Image Classification","summary":" Parameter-efficient transfer learning (PETL) is proposed as a cost-effective\nway to transfer pre-trained models to downstream tasks, avoiding the high cost\nof updating entire large-scale pre-trained models (LPMs). In this work, we\npresent Fine-grained Prompt Tuning (FPT), a novel PETL method for medical image\nclassification. FPT significantly reduces memory consumption compared to other\nPETL methods, especially in high-resolution input contexts. To achieve this, we\nfirst freeze the weights of the LPM and construct a learnable lightweight side\nnetwork. The frozen LPM takes high-resolution images as input to extract\nfine-grained features, while the side network is fed low-resolution images to\nreduce memory usage. To allow the side network to access pre-trained knowledge,\nwe introduce fine-grained prompts that summarize information from the LPM\nthrough a fusion module. Important tokens selection and preloading techniques\nare employed to further reduce training cost and memory requirements. We\nevaluate FPT on four medical datasets with varying sizes, modalities, and\ncomplexities. Experimental results demonstrate that FPT achieves comparable\nperformance to fine-tuning the entire LPM while using only 1.8% of the\nlearnable parameters and 13% of the memory costs of an encoder ViT-B model with\na 512 x 512 input resolution.\n","authors":["Yijin Huang","Pujin Cheng","Roger Tam","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2403.07576v4.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.10943v2","updated":"2024-07-02T04:42:56Z","published":"2024-06-16T13:47:40Z","title":"Rectified Iterative Disparity for Stereo Matching","summary":" Both uncertainty-assisted and iteration-based methods have achieved great\nsuccess in stereo matching. However, existing uncertainty estimation methods\ntake a single image and the corresponding disparity as input, which imposes\nhigher demands on the estimation network. In this paper, we propose Cost\nvolume-based disparity Uncertainty Estimation (UEC). Based on the rich\nsimilarity information in the cost volume coming from the image pairs, the\nproposed UEC can achieve competitive performance with low computational cost.\nSecondly, we propose two methods of uncertainty-assisted disparity estimation,\nUncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity\nupdate Conditioning (UDC). These two methods optimise the disparity update\nprocess of the iterative-based approach without adding extra parameters. In\naddition, we propose Disparity Rectification loss that significantly improves\nthe accuracy of small amount of disparity updates. We present a\nhigh-performance stereo architecture, DR Stereo, which is a combination of the\nproposed methods. 
Experimental results from SceneFlow, KITTI, Middlebury 2014,\nand ETH3D show that DR-Stereo achieves very competitive disparity estimation\nperformance.\n","authors":["Weiqing Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.10943v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01945v1","updated":"2024-07-02T04:30:13Z","published":"2024-07-02T04:30:13Z","title":"Indoor 3D Reconstruction with an Unknown Camera-Projector Pair","summary":" Structured light-based method with a camera-projector pair (CPP) plays a\nvital role in indoor 3D reconstruction, especially for scenes with weak\ntextures. Previous methods usually assume known intrinsics, which are\npre-calibrated from known objects, or self-calibrated from multi-view\nobservations. It is still challenging to reliably recover CPP intrinsics from\nonly two views without any known objects. In this paper, we provide a simple\nyet reliable solution. We demonstrate that, for the first time, sufficient\nconstraints on CPP intrinsics can be derived from an unknown cuboid corner\n(C2), e.g. a room's corner, which is a common structure in indoor scenes. In\naddition, with only known camera principal point, the complex multi-variable\nestimation of all CPP intrinsics can be simplified to a simple univariable\noptimization problem, leading to reliable calibration and thus direct 3D\nreconstruction with unknown CPP. Extensive results have demonstrated the\nsuperiority of the proposed method over both traditional and learning-based\ncounterparts. Furthermore, the proposed method also demonstrates impressive\npotential to solve similar tasks without active lighting, such as sparse-view\nstructure from motion.\n","authors":["Zhaoshuai Qi","Yifeng Hao","Rui Hu","Wenyou Chang","Jiaqi Yang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01942v1","updated":"2024-07-02T04:23:54Z","published":"2024-07-02T04:23:54Z","title":"Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and\n Aleatoric Awareness","summary":" The ability to acknowledge the inevitable uncertainty in their knowledge and\nreasoning is a prerequisite for AI systems to be truly truthful and reliable.\nIn this paper, we present a taxonomy of uncertainty specific to vision-language\nAI systems, distinguishing between epistemic uncertainty (arising from a lack\nof information) and aleatoric uncertainty (due to inherent unpredictability),\nand further explore finer categories within. Based on this taxonomy, we\nsynthesize a benchmark dataset, CertainlyUncertain, featuring 178K visual\nquestion answering (VQA) samples as contrastive pairs. This is achieved by 1)\ninpainting images to make previously answerable questions into unanswerable\nones; and 2) using image captions to prompt large language models for both\nanswerable and unanswerable questions. 
Additionally, we introduce a new metric\nconfidence-weighted accuracy, that is well correlated with both accuracy and\ncalibration error, to address the shortcomings of existing metrics.\n","authors":["Khyathi Raghavi Chandu","Linjie Li","Anas Awadalla","Ximing Lu","Jae Sung Park","Jack Hessel","Lijuan Wang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2407.01942v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.01931v1","updated":"2024-07-02T03:56:20Z","published":"2024-07-02T03:56:20Z","title":"Probabilistic 3D Correspondence Prediction from Sparse Unsegmented\n Images","summary":" The study of physiology demonstrates that the form (shape)of anatomical\nstructures dictates their functions, and analyzing the form of anatomies plays\na crucial role in clinical research. Statistical shape modeling (SSM) is a\nwidely used tool for quantitative analysis of forms of anatomies, aiding in\ncharacterizing and identifying differences within a population of subjects.\nDespite its utility, the conventional SSM construction pipeline is often\ncomplex and time-consuming. Additionally, reliance on linearity assumptions\nfurther limits the model from capturing clinically relevant variations. Recent\nadvancements in deep learning solutions enable the direct inference of SSM from\nunsegmented medical images, streamlining the process and improving\naccessibility. However, the new methods of SSM from images do not adequately\naccount for situations where the imaging data quality is poor or where only\nsparse information is available. Moreover, quantifying aleatoric uncertainty,\nwhich represents inherent data variability, is crucial in deploying deep\nlearning for clinical tasks to ensure reliable model predictions and robust\ndecision-making, especially in challenging imaging conditions. Therefore, we\npropose SPI-CorrNet, a unified model that predicts 3D correspondences from\nsparse imaging data. It leverages a teacher network to regularize feature\nlearning and quantifies data-dependent aleatoric uncertainty by adapting the\nnetwork to predict intrinsic input variances. Experiments on the LGE MRI left\natrium dataset and Abdomen CT-1K liver datasets demonstrate that our technique\nenhances the accuracy and robustness of sparse image-driven SSM.\n","authors":["Krithika Iyer","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2407.01931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01930v1","updated":"2024-07-02T03:49:48Z","published":"2024-07-02T03:49:48Z","title":"Self-Cooperation Knowledge Distillation for Novel Class Discovery","summary":" Novel Class Discovery (NCD) aims to discover unknown and novel classes in an\nunlabeled set by leveraging knowledge already learned about known classes.\nExisting works focus on instance-level or class-level knowledge representation\nand build a shared representation space to achieve performance improvements.\nHowever, a long-neglected issue is the potential imbalanced number of samples\nfrom known and novel classes, pushing the model towards dominant classes.\nTherefore, these methods suffer from a challenging trade-off between reviewing\nknown classes and discovering novel classes. Based on this observation, we\npropose a Self-Cooperation Knowledge Distillation (SCKD) method to utilize each\ntraining sample (whether known or novel, labeled or unlabeled) for both review\nand discovery. Specifically, the model's feature representations of known and\nnovel classes are used to construct two disjoint representation spaces. 
Through\nspatial mutual information, we design a self-cooperation learning to encourage\nmodel learning from the two feature representation spaces from itself.\nExtensive experiments on six datasets demonstrate that our method can achieve\nsignificant performance improvements, achieving state-of-the-art performance.\n","authors":["Yuzheng Wang","Zhaoyu Chen","Dingkang Yang","Yunquan Sun","Lizhe Qi"],"pdf_url":"https://arxiv.org/pdf/2407.01930v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.01012v2","updated":"2024-07-02T03:48:06Z","published":"2024-07-01T06:52:34Z","title":"Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural\n Network Performance","summary":" We propose the Swish-T family, an enhancement of the existing non-monotonic\nactivation function Swish. Swish-T is defined by adding a Tanh bias to the\noriginal Swish function. This modification creates a family of Swish-T\nvariants, each designed to excel in different tasks, showcasing specific\nadvantages depending on the application context. The Tanh bias allows for\nbroader acceptance of negative values during initial training stages, offering\na smoother non-monotonic curve than the original Swish. We ultimately propose\nthe Swish-T$_{\\textbf{C}}$ function, while Swish-T and Swish-T$_{\\textbf{B}}$,\nbyproducts of Swish-T$_{\\textbf{C}}$, also demonstrate satisfactory\nperformance. Furthermore, our ablation study shows that using\nSwish-T$_{\\textbf{C}}$ as a non-parametric function can still achieve high\nperformance. The superiority of the Swish-T family has been empirically\ndemonstrated across various models and benchmark datasets, including MNIST,\nFashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at\n\"https://github.com/ictseoyoungmin/Swish-T-pytorch\".\n","authors":["Youngmin Seo","Jinha Kim","Unsang Park"],"pdf_url":"https://arxiv.org/pdf/2407.01012v2.pdf","comment":"11 pages, 6 figures Revised the derivative of the sigmoid function\n from 1-sigmoid to sigmoid(1-sigmoid) for correctness.Updated related\n equations in Section 3.2 Conclusions to Conclusion in Section 6"},{"id":"http://arxiv.org/abs/2403.08003v2","updated":"2024-07-02T03:45:56Z","published":"2024-03-12T18:12:42Z","title":"Augmenting Efficient Real-time Surgical Instrument Segmentation in Video\n with Point Tracking and Segment Anything","summary":" The Segment Anything Model (SAM) is a powerful vision foundation model that\nis revolutionizing the traditional paradigm of segmentation. Despite this, a\nreliance on prompting each frame and large computational cost limit its usage\nin robotically assisted surgery. Applications, such as augmented reality\nguidance, require little user intervention along with efficient inference to be\nusable clinically. In this study, we address these limitations by adopting\nlightweight SAM variants to meet the efficiency requirement and employing\nfine-tuning techniques to enhance their generalization in surgical scenes.\nRecent advancements in Tracking Any Point (TAP) have shown promising results in\nboth accuracy and efficiency, particularly when points are occluded or leave\nthe field of view. Inspired by this progress, we present a novel framework that\ncombines an online point tracker with a lightweight SAM model that is\nfine-tuned for surgical instrument segmentation. Sparse points within the\nregion of interest are tracked and used to prompt SAM throughout the video\nsequence, providing temporal consistency. 
The quantitative results surpass the\nstate-of-the-art semi-supervised video object segmentation method XMem on the\nEndoVis 2015 dataset with 84.8 IoU and 91.0 Dice. Our method achieves promising\nperformance that is comparable to XMem and transformer-based fully supervised\nsegmentation methods on ex vivo UCL dVRK and in vivo CholecSeg8k datasets. In\naddition, the proposed method shows promising zero-shot generalization ability\non the label-free STIR dataset. In terms of efficiency, we tested our method on\na single GeForce RTX 4060/4090 GPU respectively, achieving an over 25/90 FPS\ninference speed. Code is available at:\nhttps://github.com/wuzijian1997/SIS-PT-SAM\n","authors":["Zijian Wu","Adam Schmidt","Peter Kazanzides","Septimiu E. Salcudean"],"pdf_url":"https://arxiv.org/pdf/2403.08003v2.pdf","comment":"6 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.01928v1","updated":"2024-07-02T03:45:09Z","published":"2024-07-02T03:45:09Z","title":"SymPoint Revolutionized: Boosting Panoptic Symbol Spotting with Layer\n Feature Enhancement","summary":" SymPoint is an initial attempt that utilizes point set representation to\nsolve the panoptic symbol spotting task on CAD drawing. Despite its\nconsiderable success, it overlooks graphical layer information and suffers from\nprohibitively slow training convergence. To tackle this issue, we introduce\nSymPoint-V2, a robust and efficient solution featuring novel, streamlined\ndesigns that overcome these limitations. In particular, we first propose a\nLayer Feature-Enhanced module (LFE) to encode the graphical layer information\ninto the primitive feature, which significantly boosts the performance. We also\ndesign a Position-Guided Training (PGT) method to make it easier to learn,\nwhich accelerates the convergence of the model in the early stages and further\npromotes performance. Extensive experiments show that our model achieves better\nperformance and faster convergence than its predecessor SymPoint on the public\nbenchmark. Our code and trained models are available at\nhttps://github.com/nicehuster/SymPointV2.\n","authors":["Wenlong Liu","Tianyu Yang","Qizhi Yu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01928v1.pdf","comment":"code at https://github.com/nicehuster/SymPointV2"},{"id":"http://arxiv.org/abs/2407.01926v1","updated":"2024-07-02T03:43:39Z","published":"2024-07-02T03:43:39Z","title":"Chemical Shift Encoding based Double Bonds Quantification in\n Triglycerides using Deep Image Prior","summary":" This study evaluated a deep learning-based method using Deep Image Prior\n(DIP) to quantify triglyceride double bonds from chemical-shift encoded\nmulti-echo gradient echo images without network training. We employed a cost\nfunction based on signal constraints to iteratively update the neural network\non a single dataset. The method was validated using phantom experiments and in\nvivo scans. Results showed close alignment between measured and reference\ndouble bond values, with phantom experiments yielding a Pearson correlation\ncoefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in\nsubcutaneous fat. 
We conclude that Deep Image Prior shows feasibility for\nquantifying double bonds and fatty acid content from chemical-shift encoded\nmulti-echo MRI.\n","authors":["Chaoxing Huang","Ziqiang Yu","Zijian Gao","Qiuyi Shen","Queenie Chan","Vincent Wai-Sun Wong","Winnie Chiu-Wing Chu","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01925v1","updated":"2024-07-02T03:42:32Z","published":"2024-07-02T03:42:32Z","title":"Looking From the Future: Multi-order Iterations Can Enhance Adversarial\n Attack Transferability","summary":" Various methods try to enhance adversarial transferability by improving the\ngeneralization from different perspectives. In this paper, we rethink the\noptimization process and propose a novel sequence optimization concept, which\nis named Looking From the Future (LFF). LFF makes use of the original\noptimization process to refine the very first local optimization choice.\nAdapting the LFF concept to the adversarial attack task, we further propose an\nLFF attack as well as an MLFF attack with better generalization ability.\nFurthermore, guiding with the LFF concept, we propose an $LLF^{\\mathcal{N}}$\nattack which entends the LFF attack to a multi-order attack, further enhancing\nthe transfer attack ability. All our proposed methods can be directly applied\nto the iteration-based attack methods. We evaluate our proposed method on the\nImageNet1k dataset by applying several SOTA adversarial attack methods under\nfour kinds of tasks. Experimental results show that our proposed method can\ngreatly enhance the attack transferability. Ablation experiments are also\napplied to verify the effectiveness of each component. The source code will be\nreleased after this paper is accepted.\n","authors":["Zijian Ying","Qianmu Li","Tao Wang","Zhichao Lian","Shunmei Meng","Xuyun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01921v1","updated":"2024-07-02T03:36:23Z","published":"2024-07-02T03:36:23Z","title":"GVDIFF: Grounded Text-to-Video Generation with Diffusion Models","summary":" In text-to-video (T2V) generation, significant attention has been directed\ntoward its development, yet unifying discrete and continuous grounding\nconditions in T2V generation remains under-explored. This paper proposes a\nGrounded text-to-Video generation framework, termed GVDIFF. First, we inject\nthe grounding condition into the self-attention through an uncertainty-based\nrepresentation to explicitly guide the focus of the network. Second, we\nintroduce a spatial-temporal grounding layer that connects the grounding\ncondition with target objects and enables the model with the grounded\ngeneration capacity in the spatial-temporal domain. Third, our dynamic gate\nnetwork adaptively skips the redundant grounding process to selectively extract\ngrounding information and semantics while improving efficiency. We extensively\nevaluate the grounded generation capacity of GVDIFF and demonstrate its\nversatility in applications, including long-range video generation, sequential\nprompts, and object-specific editing.\n","authors":["Huanzhang Dou","Ruixiang Li","Wei Su","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2407.01921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01920v1","updated":"2024-07-02T03:34:16Z","published":"2024-07-02T03:34:16Z","title":"To Forget or Not? 
Towards Practical Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) trained on extensive corpora inevitably retain\nsensitive data, such as personal privacy information and copyrighted material.\nRecent advancements in knowledge unlearning involve updating LLM parameters to\nerase specific knowledge. However, current unlearning paradigms are mired in\nvague forgetting boundaries, often erasing knowledge indiscriminately. In this\nwork, we introduce KnowUnDo, a benchmark containing copyrighted content and\nuser privacy domains to evaluate if the unlearning process inadvertently erases\nessential knowledge. Our findings indicate that existing unlearning methods\noften suffer from excessive unlearning. To address this, we propose a simple\nyet effective method, MemFlex, which utilizes gradient information to precisely\ntarget and unlearn sensitive parameters. Experimental results show that MemFlex\nis superior to existing methods in both precise knowledge unlearning and\ngeneral knowledge retaining of LLMs. Code and dataset will be released at\nhttps://github.com/zjunlp/KnowUnDo.\n","authors":["Bozhong Tian","Xiaozhuan Liang","Siyuan Cheng","Qingbin Liu","Mengru Wang","Dianbo Sui","Xi Chen","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01920v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.01919v1","updated":"2024-07-02T03:33:42Z","published":"2024-07-02T03:33:42Z","title":"A Method to Facilitate Membership Inference Attacks in Deep Learning\n Models","summary":" Modern machine learning (ML) ecosystems offer a surging number of ML\nframeworks and code repositories that can greatly facilitate the development of\nML models. Today, even ordinary data holders who are not ML experts can apply\noff-the-shelf codebase to build high-performance ML models on their data, many\nof which are sensitive in nature (e.g., clinical records).\n In this work, we consider a malicious ML provider who supplies model-training\ncode to the data holders, does not have access to the training process, and has\nonly black-box query access to the resulting model. In this setting, we\ndemonstrate a new form of membership inference attack that is strictly more\npowerful than prior art. Our attack empowers the adversary to reliably\nde-identify all the training samples (average >99% attack TPR@0.1% FPR), and\nthe compromised models still maintain competitive performance as their\nuncorrupted counterparts (average <1% accuracy drop). 
Moreover, we show that\nthe poisoned models can effectively disguise the amplified membership leakage\nunder common membership privacy auditing, which can only be revealed by a set\nof secret samples known by the adversary.\n Overall, our study not only points to the worst-case membership privacy\nleakage, but also unveils a common pitfall underlying existing privacy auditing\nmethods, which calls for future efforts to rethink the current practice of\nauditing membership privacy in machine learning models.\n","authors":["Zitao Chen","Karthik Pattabiraman"],"pdf_url":"https://arxiv.org/pdf/2407.01919v1.pdf","comment":"NDSS'25 (a shorter version of this paper will appear in the\n conference proceeding)"},{"id":"http://arxiv.org/abs/2404.15274v2","updated":"2024-07-02T03:31:16Z","published":"2024-04-23T17:59:12Z","title":"Metric-guided Image Reconstruction Bounds via Conformal Prediction","summary":" Recent advancements in machine learning have led to the development of novel\nmedical imaging systems and algorithms that address ill-posed problems.\nAssessing their trustworthiness and understanding how to deploy them safely at\ntest time remains an important and open problem. In this work, we propose using\nconformal prediction to compute valid and distribution-free bounds on\ndownstream metrics given reconstructions generated by one algorithm, and\nretrieve upper/lower bounds and inlier/outlier reconstructions according to the\nadjusted bounds. Our work offers 1) test time image reconstruction evaluation\nwithout ground truth, 2) downstream performance guarantees, 3) meaningful\nupper/lower bound reconstructions, and 4) meaningful statistical\ninliers/outlier reconstructions. We demonstrate our method on post-mastectomy\nradiotherapy planning using 3D breast CT reconstructions, and show 1) that\nmetric-guided bounds have valid coverage for downstream metrics while\nconventional pixel-wise bounds do not and 2) anatomical differences of\nupper/lower bounds between metric-guided and pixel-wise methods. Our work paves\nway for more meaningful and trustworthy test-time evaluation of medical image\nreconstructions. Code available at\nhttps://github.com/matthewyccheung/conformal-metric\n","authors":["Matt Y Cheung","Tucker J Netherton","Laurence E Court","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2404.15274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05994v6","updated":"2024-07-02T03:24:09Z","published":"2023-01-15T00:55:40Z","title":"Min-Max-Jump distance and its applications","summary":" We explore three applications of Min-Max-Jump distance (MMJ distance).\nMMJ-based K-means revises K-means with MMJ distance. MMJ-based Silhouette\ncoefficient revises Silhouette coefficient with MMJ distance. We also tested\nthe Clustering with Neural Network and Index (CNNI) model with MMJ-based\nSilhouette coefficient. In the last application, we tested using Min-Max-Jump\ndistance for predicting labels of new points, after a clustering analysis of\ndata. Result shows Min-Max-Jump distance achieves good performances in all the\nthree proposed applications. 
In addition, we devise several algorithms for\ncalculating or estimating the distance.\n","authors":["Gangli Liu"],"pdf_url":"https://arxiv.org/pdf/2301.05994v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02424v3","updated":"2024-07-02T03:13:56Z","published":"2022-06-06T08:34:52Z","title":"Slim-neck by GSConv: A lightweight-design for real-time detector\n architectures","summary":" Real-time object detection is significant for industrial and research fields.\nOn edge devices, a giant model can hardly meet the real-time detection\nrequirement, and a lightweight model built from a large number of depth-wise\nseparable convolution layers cannot achieve sufficient accuracy. We introduce\na new lightweight convolutional technique, GSConv, to lighten the model while\nmaintaining the accuracy. GSConv accomplishes an excellent trade-off between\naccuracy and speed. Furthermore, we provide a design suggestion based on\nthe GSConv, Slim-Neck (SNs), to achieve a higher computational\ncost-effectiveness for real-time detectors. The effectiveness of the SNs was\nrobustly demonstrated in over twenty sets of comparative experiments. In\nparticular, the real-time detectors ameliorated by the SNs obtain\nstate-of-the-art results (70.9% AP50 for the SODA10M at a speed of ~ 100FPS on a Tesla\nT4) compared with the baselines. Code is available at\nhttps://github.com/alanli1997/slim-neck-by-gsconv\n","authors":["Hulin Li","Jun Li","Hanbing Wei","Zheng Liu","Zhenfei Zhan","Qiliang Ren"],"pdf_url":"https://arxiv.org/pdf/2206.02424v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01908v1","updated":"2024-07-02T03:13:34Z","published":"2024-07-02T03:13:34Z","title":"Efficient Stochastic Differential Equation for DEM Super Resolution with\n Void Filling","summary":" Digital Elevation Model (DEM) plays a fundamental role in remote sensing and\nphotogrammetry. Enhancing the quality of DEM is crucial for various\napplications. Although multiple types of defects may appear simultaneously in\nthe same DEM, they are commonly addressed separately. Most existing approaches\nonly aim to fill the DEM voids, or apply super-resolution to the intact DEM.\nThis paper introduces a unified generative model that simultaneously addresses\nvoids and low-resolution problems, rather than taking two separate measures.\nThe proposed approach presents the DEM Stochastic Differential Equation\n(DEM-SDE) for unified DEM quality enhancement. The DEM degradation of\ndownsampling and random void adding is modeled as the SDE forward process, and the\nrestoration is achieved by simulating the corresponding reverse process.\nConditioned on the terrain feature, and adopting efficient submodules with\nlightweight channel attention, DEM-SDE simultaneously enhances the DEM quality\nwith an efficient training process. The experiments show that the DEM-SDE\nmethod achieves highly competitive performance in simultaneous super-resolution\nand void filling compared to the state-of-the-art work. DEM-SDE also manifests\nrobustness for larger DEM patches.\n","authors":["Tongtong Zhang","Zongcheng Zuo","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2407.01908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01907v1","updated":"2024-07-02T03:13:27Z","published":"2024-07-02T03:13:27Z","title":"The Solution for the ICCV 2023 Perception Test Challenge 2023 -- Task 6\n -- Grounded videoQA","summary":" In this paper, we introduce a grounded video question-answering solution. 
Our\nresearch reveals that the fixed official baseline method for video question\nanswering involves two main steps: visual grounding and object tracking.\nHowever, a significant challenge emerges during the initial step, where\nselected frames may lack clearly identifiable target objects. Furthermore,\nsingle images cannot address questions like \"Track the container from which the\nperson pours the first time.\" To tackle this issue, we propose an alternative\ntwo-stage approach: (1) first, we leverage the VALOR model to answer questions\nbased on video information; (2) then, we concatenate the answered questions with their\nrespective answers. Finally, we employ TubeDETR to generate bounding boxes for\nthe targets.\n","authors":["Hailiang Zhang","Dian Chao","Zhihao Guan","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01905v1","updated":"2024-07-02T03:09:40Z","published":"2024-07-02T03:09:40Z","title":"Enhancing Multi-Class Anomaly Detection via Diffusion Refinement with\n Dual Conditioning","summary":" Anomaly detection, the technique of identifying abnormal samples using only\nnormal samples, has attracted widespread interest in industry. Existing\none-model-per-category methods often struggle with limited generalization\ncapabilities due to their focus on a single category, and can fail when\nencountering variations in products. Recent feature reconstruction methods, as\nrepresentatives of one-model-all-categories schemes, face challenges including\nreconstructing anomalous samples and blurry reconstructions. In this paper, we\ncreatively combine a diffusion model and a transformer for multi-class anomaly\ndetection. This approach leverages diffusion to obtain high-frequency\ninformation for refinement, greatly alleviating the blurry reconstruction\nproblem while maintaining the sampling efficiency of the reverse diffusion\nprocess. The task is transformed into image inpainting to disconnect the\ninput-output correlation, thereby mitigating the \"identical shortcuts\" problem\nand preventing the model from reconstructing anomalous samples. Besides, we\nintroduce category-awareness using dual conditioning to ensure the accuracy of\nprediction and reconstruction in the reverse diffusion process, preventing\nexcessive deviation from the target category, thus effectively enabling\nmulti-class anomaly detection. Furthermore, spatio-temporal fusion is also\nemployed to fuse heatmaps predicted at different timesteps and scales,\nenhancing the performance of multi-class anomaly detection. Extensive\nexperiments on benchmark datasets demonstrate the superior performance and\nexceptional multi-class anomaly detection capabilities of our proposed method\ncompared to others.\n","authors":["Jiawei Zhan","Jinxiang Lai","Bin-Bin Gao","Jun Liu","Xiaochen Chen","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01903v1","updated":"2024-07-02T03:08:20Z","published":"2024-07-02T03:08:20Z","title":"Text-Aware Diffusion for Policy Learning","summary":" Training an agent to achieve particular goals or perform desired behaviors is\noften accomplished through reinforcement learning, especially in the absence of\nexpert demonstrations. However, supporting novel goals or behaviors through\nreinforcement learning requires the ad-hoc design of appropriate reward\nfunctions, which quickly becomes intractable. 
To address this challenge, we\npropose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a\npretrained, frozen text-conditioned diffusion model to compute dense zero-shot\nreward signals for text-aligned policy learning. We hypothesize that\nlarge-scale pretrained generative models encode rich priors that can supervise\na policy to behave not only in a text-aligned manner, but also in alignment\nwith a notion of naturalness summarized from internet-scale training data. In\nour experiments, we demonstrate that TADPoLe is able to learn policies for\nnovel goal-achievement and continuous locomotion behaviors specified by natural\nlanguage, in both Humanoid and Dog environments. The behaviors are learned\nzero-shot without ground-truth rewards or expert demonstrations, and are\nqualitatively more natural according to human evaluation. We further show that\nTADPoLe performs competitively when applied to robotic manipulation tasks in\nthe Meta-World environment.\n","authors":["Calvin Luo","Mandy He","Zilai Zeng","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10787v2","updated":"2024-07-02T02:38:45Z","published":"2024-06-16T03:00:16Z","title":"Evidential Uncertainty Sets in Deep Classifiers Using Conformal\n Prediction","summary":" In this paper, we propose Evidential Conformal Prediction (ECP) method for\nimage classifiers to generate the conformal prediction sets. Our method is\ndesigned based on a non-conformity score function that has its roots in\nEvidential Deep Learning (EDL) as a method of quantifying model (epistemic)\nuncertainty in DNN classifiers. We use evidence that are derived from the logit\nvalues of target labels to compute the components of our non-conformity score\nfunction: the heuristic notion of uncertainty in CP, uncertainty surprisal, and\nexpected utility. Our extensive experimental evaluation demonstrates that ECP\noutperforms three state-of-the-art methods for generating CP sets, in terms of\ntheir set sizes and adaptivity while maintaining the coverage of true labels.\n","authors":["Hamed Karimi","Reza Samavi"],"pdf_url":"https://arxiv.org/pdf/2406.10787v2.pdf","comment":"Accepted in 13th Symposium on Conformal and Probabilistic Prediction\n with Applications (COPA2024). To be published in the Proceedings of Machine\n Learning Research (PMLR), vol. 230, 2024 (24 Pages)"},{"id":"http://arxiv.org/abs/2406.18844v3","updated":"2024-07-02T02:36:01Z","published":"2024-06-27T02:31:03Z","title":"Revisiting Backdoor Attacks against Large Vision-Language Models","summary":" Instruction tuning enhances large vision-language models (LVLMs) but raises\nsecurity risks through potential backdoor attacks due to their openness.\nPrevious backdoor studies focus on enclosed scenarios with consistent training\nand testing instructions, neglecting the practical domain gaps that could\naffect attack effectiveness. This paper empirically examines the\ngeneralizability of backdoor attacks during the instruction tuning of LVLMs for\nthe first time, revealing certain limitations of most backdoor strategies in\npractical scenarios. We quantitatively evaluate the generalizability of six\ntypical backdoor attacks on image caption benchmarks across multiple LVLMs,\nconsidering both visual and textual domain offsets. Our findings indicate that\nattack generalizability is positively correlated with the backdoor trigger's\nirrelevance to specific images/models and the preferential correlation of the\ntrigger pattern. 
Additionally, we modify existing backdoor attacks based on the\nabove key observations, demonstrating significant improvements in cross-domain\nscenario generalizability (+86% attack success rate). Notably, even without\naccess to the instruction datasets, a multimodal instruction set can be\nsuccessfully poisoned with a very low poisoning rate (0.2%), achieving an\nattack success rate of over 97%. This paper underscores that even simple\ntraditional backdoor strategies pose a serious threat to LVLMs, necessitating\nmore attention and in-depth research.\n","authors":["Siyuan Liang","Jiawei Liang","Tianyu Pang","Chao Du","Aishan Liu","Ee-Chien Chang","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2406.18844v3.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.01894v1","updated":"2024-07-02T02:30:23Z","published":"2024-07-02T02:30:23Z","title":"Adaptive Modality Balanced Online Knowledge Distillation for\n Brain-Eye-Computer based Dim Object Detection","summary":" Advanced cognition can be extracted from the human brain using brain-computer\ninterfaces. Integrating these interfaces with computer vision techniques, which\npossess efficient feature extraction capabilities, can achieve more robust and\naccurate detection of dim targets in aerial images. However, existing target\ndetection methods primarily concentrate on homogeneous data, lacking efficient\nand versatile processing capabilities for heterogeneous multimodal data. In\nthis paper, we first build a brain-eye-computer based object detection system\nfor aerial images under few-shot conditions. This system detects suspicious\ntargets using region proposal networks, evokes the event-related potential\n(ERP) signal in electroencephalogram (EEG) through the eye-tracking-based slow\nserial visual presentation (ESSVP) paradigm, and constructs the EEG-image data\npairs with eye movement data. Then, an adaptive modality balanced online\nknowledge distillation (AMBOKD) method is proposed to recognize dim objects\nwith the EEG-image data. AMBOKD fuses EEG and image features using a multi-head\nattention module, establishing a new modality with comprehensive features. To\nenhance the performance and robust capability of the fusion modality,\nsimultaneous training and mutual learning between modalities are enabled by\nend-to-end online knowledge distillation. During the learning process, an\nadaptive modality balancing module is proposed to ensure multimodal equilibrium\nby dynamically adjusting the weights of the importance and the training\ngradients across various modalities. The effectiveness and superiority of our\nmethod are demonstrated by comparing it with existing state-of-the-art methods.\nAdditionally, experiments conducted on public datasets and system validations\nin real-world scenarios demonstrate the reliability and practicality of the\nproposed system and the designed method.\n","authors":["Zixing Li","Chao Yan","Zhen Lan","Dengqing Tang","Xiaojia Xiang","Han Zhou","Jun Lai"],"pdf_url":"https://arxiv.org/pdf/2407.01894v1.pdf","comment":"18 pages,15 figures"},{"id":"http://arxiv.org/abs/2209.01763v2","updated":"2024-07-02T02:26:09Z","published":"2022-09-05T04:52:12Z","title":"Uformer-ICS: A U-Shaped Transformer for Image Compressive Sensing\n Service","summary":" Many service computing applications require real-time dataset collection from\nmultiple devices, necessitating efficient sampling techniques to reduce\nbandwidth and storage pressure. 
Compressive sensing (CS) has found wide-ranging\napplications in image acquisition and reconstruction. Recently, numerous\ndeep-learning methods have been introduced for CS tasks. However, the accurate\nreconstruction of images from measurements remains a significant challenge,\nespecially at low sampling rates. In this paper, we propose Uformer-ICS as a\nnovel U-shaped transformer for image CS tasks by introducing inner\ncharacteristics of CS into transformer architecture. To utilize the uneven\nsparsity distribution of image blocks, we design an adaptive sampling\narchitecture that allocates measurement resources based on the estimated block\nsparsity, allowing the compressed results to retain maximum information from\nthe original image. Additionally, we introduce a multi-channel projection (MCP)\nmodule inspired by traditional CS optimization methods. By integrating the MCP\nmodule into the transformer blocks, we construct projection-based transformer\nblocks, and then form a symmetrical reconstruction model using these blocks and\nresidual convolutional blocks. Therefore, our reconstruction model can\nsimultaneously utilize the local features and long-range dependencies of image,\nand the prior projection knowledge of CS theory.\n Experimental results demonstrate its significantly better reconstruction\nperformance than state-of-the-art deep learning-based CS methods.\n","authors":["Kuiyuan Zhang","Zhongyun Hua","Yuanman Li","Yushu Zhang","Yicong Zhou"],"pdf_url":"https://arxiv.org/pdf/2209.01763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01888v1","updated":"2024-07-02T02:18:35Z","published":"2024-07-02T02:18:35Z","title":"PO-MSCKF: An Efficient Visual-Inertial Odometry by Reconstructing the\n Multi-State Constrained Kalman Filter with the Pose-only Theory","summary":" Efficient Visual-Inertial Odometry (VIO) is crucial for payload-constrained\nrobots. Though modern optimization-based algorithms have achieved superior\naccuracy, the MSCKF-based VIO algorithms are still widely demanded for their\nefficient and consistent performance. As MSCKF is built upon the conventional\nmulti-view geometry, the measured residuals are not only related to the state\nerrors but also related to the feature position errors. To apply EKF fusion, a\nprojection process is required to remove the feature position error from the\nobservation model, which can lead to model and accuracy degradation. To obtain\nan efficient visual-inertial fusion model, while also preserving the model\nconsistency, we propose to reconstruct the MSCKF VIO with the novel Pose-Only\n(PO) multi-view geometry description. In the newly constructed filter, we have\nmodeled PO reprojection residuals, which are solely related to the motion\nstates and thus overcome the requirements of space projection. Moreover, the\nnew filter does not require any feature position information, which removes the\ncomputational cost and linearization errors brought in by the 3D reconstruction\nprocedure. 
We have conducted comprehensive experiments on multiple datasets,\nwhere the proposed method has shown accuracy improvements and consistent\nperformance in challenging sequences.\n","authors":["Du Xueyu","Zhang Lilian","Liu Ruochen","Wang Maosong","Wu Wenqi","Mao Jun"],"pdf_url":"https://arxiv.org/pdf/2407.01888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09192v2","updated":"2024-07-02T02:18:21Z","published":"2024-03-14T09:06:49Z","title":"PYRA: Parallel Yielding Re-Activation for Training-Inference Efficient\n Task Adaptation","summary":" Recently, the scale of transformers has grown rapidly, which introduces\nconsiderable challenges in terms of training overhead and inference efficiency\nin the scope of task adaptation. Existing works, namely Parameter-Efficient\nFine-Tuning (PEFT) and model compression, have separately investigated the\nchallenges. However, PEFT cannot guarantee the inference efficiency of the\noriginal backbone, especially for large-scale models. Model compression\nrequires significant training costs for structure searching and re-training.\nConsequently, a simple combination of them cannot guarantee accomplishing both\ntraining efficiency and inference efficiency with minimal costs. In this paper,\nwe propose a novel Parallel Yielding Re-Activation (PYRA) method for such a\nchallenge of training-inference efficient task adaptation. PYRA first utilizes\nparallel yielding adaptive weights to comprehensively perceive the data\ndistribution in downstream tasks. A re-activation strategy for token modulation\nis then applied for tokens to be merged, leading to calibrated token features.\nExtensive experiments demonstrate that PYRA outperforms all competing methods\nunder both low compression rate and high compression rate, demonstrating its\neffectiveness and superiority in maintaining both training efficiency and\ninference efficiency for large-scale foundation models. Our code will be\nreleased to the public.\n","authors":["Yizhe Xiong","Hui Chen","Tianxiang Hao","Zijia Lin","Jungong Han","Yuesong Zhang","Guoxin Wang","Yongjun Bao","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2403.09192v2.pdf","comment":"15 pages, 5 figures, Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.01884v1","updated":"2024-07-02T02:11:15Z","published":"2024-07-02T02:11:15Z","title":"EIT-1M: One Million EEG-Image-Text Pairs for Human Visual-textual\n Recognition and More","summary":" Recently, electroencephalography (EEG) signals have been actively\nincorporated to decode brain activity to visual or textual stimuli and achieve\nobject recognition in multi-modal AI. Accordingly, endeavors have been focused\non building EEG-based datasets from visual or textual single-modal stimuli.\nHowever, these datasets offer limited EEG epochs per category, and the complex\nsemantics of stimuli presented to participants compromise their quality and\nfidelity in capturing precise brain activity. The study in neuroscience unveils\nthat the relationship between visual and textual stimulus in EEG recordings\nprovides valuable insights into the brain's ability to process and integrate\nmulti-modal information simultaneously. Inspired by this, we propose a novel\nlarge-scale multi-modal dataset, named EIT-1M, with over 1 million\nEEG-image-text pairs. Our dataset is superior in its capacity of reflecting\nbrain activities in simultaneously processing multi-modal information. 
To\nachieve this, we collected data pairs while participants viewed alternating\nsequences of visual-textual stimuli from 60K natural images and\ncategory-specific texts. Common semantic categories are also included to elicit\nbetter reactions from participants' brains. Meanwhile, response-based stimulus\ntiming and repetition across blocks and sessions are included to ensure data\ndiversity. To verify the effectiveness of EIT-1M, we provide an in-depth\nanalysis of EEG data captured from multi-modal stimuli across different\ncategories and participants, along with data quality scores for transparency.\nWe demonstrate its validity on two tasks: 1) EEG recognition from visual or\ntextual stimuli or both and 2) EEG-to-visual generation.\n","authors":["Xu Zheng","Ling Wang","Kanghao Chen","Yuanhuiyi Lyu","Jiazhou Zhou","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09411v2","updated":"2024-07-02T01:56:14Z","published":"2024-06-13T17:59:52Z","title":"MuirBench: A Comprehensive Benchmark for Robust Multi-image\n Understanding","summary":" We introduce MuirBench, a comprehensive benchmark that focuses on robust\nmulti-image understanding capabilities of multimodal LLMs. MuirBench consists\nof 12 diverse multi-image tasks (e.g., scene understanding, ordering) that\ninvolve 10 categories of multi-image relations (e.g., multiview, temporal\nrelations). Comprising 11,264 images and 2,600 multiple-choice questions,\nMuirBench is created in a pairwise manner, where each standard instance is\npaired with an unanswerable variant that has minimal semantic differences, in\norder for a reliable assessment. Evaluated upon 20 recent multi-modal LLMs, our\nresults reveal that even the best-performing models like GPT-4o and Gemini Pro\nfind it challenging to solve MuirBench, achieving 68.0% and 49.3% in accuracy.\nOpen-source multimodal LLMs trained on single images can hardly generalize to\nmulti-image questions, hovering below 33.3% in accuracy. These results\nhighlight the importance of MuirBench in encouraging the community to develop\nmultimodal LLMs that can look beyond a single image, suggesting potential\npathways for future improvements.\n","authors":["Fei Wang","Xingyu Fu","James Y. Huang","Zekun Li","Qin Liu","Xiaogeng Liu","Mingyu Derek Ma","Nan Xu","Wenxuan Zhou","Kai Zhang","Tianyi Lorena Yan","Wenjie Jacky Mo","Hsiang-Hui Liu","Pan Lu","Chunyuan Li","Chaowei Xiao","Kai-Wei Chang","Dan Roth","Sheng Zhang","Hoifung Poon","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.09411v2.pdf","comment":"typos corrected, references added, Project Page:\n https://muirbench.github.io/"},{"id":"http://arxiv.org/abs/2401.12963v2","updated":"2024-07-02T01:52:26Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks. However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. 
AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v2.pdf","comment":"26 pages, 9 figures, ICRA 2024 VLMNM Workshop"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.02485v1","updated":"2024-07-02T17:59:17Z","published":"2024-07-02T17:59:17Z","title":"RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in\n LLMs","summary":" Large language models (LLMs) typically utilize the top-k contexts from a\nretriever in retrieval-augmented generation (RAG). In this work, we propose a\nnovel instruction fine-tuning framework RankRAG, which instruction-tunes a\nsingle LLM for the dual purpose of context ranking and answer generation in\nRAG. In particular, the instruction-tuned LLMs work surprisingly well by adding\na small fraction of ranking data into the training blend, and outperform\nexisting expert ranking models, including the same LLM exclusively fine-tuned\non a large amount of ranking data. For generation, we compare our model with\nmany strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and\nChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG\nbenchmarks. Specifically, our Llama3-RankRAG significantly outperforms\nLlama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In\naddition, it also performs comparably to GPT-4 on five RAG benchmarks in the\nbiomedical domain without instruction fine-tuning on biomedical data,\ndemonstrating its superb capability for generalization to new domains.\n","authors":["Yue Yu","Wei Ping","Zihan Liu","Boxin Wang","Jiaxuan You","Chao Zhang","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.02485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02464v1","updated":"2024-07-02T17:44:00Z","published":"2024-07-02T17:44:00Z","title":"Reliable Confidence Intervals for Information Retrieval Evaluation Using\n Generative A.I","summary":" The traditional evaluation of information retrieval (IR) systems is generally\nvery costly as it requires manual relevance annotation from human experts.\nRecent advancements in generative artificial intelligence -- specifically large\nlanguage models (LLMs) -- can generate relevance annotations at an enormous\nscale with relatively small computational costs. 
Potentially, this could\nalleviate the costs traditionally associated with IR evaluation and make it\napplicable to numerous low-resource applications. However, generated relevance\nannotations are not immune to (systematic) errors, and as a result, directly\nusing them for evaluation produces unreliable results.\n In this work, we propose two methods based on prediction-powered inference\nand conformal risk control that utilize computer-generated relevance\nannotations to place reliable confidence intervals (CIs) around IR evaluation\nmetrics. Our proposed methods require a small number of reliable annotations\nfrom which the methods can statistically analyze the errors in the generated\nannotations. Using this information, we can place CIs around evaluation metrics\nwith strong theoretical guarantees. Unlike existing approaches, our conformal\nrisk control method is specifically designed for ranking metrics and can vary\nits CIs per query and document. Our experimental results show that our CIs\naccurately capture both the variance and bias in evaluation based on LLM\nannotations, better than the typical empirical bootstrapping estimates. We hope\nour contributions bring reliable evaluation to the many IR applications where\nthis was traditionally infeasible.\n","authors":["Harrie Oosterhuis","Rolf Jagerman","Zhen Qin","Xuanhui Wang","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2407.02464v1.pdf","comment":"KDD '24"},{"id":"http://arxiv.org/abs/2406.14325v2","updated":"2024-07-02T15:36:32Z","published":"2024-06-20T13:56:42Z","title":"Reproducibility in Machine Learning-based Research: Overview, Barriers\n and Drivers","summary":" Research in various fields is currently experiencing challenges regarding the\nreproducibility of results. This problem is also prevalent in machine learning\n(ML) research. The issue arises, for example, due to unpublished data and/or\nsource code and the sensitivity of ML training conditions. Although different\nsolutions have been proposed to address this issue, such as using ML platforms,\nthe level of reproducibility in ML-driven research remains unsatisfactory.\nTherefore, in this article, we discuss the reproducibility of ML-driven\nresearch with three main aims: (i) identifying the barriers to reproducibility\nwhen applying ML in research as well as categorize the barriers to different\ntypes of reproducibility (description, code, data, and experiment\nreproducibility), (ii) discussing potential drivers such as tools, practices,\nand interventions that support ML reproducibility, as well as distinguish\nbetween technology-driven drivers, procedural drivers, and drivers related to\nawareness and education, and (iii) mapping the drivers to the barriers. With\nthis work, we hope to provide insights and to contribute to the decision-making\nprocess regarding the adoption of different solutions to support ML\nreproducibility.\n","authors":["Harald Semmelrock","Tony Ross-Hellauer","Simone Kopeinik","Dieter Theiler","Armin Haberl","Stefan Thalmann","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2406.14325v2.pdf","comment":"Pre-print of submission for the AI Magazine - comments to this\n pre-print are very welcome"},{"id":"http://arxiv.org/abs/2401.10733v2","updated":"2024-07-02T15:14:29Z","published":"2024-01-19T14:50:22Z","title":"Dynamic Q&A of Clinical Documents with Large Language Models","summary":" Electronic health records (EHRs) house crucial patient data in clinical\nnotes. 
As these notes grow in volume and complexity, manual extraction becomes\nchallenging. This work introduces a natural language interface using large\nlanguage models (LLMs) for dynamic question-answering on clinical notes. Our\nchatbot, powered by Langchain and transformer-based LLMs, allows users to query\nin natural language, receiving relevant answers from clinical notes.\nExperiments, utilizing various embedding models and advanced LLMs, show Wizard\nVicuna's superior accuracy, albeit with high compute demands. Model\noptimization, including weight quantization, improves latency by approximately\n48 times. Promising results indicate potential, yet challenges such as model\nhallucinations and limited diverse medical case evaluations remain. Addressing\nthese gaps is crucial for unlocking the value in clinical notes and advancing\nAI-driven clinical decision-making.\n","authors":["Ran Elgedawy","Ioana Danciu","Maria Mahbub","Sudarshan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2401.10733v2.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.01449v2","updated":"2024-07-02T13:02:58Z","published":"2024-06-27T15:45:29Z","title":"ColPali: Efficient Document Retrieval with Vision Language Models","summary":" Documents are visually rich structures that convey information through text,\nas well as tables, figures, page layouts, or fonts. While modern document\nretrieval systems exhibit strong performance on query-to-text matching, they\nstruggle to exploit visual cues efficiently, hindering their performance on\npractical document retrieval applications such as Retrieval Augmented\nGeneration. To benchmark current systems on visually rich document retrieval,\nwe introduce the Visual Document Retrieval Benchmark ViDoRe, composed of\nvarious page-level retrieving tasks spanning multiple domains, languages, and\nsettings. The inherent shortcomings of modern systems motivate the introduction\nof a new retrieval model architecture, ColPali, which leverages the document\nunderstanding capabilities of recent Vision Language Models to produce\nhigh-quality contextualized embeddings solely from images of document pages.\nCombined with a late interaction matching mechanism, ColPali largely\noutperforms modern document retrieval pipelines while being drastically faster\nand end-to-end trainable.\n","authors":["Manuel Faysse","Hugues Sibille","Tony Wu","Bilel Omrani","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2407.01449v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2308.01118v3","updated":"2024-07-02T12:39:41Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations todays\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to the limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. In this paper, we discuss the potential reasons for\npopularity bias and review existing approaches to detect, quantify and mitigate\npopularity bias in recommender systems. 
Our survey, therefore, includes both an\noverview of the computational metrics used in the literature as well as a\nreview of the main technical approaches to reduce the bias. Furthermore, we\ncritically discuss todays literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16546v2","updated":"2024-07-02T12:23:37Z","published":"2024-05-26T12:30:20Z","title":"Cocktail: A Comprehensive Information Retrieval Benchmark with\n LLM-Generated Documents Integration","summary":" The proliferation of Large Language Models (LLMs) has led to an influx of\nAI-generated content (AIGC) on the internet, transforming the corpus of\nInformation Retrieval (IR) systems from solely human-written to a coexistence\nwith LLM-generated content. The impact of this surge in AIGC on IR systems\nremains an open question, with the primary challenge being the lack of a\ndedicated benchmark for researchers. In this paper, we introduce Cocktail, a\ncomprehensive benchmark tailored for evaluating IR models in this mixed-sourced\ndata landscape of the LLM era. Cocktail consists of 16 diverse datasets with\nmixed human-written and LLM-generated corpora across various text retrieval\ntasks and domains. Additionally, to avoid the potential bias from previously\nincluded dataset information in LLMs, we also introduce an up-to-date dataset,\nnamed NQ-UTD, with queries derived from recent events. Through conducting over\n1,000 experiments to assess state-of-the-art retrieval models against the\nbenchmarked datasets in Cocktail, we uncover a clear trade-off between ranking\nperformance and source bias in neural retrieval models, highlighting the\nnecessity for a balanced approach in designing future IR systems. We hope\nCocktail can serve as a foundational resource for IR research in the LLM era,\nwith all data and code publicly available at\n\\url{https://github.com/KID-22/Cocktail}.\n","authors":["Sunhao Dai","Weihao Liu","Yuqi Zhou","Liang Pang","Rongju Ruan","Gang Wang","Zhenhua Dong","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.16546v2.pdf","comment":"Accepted by Findings of ACL 2024; Datasets Link:\n https://huggingface.co/IR-Cocktail"},{"id":"http://arxiv.org/abs/2407.00063v2","updated":"2024-07-02T11:17:45Z","published":"2024-06-17T07:07:42Z","title":"An Interpretable Alternative to Neural Representation Learning for\n Rating Prediction -- Transparent Latent Class Modeling of User Reviews","summary":" Nowadays, neural network (NN) and deep learning (DL) techniques are widely\nadopted in many applications, including recommender systems. Given the sparse\nand stochastic nature of collaborative filtering (CF) data, recent works have\ncritically analyzed the effective improvement of neural-based approaches\ncompared to simpler and often transparent algorithms for recommendation.\nPrevious results showed that NN and DL models can be outperformed by\ntraditional algorithms in many tasks. Moreover, given the largely black-box\nnature of neural-based methods, interpretable results are not naturally\nobtained. 
Following on this debate, we first present a transparent\nprobabilistic model that topologically organizes user and product latent\nclasses based on the review information. In contrast to popular neural\ntechniques for representation learning, we readily obtain a statistical,\nvisualization-friendly tool that can be easily inspected to understand user and\nproduct characteristics from a textual-based perspective. Then, given the\nlimitations of common embedding techniques, we investigate the possibility of\nusing the estimated interpretable quantities as model input for a rating\nprediction task. To contribute to the recent debates, we evaluate our results\nin terms of both capacity for interpretability and predictive performances in\ncomparison with popular text-based neural approaches. The results demonstrate\nthat the proposed latent class representations can yield competitive predictive\nperformances, compared to popular, but difficult-to-interpret approaches.\n","authors":["Giuseppe Serra","Peter Tino","Zhao Xu","Xin Yao"],"pdf_url":"https://arxiv.org/pdf/2407.00063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02156v1","updated":"2024-07-02T10:54:23Z","published":"2024-07-02T10:54:23Z","title":"Towards Training Music Taggers on Synthetic Data","summary":" Most contemporary music tagging systems rely on large volumes of annotated\ndata. As an alternative, we investigate the extent to which synthetically\ngenerated music excerpts can improve tagging systems when only small annotated\ncollections are available. To this end, we release GTZAN-synth, a synthetic\ndataset that follows the taxonomy of the well-known GTZAN dataset while being\nten times larger in data volume. We first observe that simply adding this\nsynthetic dataset to the training split of GTZAN does not result into\nperformance improvements. We then proceed to investigating domain adaptation,\ntransfer learning and fine-tuning strategies for the task at hand and draw the\nconclusion that the last two options yield an increase in accuracy. Overall,\nthe proposed approach can be considered as a first guide in a promising field\nfor future research.\n","authors":["Nadine Kroher","Steven Manangu","Aggelos Pikrakis"],"pdf_url":"https://arxiv.org/pdf/2407.02156v1.pdf","comment":"6 pages, 3 figures, accepted to 21st International Conference on\n Content-based Multimedia Indexing (CBMI) 2024, code available\n https://github.com/NadineKroher/music-tagging-synthetic-data-cbmi-2024"},{"id":"http://arxiv.org/abs/2407.02104v1","updated":"2024-07-02T09:43:47Z","published":"2024-07-02T09:43:47Z","title":"Joint-Dataset Learning and Cross-Consistent Regularization for\n Text-to-Motion Retrieval","summary":" Pose-estimation methods enable extracting human motion from common videos in\nthe structured form of 3D skeleton sequences. Despite great application\nopportunities, effective content-based access to such spatio-temporal motion\ndata is a challenging problem. In this paper, we focus on the recently\nintroduced text-motion retrieval tasks, which aim to search for database\nmotions that are the most relevant to a specified natural-language textual\ndescription (text-to-motion) and vice-versa (motion-to-text). Despite recent\nefforts to explore these promising avenues, a primary challenge remains the\ninsufficient data available to train robust text-motion models effectively. 
To\naddress this issue, we propose to investigate joint-dataset learning - where we\ntrain on multiple text-motion datasets simultaneously - together with the\nintroduction of a Cross-Consistent Contrastive Loss function (CCCL), which\nregularizes the learned text-motion common space by imposing uni-modal\nconstraints that augment the representation ability of the trained network. To\nlearn a proper motion representation, we also introduce a transformer-based\nmotion encoder, called MoT++, which employs spatio-temporal attention to\nprocess sequences of skeleton data. We demonstrate the benefits of the proposed\napproaches on the widely-used KIT Motion-Language and HumanML3D datasets. We\nperform detailed experimentation on joint-dataset learning and cross-dataset\nscenarios, showing the effectiveness of each introduced module in a carefully\nconducted ablation study and, in turn, pointing out the limitations of\nstate-of-the-art methods.\n","authors":["Nicola Messina","Jan Sedmidubsky","Fabrizio Falchi","Tomáš Rebok"],"pdf_url":"https://arxiv.org/pdf/2407.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17745v2","updated":"2024-07-02T08:05:55Z","published":"2024-06-25T17:31:04Z","title":"Light-weight End-to-End Graph Interest Network for CTR Prediction in\n E-commerce Search","summary":" Click-through-rate (CTR) prediction has an essential impact on improving user\nexperience and revenue in e-commerce search. With the development of deep\nlearning, graph-based methods are well exploited to utilize graph structure\nextracted from user behaviors and other information to help embedding learning.\nHowever, most of the previous graph-based methods mainly focus on\nrecommendation scenarios, and therefore their graph structures highly depend on\nitem's sequential information from user behaviors, ignoring query's sequential\nsignal and query-item correlation. In this paper, we propose a new approach\nnamed Light-weight End-to-End Graph Interest Network (EGIN) to effectively mine\nusers' search interests and tackle previous challenges. (i) EGIN utilizes query\nand item's correlation and sequential information from the search system to\nbuild a heterogeneous graph for better CTR prediction in e-commerce search.\n(ii) EGIN's graph embedding learning shares the same training input and is\njointly trained with CTR prediction, making the end-to-end framework effortless\nto deploy in large-scale search systems. The proposed EGIN is composed of three\nparts: query-item heterogeneous graph, light-weight graph sampling, and\nmulti-interest network. The query-item heterogeneous graph captures correlation\nand sequential information of query and item efficiently by the proposed\nlight-weight graph sampling. The multi-interest network is well designed to\nutilize graph embedding to capture various similarity relationships between\nquery and item to enhance the final CTR prediction. We conduct extensive\nexperiments on both public and industrial datasets to demonstrate the\neffectiveness of the proposed EGIN. At the same time, the training cost of\ngraph learning is relatively low compared with the main CTR prediction task,\nensuring efficiency in practical applications.\n","authors":["Pai Peng","Yunqing Jia","Ziqiang Zhou","Shuang Hong","Zichong Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.17745v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.02028v1","updated":"2024-07-02T07:52:30Z","published":"2024-07-02T07:52:30Z","title":"Why does in-context learning fail sometimes? 
Evaluating in-context\n learning on open and closed questions","summary":" We measure the performance of in-context learning as a function of task\nnovelty and difficulty for open and closed questions. For that purpose, we\ncreated a novel benchmark consisting of hard scientific questions, each paired\nwith a context of various relevancy. We show that counter-intuitively, a\ncontext that is more aligned with the topic does not always help more than a\nless relevant context. This effect is especially visible for open questions and\nquestions of high difficulty or novelty. This result reveals a fundamental\ndifference between the treatment of close-form and open-form questions by\nlarge-language models and shows a need for a more robust evaluation of\nin-context learning on the variety of different types of questions. It also\nposes a new question of how to optimally select a context for large language\nmodels, especially in the context of Retrieval Augmented Generation (RAG)\nsystems. Our results suggest that the answer to this question can be highly\napplication-dependent and might be contingent on factors including the format\nof the question, the perceived difficulty level of the questions, and the\nnovelty or popularity of the information we seek.\n","authors":["Xiang Li","Haoran Tang","Siyu Chen","Ziwei Wang","Ryan Chen","Marcin Abram"],"pdf_url":"https://arxiv.org/pdf/2407.02028v1.pdf","comment":"8 pages plus references, 4 main figures, 6 pages of supplementary\n material"},{"id":"http://arxiv.org/abs/2407.01994v1","updated":"2024-07-02T07:07:59Z","published":"2024-07-02T07:07:59Z","title":"Simple Augmentations of Logical Rules for Neuro-Symbolic Knowledge Graph\n Completion","summary":" High-quality and high-coverage rule sets are imperative to the success of\nNeuro-Symbolic Knowledge Graph Completion (NS-KGC) models, because they form\nthe basis of all symbolic inferences. Recent literature builds neural models\nfor generating rule sets, however, preliminary experiments show that they\nstruggle with maintaining high coverage. In this work, we suggest three simple\naugmentations to existing rule sets: (1) transforming rules to their abductive\nforms, (2) generating equivalent rules that use inverse forms of constituent\nrelations and (3) random walks that propose new rules. Finally, we prune\npotentially low quality rules. Experiments over four datasets and five\nruleset-baseline settings suggest that these simple augmentations consistently\nimprove results, and obtain up to 7.1 pt MRR and 8.5 pt Hits@1 gains over using\nrules without augmentations.\n","authors":["Ananjan Nandi","Navdeep Kaur","Parag Singla"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2407.01994v1.pdf","comment":"12 pages, 15 tables Published in ACL 2023"},{"id":"http://arxiv.org/abs/2407.01972v1","updated":"2024-07-02T06:08:55Z","published":"2024-07-02T06:08:55Z","title":"MeMemo: On-device Retrieval Augmentation for Private and Personalized\n Text Generation","summary":" Retrieval-augmented text generation (RAG) addresses the common limitations of\nlarge language models (LLMs), such as hallucination, by retrieving information\nfrom an updatable external knowledge base. However, existing approaches often\nrequire dedicated backend servers for data storage and retrieval, thereby\nlimiting their applicability in use cases that require strict data privacy,\nsuch as personal finance, education, and medicine. 
To address the pressing need\nfor client-side dense retrieval, we introduce MeMemo, the first open-source\nJavaScript toolkit that adapts the state-of-the-art approximate nearest\nneighbor search technique HNSW to browser environments. Developed with modern\nand native Web technologies, such as IndexedDB and Web Workers, our toolkit\nleverages client-side hardware capabilities to enable researchers and\ndevelopers to efficiently search through millions of high-dimensional vectors\nin the browser. MeMemo enables exciting new design and research opportunities,\nsuch as private and personalized content creation and interactive prototyping,\nas demonstrated in our example application RAG Playground. Reflecting on our\nwork, we discuss the opportunities and challenges for on-device dense\nretrieval. MeMemo is available at https://github.com/poloclub/mememo.\n","authors":["Zijie J. Wang","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2407.01972v1.pdf","comment":"Accepted to SIGIR 2024. 6 pages, 2 figures. For a live demo, visit\n https://poloclub.github.io/mememo/. Code is open-source at\n https://github.com/poloclub/mememo"},{"id":"http://arxiv.org/abs/2407.01965v1","updated":"2024-07-02T05:50:16Z","published":"2024-07-02T05:50:16Z","title":"AdaCQR: Enhancing Query Reformulation for Conversational Search via\n Sparse and Dense Retrieval Alignment","summary":" Conversational Query Reformulation (CQR) has significantly advanced in\naddressing the challenges of conversational search, particularly those stemming\nfrom the latent user intent and the need for historical context. Recent works\naimed to boost the performance of CQR through alignment. However, they are\ndesigned for one specific retrieval system, which potentially results in poor\ngeneralization. To overcome this limitation, we present a novel framework,\nAdaCQR. By aligning reformulation models with both term-based and\nsemantic-based retrieval systems, AdaCQR enhances the generalizability of\ninformation-seeking queries across diverse retrieval environments through a\ndual-phase training strategy. We also developed two effective approaches for\nacquiring superior labels and diverse input candidates, boosting the efficiency\nand robustness of the framework. Experimental evaluations on the TopiOCQA and\nQReCC datasets demonstrate that AdaCQR significantly outperforms existing\nmethods, offering both quantitative and qualitative improvements in\nconversational query reformulation.\n","authors":["Yilong Lai","Jialong Wu","Congzhi Zhang","Haowen Sun","Deyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.01965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01896v1","updated":"2024-07-02T02:39:33Z","published":"2024-07-02T02:39:33Z","title":"LogEval: A Comprehensive Benchmark Suite for Large Language Models In\n Log Analysis","summary":" Log analysis is crucial for ensuring the orderly and stable operation of\ninformation systems, particularly in the field of Artificial Intelligence for\nIT Operations (AIOps). Large Language Models (LLMs) have demonstrated\nsignificant potential in natural language processing tasks. In the AIOps\ndomain, they excel in tasks such as anomaly detection, root cause analysis of\nfaults, operations and maintenance script generation, and alert information\nsummarization. However, the performance of current LLMs in log analysis tasks\nremains inadequately validated. 
To address this gap, we introduce LogEval, a\ncomprehensive benchmark suite designed to evaluate the capabilities of LLMs in\nvarious log analysis tasks for the first time. This benchmark covers tasks such\nas log parsing, log anomaly detection, log fault diagnosis, and log\nsummarization. LogEval evaluates each task using 4,000 publicly available log\ndata entries and employs 15 different prompts for each task to ensure a\nthorough and fair assessment. By rigorously evaluating leading LLMs, we\ndemonstrate the impact of various LLM technologies on log analysis performance,\nfocusing on aspects such as self-consistency and few-shot contextual learning.\nWe also discuss findings related to model quantification, Chinese-English\nquestion-answering evaluation, and prompt engineering. These findings provide\ninsights into the strengths and weaknesses of LLMs in multilingual environments\nand the effectiveness of different prompt strategies. Various evaluation\nmethods are employed for different tasks to accurately measure the performance\nof LLMs in log analysis, ensuring a comprehensive assessment. The insights\ngained from LogEvals evaluation reveal the strengths and limitations of LLMs in\nlog analysis tasks, providing valuable guidance for researchers and\npractitioners.\n","authors":["Tianyu Cui","Shiyu Ma","Ziang Chen","Tong Xiao","Shimin Tao","Yilun Liu","Shenglin Zhang","Duoming Lin","Changchang Liu","Yuzhe Cai","Weibin Meng","Yongqian Sun","Dan Pei"],"pdf_url":"https://arxiv.org/pdf/2407.01896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00077v2","updated":"2024-07-02T02:06:17Z","published":"2024-06-22T15:32:53Z","title":"Differentially Private Graph Diffusion with Applications in Personalized\n PageRanks","summary":" Graph diffusion, which iteratively propagates real-valued substances among\nthe graph, is used in numerous graph/network-involved applications. However,\nreleasing diffusion vectors may reveal sensitive linking information in the\ndata such as transaction information in financial network data. However,\nprotecting the privacy of graph data is challenging due to its interconnected\nnature. This work proposes a novel graph diffusion framework with edge-level\ndifferential privacy guarantees by using noisy diffusion iterates. The\nalgorithm injects Laplace noise per diffusion iteration and adopts a\ndegree-based thresholding function to mitigate the high sensitivity induced by\nlow-degree nodes. Our privacy loss analysis is based on Privacy Amplification\nby Iteration (PABI), which to our best knowledge, is the first effort that\nanalyzes PABI with Laplace noise and provides relevant applications. We also\nintroduce a novel Infinity-Wasserstein distance tracking method, which tightens\nthe analysis of privacy leakage and makes PABI more applicable in practice. We\nevaluate this framework by applying it to Personalized Pagerank computation for\nranking tasks. Experiments on real-world network data demonstrate the\nsuperiority of our method under stringent privacy conditions.\n","authors":["Rongzhe Wei","Eli Chien","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2407.00077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02542v1","updated":"2024-07-02T07:02:39Z","published":"2024-07-02T07:02:39Z","title":"ECAT: A Entire space Continual and Adaptive Transfer Learning Framework\n for Cross-Domain Recommendation","summary":" In industrial recommendation systems, there are several mini-apps designed to\nmeet the diverse interests and needs of users. 
The sample space of them is\nmerely a small subset of the entire space, making it challenging to train an\nefficient model. In recent years, there have been many excellent studies\nrelated to cross-domain recommendation aimed at mitigating the problem of data\nsparsity. However, few of them have simultaneously considered the adaptability\nof both sample and representation continual transfer setting to the target\ntask. To overcome the above issue, we propose an Entire space Continual and\nAdaptive Transfer learning framework called ECAT, which includes two core\ncomponents: First, as for sample transfer, we propose a two-stage method that\nrealizes a coarse-to-fine process. Specifically, we perform an initial\nselection through a graph-guided method, followed by a fine-grained selection\nusing a domain adaptation method. Second, we propose an adaptive knowledge\ndistillation method for continually transferring the representations from a\nmodel that is well-trained on the entire space dataset. ECAT enables full\nutilization of the entire space samples and representations under the\nsupervision of the target task, while avoiding negative migration.\nComprehensive experiments on real-world industrial datasets from Taobao show\nthat ECAT advances state-of-the-art performance on offline metrics, and brings\n+13.6% CVR and +8.6% orders for Baiyibutie, a famous mini-app of Taobao.\n","authors":["Chaoqun Hou","Yuanhang Zhou","Yi Cao","Tong Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02542v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.02490v1","updated":"2024-07-02T17:59:56Z","published":"2024-07-02T17:59:56Z","title":"MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via\n Dynamic Sparse Attention","summary":" The computational challenges of Large Language Model (LLM) inference remain a\nsignificant barrier to their widespread deployment, especially as prompt\nlengths continue to increase. Due to the quadratic complexity of the attention\ncomputation, it takes 30 minutes for an 8B LLM to process a prompt of 1M tokens\n(i.e., the pre-filling stage) on a single A100 GPU. Existing methods for\nspeeding up prefilling often fail to maintain acceptable accuracy or efficiency\nwhen applied to long-context LLMs. To address this gap, we introduce MInference\n(Million-tokens Inference), a sparse calculation method designed to accelerate\npre-filling of long-sequence processing. Specifically, we identify three unique\npatterns in long-context attention matrices (the A-shape, Vertical-Slash, and\nBlock-Sparse) that can be leveraged for efficient sparse computation on GPUs. We\ndetermine the optimal pattern for each attention head offline and dynamically\nbuild sparse indices based on the assigned pattern during inference. With the\npattern and sparse indices, we perform efficient sparse attention calculations\nvia our optimized GPU kernels to significantly reduce the latency in the\npre-filling stage of long-context LLMs. Our proposed technique can be directly\napplied to existing LLMs without any modifications to the pre-training setup or\nadditional fine-tuning. By evaluating on a wide range of downstream tasks,\nincluding InfiniteBench, RULER, PG-19, and Needle In A Haystack, and models\nincluding LLaMA-3-1M, GLM4-1M, Yi-200K, Phi-3-128K, and Qwen2-128K, we\ndemonstrate that MInference effectively reduces inference latency by up to 10x\nfor pre-filling on an A100, while maintaining accuracy. 
Our code is available\nat https://aka.ms/MInference.\n","authors":["Huiqiang Jiang","Yucheng Li","Chengruidong Zhang","Qianhui Wu","Xufang Luo","Surin Ahn","Zhenhua Han","Amir H. Abdi","Dongsheng Li","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2407.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02489v1","updated":"2024-07-02T17:59:50Z","published":"2024-07-02T17:59:50Z","title":"Magic Insert: Style-Aware Drag-and-Drop","summary":" We present Magic Insert, a method for dragging-and-dropping subjects from a\nuser-provided image into a target image of a different style in a physically\nplausible manner while matching the style of the target image. This work\nformalizes the problem of style-aware drag-and-drop and presents a method for\ntackling it by addressing two sub-problems: style-aware personalization and\nrealistic object insertion in stylized images. For style-aware personalization,\nour method first fine-tunes a pretrained text-to-image diffusion model using\nLoRA and learned text tokens on the subject image, and then infuses it with a\nCLIP representation of the target style. For object insertion, we use\nBootstrapped Domain Adaption to adapt a domain-specific photorealistic object\ninsertion model to the domain of diverse artistic styles. Overall, the method\nsignificantly outperforms traditional approaches such as inpainting. Finally,\nwe present a dataset, SubjectPlop, to facilitate evaluation and future progress\nin this area. Project page: https://magicinsert.github.io/\n","authors":["Nataniel Ruiz","Yuanzhen Li","Neal Wadhwa","Yael Pritch","Michael Rubinstein","David E. Jacobs","Shlomi Fruchter"],"pdf_url":"https://arxiv.org/pdf/2407.02489v1.pdf","comment":"Project page: https://magicinsert.github.io/"},{"id":"http://arxiv.org/abs/2407.02486v1","updated":"2024-07-02T17:59:29Z","published":"2024-07-02T17:59:29Z","title":"Neurocache: Efficient Vector Retrieval for Long-range Language Modeling","summary":" This paper introduces Neurocache, an approach to extend the effective context\nsize of large language models (LLMs) using an external vector cache to store\nits past states. Like recent vector retrieval approaches, Neurocache uses an\nefficient k-nearest-neighbor (kNN) algorithm to retrieve relevant past states\nand incorporate them into the attention process. Neurocache improves upon\nprevious methods by (1) storing compressed states, which reduces cache size;\n(2) performing a single retrieval operation per token which increases inference\nspeed; and (3) extending the retrieval window to neighboring states, which\nimproves both language modeling and downstream task accuracy. Our experiments\nshow the effectiveness of Neurocache both for models trained from scratch and\nfor pre-trained models such as Llama2-7B and Mistral-7B when enhanced with the\ncache mechanism. We also compare Neurocache with text retrieval methods and\nshow improvements in single-document question-answering and few-shot learning\ntasks. 
We made the source code available under:\nhttps://github.com/alisafaya/neurocache\n","authors":["Ali Safaya","Deniz Yuret"],"pdf_url":"https://arxiv.org/pdf/2407.02486v1.pdf","comment":"Long paper, published at the main conference NAACL'24"},{"id":"http://arxiv.org/abs/2407.02485v1","updated":"2024-07-02T17:59:17Z","published":"2024-07-02T17:59:17Z","title":"RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in\n LLMs","summary":" Large language models (LLMs) typically utilize the top-k contexts from a\nretriever in retrieval-augmented generation (RAG). In this work, we propose a\nnovel instruction fine-tuning framework RankRAG, which instruction-tunes a\nsingle LLM for the dual purpose of context ranking and answer generation in\nRAG. In particular, the instruction-tuned LLMs work surprisingly well by adding\na small fraction of ranking data into the training blend, and outperform\nexisting expert ranking models, including the same LLM exclusively fine-tuned\non a large amount of ranking data. For generation, we compare our model with\nmany strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and\nChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG\nbenchmarks. Specifically, our Llama3-RankRAG significantly outperforms\nLlama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In\naddition, it also performs comparably to GPT-4 on five RAG benchmarks in the\nbiomedical domain without instruction fine-tuning on biomedical data,\ndemonstrating its superb capability for generalization to new domains.\n","authors":["Yue Yu","Wei Ping","Zihan Liu","Boxin Wang","Jiaxuan You","Chao Zhang","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.02485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02476v1","updated":"2024-07-02T17:53:56Z","published":"2024-07-02T17:53:56Z","title":"Scalable Multi-Output Gaussian Processes with Stochastic Variational\n Inference","summary":" The Multi-Output Gaussian Process is a popular tool for modelling data\nfrom multiple sources. A typical choice to build a covariance function for a\nMOGP is the Linear Model of Coregionalization (LMC), which parametrically models\nthe covariance between outputs. The Latent Variable MOGP (LV-MOGP) generalises\nthis idea by modelling the covariance between outputs using a kernel applied to\nlatent variables, one per output, leading to a flexible MOGP model that allows\nefficient generalization to new outputs with few data points. Computational\ncomplexity in LV-MOGP grows linearly with the number of outputs, which makes it\nunsuitable for problems with a large number of outputs. In this paper, we\npropose a stochastic variational inference approach for the LV-MOGP that allows\nmini-batches for both inputs and outputs, making computational complexity per\ntraining iteration independent of the number of outputs.\n","authors":["Xiaoyu Jiang","Sokratia Georgaka","Magnus Rattray","Mauricio A. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2407.02476v1.pdf","comment":"none"},{"id":"http://arxiv.org/abs/2406.14794v2","updated":"2024-07-02T17:53:43Z","published":"2024-06-20T23:51:32Z","title":"ImageFlowNet: Forecasting Multiscale Trajectories of Disease Progression\n with Irregularly-Sampled Longitudinal Medical Images","summary":" The forecasting of disease progression from images is a holy grail for\nclinical decision making. 
However, this task is complicated by the inherent\nhigh dimensionality, temporal sparsity and sampling irregularity in\nlongitudinal image acquisitions. Existing methods often rely on extracting\nhand-crafted features and performing time-series analysis in this vector space,\nleading to a loss of rich spatial information within the images. To overcome\nthese challenges, we introduce ImageFlowNet, a novel framework that learns\nlatent-space flow fields that evolve multiscale representations in joint\nembedding spaces using neural ODEs and SDEs to model disease progression in the\nimage domain. Notably, ImageFlowNet learns multiscale joint representation\nspaces by combining cohorts of patients together so that information can be\ntransferred between the patient samples. The dynamics then provide plausible\ntrajectories of progression, with the SDE providing alternative trajectories\nfrom the same starting point. We provide theoretical insights that support our\nformulation of ODEs, and motivate our regularizations involving high-level\nvisual features, latent space organization, and trajectory smoothness. We then\ndemonstrate ImageFlowNet's effectiveness through empirical evaluations on three\nlongitudinal medical image datasets depicting progression in retinal geographic\natrophy, multiple sclerosis, and glioblastoma.\n","authors":["Chen Liu","Ke Xu","Liangbo L. Shen","Guillaume Huguet","Zilong Wang","Alexander Tong","Danilo Bzdok","Jay Stewart","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2406.14794v2.pdf","comment":"Included reference to codebase. Added acknowledgements"},{"id":"http://arxiv.org/abs/2407.02466v1","updated":"2024-07-02T17:47:03Z","published":"2024-07-02T17:47:03Z","title":"PWM: Policy Learning with Large World Models","summary":" Reinforcement Learning (RL) has achieved impressive results on complex tasks\nbut struggles in multi-task settings with different embodiments. World models\noffer scalability by learning a simulation of the environment, yet they often\nrely on inefficient gradient-free optimization methods. We introduce Policy\nlearning with large World Models (PWM), a novel model-based RL algorithm that\nlearns continuous control policies from large multi-task world models. By\npre-training the world model on offline data and using it for first-order\ngradient policy learning, PWM effectively solves tasks with up to 152 action\ndimensions and outperforms methods using ground-truth dynamics. Additionally,\nPWM scales to an 80-task setting, achieving up to 27% higher rewards than\nexisting baselines without the need for expensive online planning.\nVisualizations and code available at https://policy-world-model.github.io\n","authors":["Ignat Georgiev","Varun Giridhar","Nicklas Hansen","Animesh Garg"],"pdf_url":"https://arxiv.org/pdf/2407.02466v1.pdf","comment":"Visualizations and code available at\n https://policy-world-model.github.io"},{"id":"http://arxiv.org/abs/2407.02461v1","updated":"2024-07-02T17:40:06Z","published":"2024-07-02T17:40:06Z","title":"Decentralized Intelligence Network (DIN)","summary":" Decentralized Intelligence Network (DIN) addresses the significant challenges\nof data sovereignty and AI utilization caused by the fragmentation and siloing\nof data across providers and institutions. 
This comprehensive framework\novercomes access barriers to scalable data sources previously hindered by silos\nby leveraging: 1) personal data stores as a prerequisite for data sovereignty;\n2) a scalable federated learning protocol implemented on a public blockchain\nfor decentralized AI training, where data remains with participants and only\nmodel parameter updates are shared; and 3) a scalable, trustless rewards\nmechanism to incentivize participation and ensure fair reward distribution.\nThis framework ensures that no entity can prevent or control access to training\non data offered by participants or determine financial benefits, as these\nprocesses operate on a public blockchain with an immutable record and without a\nthird party. It supports effective AI training, allowing participants to\nmaintain control over their data, benefit financially, and contribute to a\ndecentralized, scalable ecosystem that leverages collective AI to develop\nbeneficial algorithms.\n","authors":["Abraham Nash"],"pdf_url":"https://arxiv.org/pdf/2407.02461v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.05137v2","updated":"2024-07-02T17:38:18Z","published":"2024-02-06T19:00:00Z","title":"LtU-ILI: An All-in-One Framework for Implicit Inference in Astrophysics\n and Cosmology","summary":" This paper presents the Learning the Universe Implicit Likelihood Inference\n(LtU-ILI) pipeline, a codebase for rapid, user-friendly, and cutting-edge\nmachine learning (ML) inference in astrophysics and cosmology. The pipeline\nincludes software for implementing various neural architectures, training\nschemata, priors, and density estimators in a manner easily adaptable to any\nresearch workflow. It includes comprehensive validation metrics to assess\nposterior estimate coverage, enhancing the reliability of inferred results.\nAdditionally, the pipeline is easily parallelizable and is designed for\nefficient exploration of modeling hyperparameters. To demonstrate its\ncapabilities, we present real applications across a range of astrophysics and\ncosmology problems, such as: estimating galaxy cluster masses from X-ray\nphotometry; inferring cosmology from matter power spectra and halo point\nclouds; characterizing progenitors in gravitational wave signals; capturing\nphysical dust parameters from galaxy colors and luminosities; and establishing\nproperties of semi-analytic models of galaxy formation. We also include\nexhaustive benchmarking and comparisons of all implemented methods as well as\ndiscussions about the challenges and pitfalls of ML inference in astronomical\nsciences. All code and examples are made publicly available at\nhttps://github.com/maho3/ltu-ili.\n","authors":["Matthew Ho","Deaglan J. Bartlett","Nicolas Chartier","Carolina Cuesta-Lazaro","Simon Ding","Axel Lapel","Pablo Lemos","Christopher C. Lovell","T. Lucas Makinen","Chirag Modi","Viraj Pandya","Shivam Pandey","Lucia A. Perez","Benjamin Wandelt","Greg L. 
Bryan"],"pdf_url":"https://arxiv.org/pdf/2402.05137v2.pdf","comment":"22 pages, 10 figures, accepted in the Open Journal of Astrophysics.\n Code available at https://github.com/maho3/ltu-ili"},{"id":"http://arxiv.org/abs/2402.17570v3","updated":"2024-07-02T17:25:19Z","published":"2024-02-27T15:08:57Z","title":"Sparse Variational Contaminated Noise Gaussian Process Regression with\n Applications in Geomagnetic Perturbations Forecasting","summary":" Gaussian Processes (GP) have become popular machine-learning methods for\nkernel-based learning on datasets with complicated covariance structures. In\nthis paper, we present a novel extension to the GP framework using a\ncontaminated normal likelihood function to better account for heteroscedastic\nvariance and outlier noise. We propose a scalable inference algorithm based on\nthe Sparse Variational Gaussian Process (SVGP) method for fitting sparse\nGaussian process regression models with contaminated normal noise on large\ndatasets. We examine an application to geomagnetic ground perturbations, where\nthe state-of-the-art prediction model is based on neural networks. We show that\nour approach yields shorter prediction intervals for similar coverage and\naccuracy when compared to an artificial dense neural network baseline.\n","authors":["Daniel Iong","Matthew McAnear","Yuezhou Qu","Shasha Zou","Gabor Toth","Yang Chen"],"pdf_url":"https://arxiv.org/pdf/2402.17570v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02447v1","updated":"2024-07-02T17:24:04Z","published":"2024-07-02T17:24:04Z","title":"PLeaS -- Merging Models with Permutations and Least Squares","summary":" The democratization of machine learning systems has made the process of\nfine-tuning accessible to a large number of practitioners, leading to a wide\nrange of open-source models fine-tuned on specialized tasks and datasets.\nRecent work has proposed to merge such models to combine their functionalities.\nHowever, prior approaches are restricted to models that are fine-tuned from the\nsame base model. Furthermore, the final merged model is typically restricted to\nbe of the same size as the original models. In this work, we propose a new\ntwo-step algorithm to merge models-termed PLeaS-which relaxes these\nconstraints. First, leveraging the Permutation symmetries inherent in the two\nmodels, PLeaS partially matches nodes in each layer by maximizing alignment.\nNext, PLeaS computes the weights of the merged model as a layer-wise Least\nSquares solution to minimize the approximation error between the features of\nthe merged model and the permuted features of the original models. into a\nsingle model of a desired size, even when the two original models are\nfine-tuned from different base models. We also present a variant of our method\nwhich can merge models without using data from the fine-tuning domains. 
We\ndemonstrate our method by merging ResNet models trained with shared and different\nlabel spaces, and show that we can perform better than the state-of-the-art\nmerging methods by 8 to 15 percentage points for the same target compute while\nmerging models trained on DomainNet and on fine-grained classification tasks.\n","authors":["Anshul Nasery","Jonathan Hayase","Pang Wei Koh","Sewoong Oh"],"pdf_url":"https://arxiv.org/pdf/2407.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02437v1","updated":"2024-07-02T17:15:12Z","published":"2024-07-02T17:15:12Z","title":"Parameter Matching Attack: Enhancing Practical Applicability of\n Availability Attacks","summary":" The widespread use of personal data for training machine learning models\nraises significant privacy concerns, as individuals have limited control over\nhow their public data is subsequently utilized. Availability attacks have\nemerged as a means for data owners to safeguard their data by designing\nimperceptible perturbations that degrade model performance when incorporated\ninto training datasets. However, existing availability attacks exhibit\nlimitations in practical applicability, particularly when only a portion of the\ndata can be perturbed. To address this challenge, we propose a novel\navailability attack approach termed Parameter Matching Attack (PMA). PMA is the\nfirst availability attack that works when only a portion of data can be\nperturbed. PMA optimizes perturbations so that when the model is trained on a\nmixture of clean and perturbed data, the resulting model will approach a model\ndesigned to perform poorly. Experimental results across four datasets\ndemonstrate that PMA outperforms existing methods, achieving significant model\nperformance degradation when a part of the training data is perturbed. Our code\nis available in the supplementary.\n","authors":["Yu Zhe","Jun Sakuma"],"pdf_url":"https://arxiv.org/pdf/2407.02437v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2407.02432v1","updated":"2024-07-02T17:09:24Z","published":"2024-07-02T17:09:24Z","title":"Evaluating the Robustness of Adverse Drug Event Classification Models\n Using Templates","summary":" An adverse drug effect (ADE) is any harmful event resulting from medical drug\ntreatment. Despite their importance, ADEs are often under-reported in official\nchannels. Some research has therefore turned to detecting discussions of ADEs\nin social media. Impressive results have been achieved in various attempts to\ndetect ADEs. In a high-stakes domain such as medicine, however, an in-depth\nevaluation of a model's abilities is crucial. We address the issue of thorough\nperformance evaluation in English-language ADE detection with hand-crafted\ntemplates for four capabilities: Temporal order, negation, sentiment, and\nbeneficial effect. We find that models with similar performance on held-out\ntest sets have varying results on these capabilities.\n","authors":["Dorothea MacPhail","David Harbecke","Lisa Raithel","Sebastian Möller"],"pdf_url":"https://arxiv.org/pdf/2407.02432v1.pdf","comment":"Accepted at BioNLP 2024 and Shared Tasks (ACL Workshop)"},{"id":"http://arxiv.org/abs/2407.02431v1","updated":"2024-07-02T17:08:38Z","published":"2024-07-02T17:08:38Z","title":"On the Robustness of Graph Reduction Against GNN Backdoor","summary":" Graph Neural Networks (GNNs) are gaining popularity across various domains\ndue to their effectiveness in learning graph-structured data. 
Nevertheless,\nthey have been shown to be susceptible to backdoor poisoning attacks, which\npose serious threats to real-world applications. Meanwhile, graph reduction\ntechniques, including coarsening and sparsification, which have long been\nemployed to improve the scalability of large graph computational tasks, have\nrecently emerged as effective methods for accelerating GNN training on\nlarge-scale graphs. However, the current development and deployment of graph\nreduction techniques for large graphs overlook the potential risks of data\npoisoning attacks against GNNs. It is not yet clear how graph reduction\ninteracts with existing backdoor attacks. This paper conducts a thorough\nexamination of the robustness of graph reduction methods in scalable GNN\ntraining in the presence of state-of-the-art backdoor attacks. We performed a\ncomprehensive robustness analysis across six coarsening methods and six\nsparsification methods for graph reduction, under three GNN backdoor attacks\nagainst three GNN architectures. Our findings indicate that the effectiveness\nof graph reduction methods in mitigating attack success rates varies\nsignificantly, with some methods even exacerbating the attacks. Through\ndetailed analyses of triggers and poisoned nodes, we interpret our findings and\nenhance our understanding of how graph reduction interacts with backdoor\nattacks. These results highlight the critical need for incorporating robustness\nconsiderations in graph reduction for GNN training, ensuring that enhancements\nin computational efficiency do not compromise the security of GNN systems.\n","authors":["Yuxuan Zhu","Michael Mandulak","Kerui Wu","George Slota","Yuseok Jeon","Ka-Ho Chow","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2407.02431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02430v1","updated":"2024-07-02T17:04:34Z","published":"2024-07-02T17:04:34Z","title":"Meta 3D TextureGen: Fast and Consistent Texture Generation for 3D\n Objects","summary":" The recent availability and adaptability of text-to-image models has sparked\na new era in many related domains that benefit from the learned text priors as\nwell as high-quality and fast generation capabilities, one of which is texture\ngeneration for 3D objects. Although recent texture generation methods achieve\nimpressive results by using text-to-image networks, the combination of global\nconsistency, quality, and speed, which is crucial for advancing texture\ngeneration to real-world applications, remains elusive. To that end, we\nintroduce Meta 3D TextureGen: a new feedforward method comprised of two\nsequential networks aimed at generating high-quality and globally consistent\ntextures for arbitrary geometries of any complexity degree in less than 20\nseconds. Our method achieves state-of-the-art results in quality and speed by\nconditioning a text-to-image model on 3D semantics in 2D space and fusing them\ninto a complete and high-resolution UV texture map, as demonstrated by\nextensive qualitative and quantitative evaluations. 
In addition, we introduce a\ntexture enhancement network that is capable of up-scaling any texture by an\narbitrary ratio, producing 4k pixel resolution textures.\n","authors":["Raphael Bensadoun","Yanir Kleiman","Idan Azuri","Omri Harosh","Andrea Vedaldi","Natalia Neverova","Oran Gafni"],"pdf_url":"https://arxiv.org/pdf/2407.02430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.03451v2","updated":"2024-07-02T17:02:59Z","published":"2020-07-07T13:57:59Z","title":"Analytics of Longitudinal System Monitoring Data for Performance\n Prediction","summary":" In recent years, several HPC facilities have started continuous monitoring of\ntheir systems and jobs to collect performance-related data for understanding\nperformance and operational efficiency. Such data can be used to optimize the\nperformance of individual jobs and the overall system by creating data-driven\nmodels that can predict the performance of jobs waiting in the scheduler queue.\nIn this paper, we model the performance of representative control jobs using\nlongitudinal system-wide monitoring data and machine learning to explore the\ncauses of performance variability. We analyze these prediction models in great\ndetail to identify the features that are dominant predictors of performance. We\ndemonstrate that such models can be application-agnostic and can be used for\npredicting performance of applications that are not included in training.\n","authors":["Ian J. Costello","Abhinav Bhatele"],"pdf_url":"https://arxiv.org/pdf/2007.03451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09062v3","updated":"2024-07-02T17:00:46Z","published":"2024-05-15T03:26:01Z","title":"Naturalistic Music Decoding from EEG Data via Latent Diffusion Models","summary":" In this article, we explore the potential of using latent diffusion models, a\nfamily of powerful generative models, for the task of reconstructing\nnaturalistic music from electroencephalogram (EEG) recordings. Unlike simpler\nmusic with limited timbres, such as MIDI-generated tunes or monophonic pieces,\nthe focus here is on intricate music featuring a diverse array of instruments,\nvoices, and effects, rich in harmonics and timbre. This study represents an\ninitial foray into achieving general music reconstruction of high-quality using\nnon-invasive EEG data, employing an end-to-end training approach directly on\nraw data without the need for manual pre-processing and channel selection. We\ntrain our models on the public NMED-T dataset and perform quantitative\nevaluation proposing neural embedding-based metrics. We additionally perform\nsong classification based on the generated tracks. Our work contributes to the\nongoing research in neural decoding and brain-computer interfaces, offering\ninsights into the feasibility of using EEG data for complex auditory\ninformation reconstruction.\n","authors":["Emilian Postolache","Natalia Polouliakh","Hiroaki Kitano","Akima Connelly","Emanuele Rodolà","Luca Cosmo","Taketo Akama"],"pdf_url":"https://arxiv.org/pdf/2405.09062v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10958v2","updated":"2024-07-02T16:57:06Z","published":"2023-10-17T03:11:30Z","title":"Enhancing Deep Neural Network Training Efficiency and Performance\n through Linear Prediction","summary":" Deep neural networks (DNN) have achieved remarkable success in various\nfields, including computer vision and natural language processing. However,\ntraining an effective DNN model still poses challenges. 
This paper aims to\npropose a method to optimize the training effectiveness of DNN, with the goal\nof improving model performance. Firstly, based on the observation that the DNN\nparameters change according to certain laws during the training process, the potential of\nparameter prediction for improving model training efficiency and performance is\ndiscovered. Secondly, considering the magnitude of DNN model parameters,\nhardware limitations and characteristics of Stochastic Gradient Descent (SGD)\nfor noise tolerance, a Parameter Linear Prediction (PLP) method is exploited to\nperform DNN parameter prediction. Finally, validations are carried out on some\nrepresentative backbones. Experimental results show that, compared to normal\ntraining under the same training conditions and epochs, employing the\nproposed PLP method allows the optimal model to obtain an average accuracy improvement of about 1%\nand a 0.01 top-1/top-5 error reduction for Vgg16, Resnet18\nand GoogLeNet on the CIFAR-100 dataset, which shows the effectiveness of the\nproposed method on different DNN structures and validates its capacity to\nenhance DNN training efficiency and performance.\n","authors":["Hejie Ying","Mengmeng Song","Yaohong Tang","Shungen Xiao","Zimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.10958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06659v3","updated":"2024-07-02T16:51:11Z","published":"2024-03-11T12:28:55Z","title":"Zero-Shot ECG Classification with Multimodal Learning and Test-time\n Clinical Knowledge Enhancement","summary":" Electrocardiograms (ECGs) are non-invasive diagnostic tools crucial for\ndetecting cardiac arrhythmic diseases in clinical practice. While ECG\nSelf-supervised Learning (eSSL) methods show promise in representation learning\nfrom unannotated ECG data, they often overlook the clinical knowledge that can\nbe found in reports. This oversight and the requirement for annotated samples\nfor downstream tasks limit eSSL's versatility. In this work, we address these\nissues with the Multimodal ECG Representation Learning (MERL) framework.\nThrough multimodal learning on ECG records and associated reports, MERL is\ncapable of performing zero-shot ECG classification with text prompts,\neliminating the need for training data in downstream tasks. 
At test time, we\npropose the Clinical Knowledge Enhanced Prompt Engineering (CKEPE) approach,\nwhich uses Large Language Models (LLMs) to exploit external expert-verified\nclinical knowledge databases, generating more descriptive prompts and reducing\nhallucinations in LLM-generated content to boost zero-shot classification.\nBased on MERL, we perform the first benchmark across six public ECG datasets,\nshowing the superior performance of MERL compared against eSSL methods.\nNotably, MERL achieves an average AUC score of 75.2% in zero-shot\nclassification (without training data), 3.2% higher than linear probed eSSL\nmethods with 10\\% annotated training data, averaged across all six datasets.\nCode and models are available at https://github.com/cheliu-computation/MERL\n","authors":["Che Liu","Zhongwei Wan","Cheng Ouyang","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2403.06659v3.pdf","comment":"Accepted by ICML2024"},{"id":"http://arxiv.org/abs/2407.02424v1","updated":"2024-07-02T16:50:27Z","published":"2024-07-02T16:50:27Z","title":"A Pattern Language for Machine Learning Tasks","summary":" Idealised as universal approximators, learners such as neural networks can be\nviewed as \"variable functions\" that may become one of a range of concrete\nfunctions after training. In the same way that equations constrain the possible\nvalues of variables in algebra, we may view objective functions as constraints\non the behaviour of learners. We extract the equivalences perfectly optimised\nobjective functions impose, calling them \"tasks\". For these tasks, we develop a\nformal graphical language that allows us to: (1) separate the core tasks of a\nbehaviour from its implementation details; (2) reason about and design\nbehaviours model-agnostically; and (3) simply describe and unify approaches in\nmachine learning across domains.\n As proof-of-concept, we design a novel task that enables converting\nclassifiers into generative models we call \"manipulators\", which we implement\nby directly translating task specifications into code. The resulting models\nexhibit capabilities such as style transfer and interpretable latent-space\nediting, without the need for custom architectures, adversarial training or\nrandom sampling. We formally relate the behaviour of manipulators to GANs, and\nempirically demonstrate their competitive performance with VAEs. We report on\nexperiments across vision and language domains aiming to characterise\nmanipulators as approximate Bayesian inversions of discriminative classifiers.\n","authors":["Benjamin Rodatz","Ian Fan","Tuomas Laakkonen","Neil John Ortega","Thomas Hoffman","Vincent Wang-Mascianica"],"pdf_url":"https://arxiv.org/pdf/2407.02424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02423v1","updated":"2024-07-02T16:50:26Z","published":"2024-07-02T16:50:26Z","title":"On the Anatomy of Attention","summary":" We introduce a category-theoretic diagrammatic formalism in order to\nsystematically relate and reason about machine learning models. Our diagrams\npresent architectures intuitively but without loss of essential detail, where\nnatural relationships between models are captured by graphical transformations,\nand important differences and similarities can be identified at a glance. In\nthis paper, we focus on attention mechanisms: translating folklore into\nmathematical derivations, and constructing a taxonomy of attention variants in\nthe literature. 
As a first example of an empirical investigation underpinned by\nour formalism, we identify recurring anatomical components of attention, which\nwe exhaustively recombine to explore a space of variations on the attention\nmechanism.\n","authors":["Nikhil Khatri","Tuomas Laakkonen","Jonathon Liu","Vincent Wang-Maścianica"],"pdf_url":"https://arxiv.org/pdf/2407.02423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02419v1","updated":"2024-07-02T16:44:14Z","published":"2024-07-02T16:44:14Z","title":"Quantum Curriculum Learning","summary":" Quantum machine learning (QML) requires significant quantum resources to\nachieve quantum advantage. Research should prioritize both the efficient design\nof quantum architectures and the development of learning strategies to optimize\nresource usage. We propose a framework called quantum curriculum learning\n(Q-CurL) for quantum data, where the curriculum introduces simpler tasks or\ndata to the learning model before progressing to more challenging ones. We\ndefine the curriculum criteria based on the data density ratio between tasks to\ndetermine the curriculum order. We also implement a dynamic learning schedule\nto emphasize the significance of quantum data in optimizing the loss function.\nEmpirical evidence shows that Q-CurL enhances the training convergence and the\ngeneralization for unitary learning tasks and improves the robustness of\nquantum phase recognition tasks. Our framework provides a general learning\nstrategy, bringing QML closer to realizing practical advantages.\n","authors":["Quoc Hoan Tran","Yasuhiro Endo","Hirotaka Oshima"],"pdf_url":"https://arxiv.org/pdf/2407.02419v1.pdf","comment":"main 5 pages, supplementary materials 6 pages"},{"id":"http://arxiv.org/abs/2406.17523v2","updated":"2024-07-02T16:33:26Z","published":"2024-06-25T13:06:09Z","title":"On the consistency of hyper-parameter selection in value-based deep\n reinforcement learning","summary":" Deep reinforcement learning (deep RL) has achieved tremendous success on\nvarious domains through a combination of algorithmic design and careful\nselection of hyper-parameters. Algorithmic improvements are often the result of\niterative enhancements built upon prior approaches, while hyper-parameter\nchoices are typically inherited from previous methods or fine-tuned\nspecifically for the proposed technique. Despite their crucial impact on\nperformance, hyper-parameter choices are frequently overshadowed by algorithmic\nadvancements. This paper conducts an extensive empirical study focusing on the\nreliability of hyper-parameter selection for value-based deep reinforcement\nlearning agents, including the introduction of a new score to quantify the\nconsistency and reliability of various hyper-parameters. Our findings not only\nhelp establish which hyper-parameters are most critical to tune, but also help\nclarify which tunings remain consistent across different training regimes.\n","authors":["Johan Obando-Ceron","João G. M. Araújo","Aaron Courville","Pablo Samuel Castro"],"pdf_url":"https://arxiv.org/pdf/2406.17523v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02408v1","updated":"2024-07-02T16:31:37Z","published":"2024-07-02T16:31:37Z","title":"CEB: Compositional Evaluation Benchmark for Fairness in Large Language\n Models","summary":" As Large Language Models (LLMs) are increasingly deployed to handle various\nnatural language processing (NLP) tasks, concerns regarding the potential\nnegative societal impacts of LLM-generated content have also arisen. 
To\nevaluate the biases exhibited by LLMs, researchers have recently proposed a\nvariety of datasets. However, existing bias evaluation efforts often focus on\nonly a particular type of bias and employ inconsistent evaluation metrics,\nleading to difficulties in comparison across different datasets and LLMs. To\naddress these limitations, we collect a variety of datasets designed for the\nbias evaluation of LLMs, and further propose CEB, a Compositional Evaluation\nBenchmark that covers different types of bias across different social groups\nand tasks. The curation of CEB is based on our newly proposed compositional\ntaxonomy, which characterizes each dataset from three dimensions: bias types,\nsocial groups, and tasks. By combining the three dimensions, we develop a\ncomprehensive evaluation strategy for the bias in LLMs. Our experiments\ndemonstrate that the levels of bias vary across these dimensions, thereby\nproviding guidance for the development of specific bias mitigation methods.\n","authors":["Song Wang","Peng Wang","Tong Zhou","Yushun Dong","Zhen Tan","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2407.02408v1.pdf","comment":"37 pages, 32 figures"},{"id":"http://arxiv.org/abs/2407.02405v1","updated":"2024-07-02T16:24:57Z","published":"2024-07-02T16:24:57Z","title":"Tiny-PULP-Dronets: Squeezing Neural Networks for Faster and Lighter\n Inference on Multi-Tasking Autonomous Nano-Drones","summary":" Pocket-sized autonomous nano-drones can revolutionize many robotic use cases,\nsuch as visual inspection in narrow, constrained spaces, and ensure safer\nhuman-robot interaction due to their tiny form factor and weight -- i.e., tens\nof grams. This compelling vision is challenged by the high level of\nintelligence needed aboard, which clashes against the limited computational and\nstorage resources available on PULP (parallel-ultra-low-power) MCU class\nnavigation and mission controllers that can be hosted aboard. This work moves\nfrom PULP-Dronet, a State-of-the-Art convolutional neural network for\nautonomous navigation on nano-drones. We introduce Tiny-PULP-Dronet: a novel\nmethodology to squeeze by more than one order of magnitude model size (50x\nfewer parameters), and number of operations (27x less multiply-and-accumulate)\nrequired to run inference with similar flight performance as PULP-Dronet. This\nmassive reduction paves the way towards affordable multi-tasking on\nnano-drones, a fundamental requirement for achieving high-level intelligence.\n","authors":["Lorenzo Lamberti","Vlad Niculescu","Michał Barcis","Lorenzo Bellone","Enrico Natalizio","Luca Benini","Daniele Palossi"],"pdf_url":"https://arxiv.org/pdf/2407.02405v1.pdf","comment":"3 Figures, 1 table. Accepted for publication at IEEE Artificial\n Intelligence Circuits and Systems (AICAS), 2022"},{"id":"http://arxiv.org/abs/2406.16976v2","updated":"2024-07-02T16:12:38Z","published":"2024-06-23T06:22:49Z","title":"Efficient Evolutionary Search Over Chemical Space with Large Language\n Models","summary":" Molecular discovery, when formulated as an optimization problem, presents\nsignificant computational challenges because optimization objectives can be\nnon-differentiable. Evolutionary Algorithms (EAs), often used to optimize\nblack-box objectives in molecular discovery, traverse chemical space by\nperforming random mutations and crossovers, leading to a large number of\nexpensive objective evaluations. In this work, we ameliorate this shortcoming\nby incorporating chemistry-aware Large Language Models (LLMs) into EAs. 
Namely,\nwe redesign crossover and mutation operations in EAs using LLMs trained on\nlarge corpora of chemical information. We perform extensive empirical studies\non both commercial and open-source models on multiple tasks involving property\noptimization, molecular rediscovery, and structure-based drug design,\ndemonstrating that the joint usage of LLMs with EAs yields superior performance\nover all baseline models across single- and multi-objective settings. We\ndemonstrate that our algorithm improves both the quality of the final solution\nand convergence speed, thereby reducing the number of required objective\nevaluations. Our code is available at http://github.com/zoom-wang112358/MOLLEO\n","authors":["Haorui Wang","Marta Skreta","Cher-Tian Ser","Wenhao Gao","Lingkai Kong","Felix Strieth-Kalthoff","Chenru Duan","Yuchen Zhuang","Yue Yu","Yanqiao Zhu","Yuanqi Du","Alán Aspuru-Guzik","Kirill Neklyudov","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.16976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02390v1","updated":"2024-07-02T16:04:16Z","published":"2024-07-02T16:04:16Z","title":"Uncertainty-Aware Decarbonization for Datacenters","summary":" This paper represents the first effort to quantify uncertainty in carbon\nintensity forecasting for datacenter decarbonization. We identify and analyze\ntwo types of uncertainty -- temporal and spatial -- and discuss their system\nimplications. To address the temporal dynamics in quantifying uncertainty for\ncarbon intensity forecasting, we introduce a conformal prediction-based\nframework. Evaluation results show that our technique robustly achieves target\ncoverages in uncertainty quantification across various significance levels. We\nconduct two case studies using production power traces, focusing on temporal\nand spatial load shifting respectively. The results show that incorporating\nuncertainty into scheduling decisions can prevent a 5% and 14% increase in\ncarbon emissions, respectively. These percentages translate to an absolute\nreduction of 2.1 and 10.4 tons of carbon emissions in a 20 MW datacenter\ncluster.\n","authors":["Amy Li","Sihang Liu","Yi Ding"],"pdf_url":"https://arxiv.org/pdf/2407.02390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14277v3","updated":"2024-07-02T16:02:39Z","published":"2023-09-25T16:40:56Z","title":"SINCERE: Supervised Information Noise-Contrastive Estimation REvisited","summary":" The information noise-contrastive estimation (InfoNCE) loss function provides\nthe basis of many self-supervised deep learning methods due to its strong\nempirical results and theoretic motivation. Previous work suggests a supervised\ncontrastive (SupCon) loss to extend InfoNCE to learn from available class\nlabels. This SupCon loss has been widely-used due to reports of good empirical\nperformance. However, in this work we find that the prior SupCon loss\nformulation has questionable justification because it can encourage some images\nfrom the same class to repel one another in the learned embedding space. This\nproblematic intra-class repulsion gets worse as the number of images sharing\none class label increases. We propose the Supervised InfoNCE REvisited\n(SINCERE) loss as a theoretically-justified supervised extension of InfoNCE\nthat eliminates intra-class repulsion. Experiments show that SINCERE leads to\nbetter separation of embeddings from different classes and improves transfer\nlearning classification accuracy. 
We additionally utilize probabilistic\nmodeling to derive an information-theoretic bound that relates SINCERE loss to\nthe symmetrized KL divergence between data-generating distributions for a\ntarget class and all other classes.\n","authors":["Patrick Feeney","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2309.14277v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02389v1","updated":"2024-07-02T16:02:25Z","published":"2024-07-02T16:02:25Z","title":"SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring\n Expression Segmentation","summary":" Referring Expression Segmentation (RES) aims to provide a segmentation mask\nof the target object in an image referred to by the text (i.e., referring\nexpression). Existing methods require large-scale mask annotations. Moreover,\nsuch approaches do not generalize well to unseen/zero-shot scenarios. To\naddress the aforementioned issues, we propose a weakly-supervised bootstrapping\narchitecture for RES with several new algorithmic innovations. To the best of\nour knowledge, ours is the first approach that considers only a fraction of\nboth mask and box annotations (shown in Figure 1 and Table 1) for training. To\nenable principled training of models in such low-annotation settings, improve\nimage-text region-level alignment, and further enhance spatial localization of\nthe target object in the image, we propose a Cross-modal Fusion with Attention\nConsistency module. For automatic pseudo-labeling of unlabeled samples, we\nintroduce a novel Mask Validity Filtering routine based on a spatially aware\nzero-shot proposal scoring approach. Extensive experiments show that with just\n30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to\n58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR\nrespectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also\noutperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a\nfully-supervised setting and demonstrates strong generalization capabilities in\nunseen/zero-shot tasks.\n","authors":["Sayan Nag","Koustava Goswami","Srikrishna Karanam"],"pdf_url":"https://arxiv.org/pdf/2407.02389v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2207.12067v3","updated":"2024-07-02T15:46:13Z","published":"2022-07-25T11:22:48Z","title":"Homomorphism Autoencoder -- Learning Group Structured Representations\n from Observed Transitions","summary":" How agents can learn internal models that veridically represent interactions\nwith the real world is a largely open question. As machine learning is moving\ntowards representations containing not just observational but also\ninterventional knowledge, we study this problem using tools from representation\nlearning and group theory. We propose methods enabling an agent acting upon the\nworld to learn internal representations of sensory information that are\nconsistent with actions that modify it. We use an autoencoder equipped with a\ngroup representation acting on its latent space, trained using an\nequivariance-derived loss in order to enforce a suitable homomorphism property\non the group representation. In contrast to existing work, our approach does\nnot require prior knowledge of the group and does not restrict the set of\nactions the agent can perform. We motivate our method theoretically, and show\nempirically that it can learn a group representation of the actions, thereby\ncapturing the structure of the set of transformations applied to the\nenvironment. 
We further show that this allows agents to predict the effect of\nsequences of future actions with improved accuracy.\n","authors":["Hamza Keurti","Hsiao-Ru Pan","Michel Besserve","Benjamin F. Grewe","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2207.12067v3.pdf","comment":"Accepted at ICML2023, Presented at the Symmetry and Geometry in\n Neural Representations Workshop (NeurReps) @ NeurIPS2022, 26 pages, 17\n figures"},{"id":"http://arxiv.org/abs/2407.01392v2","updated":"2024-07-02T15:39:29Z","published":"2024-07-01T15:43:25Z","title":"Diffusion Forcing: Next-token Prediction Meets Full-Sequence Diffusion","summary":" This paper presents Diffusion Forcing, a new training paradigm where a\ndiffusion model is trained to denoise a set of tokens with independent\nper-token noise levels. We apply Diffusion Forcing to sequence generative\nmodeling by training a causal next-token prediction model to generate one or\nseveral future tokens without fully diffusing past ones. Our approach is shown\nto combine the strengths of next-token prediction models, such as\nvariable-length generation, with the strengths of full-sequence diffusion\nmodels, such as the ability to guide sampling to desirable trajectories. Our\nmethod offers a range of additional capabilities, such as (1) rolling-out\nsequences of continuous tokens, such as video, with lengths past the training\nhorizon, where baselines diverge and (2) new sampling and guiding schemes that\nuniquely profit from Diffusion Forcing's variable-horizon and causal\narchitecture, and which lead to marked performance gains in decision-making and\nplanning tasks. In addition to its empirical success, our method is proven to\noptimize a variational lower bound on the likelihoods of all subsequences of\ntokens drawn from the true joint distribution. Project website:\nhttps://boyuan.space/diffusion-forcing/\n","authors":["Boyuan Chen","Diego Marti Monso","Yilun Du","Max Simchowitz","Russ Tedrake","Vincent Sitzmann"],"pdf_url":"https://arxiv.org/pdf/2407.01392v2.pdf","comment":"Project website: https://boyuan.space/diffusion-forcing/"},{"id":"http://arxiv.org/abs/2407.02369v1","updated":"2024-07-02T15:39:00Z","published":"2024-07-02T15:39:00Z","title":"Two-Step Q-Learning","summary":" Q-learning is a stochastic approximation version of the classic value\niteration. The literature has established that Q-learning suffers from both\nmaximization bias and slower convergence. Recently, multi-step algorithms have\nshown practical advantages over existing methods. This paper proposes novel\noff-policy two-step Q-learning algorithms, without importance sampling. Under\nsuitable assumptions, it is shown that the iterates in the proposed two-step\nQ-learning are bounded and converge almost surely to the optimal Q-values. This\nstudy also addresses the convergence analysis of the smooth version of two-step\nQ-learning, i.e., by replacing the max function with the log-sum-exp function. The\nproposed algorithms are robust and easy to implement. Finally, we test the\nproposed algorithms on benchmark problems such as the roulette problem, the\nmaximization bias problem, and randomly generated Markov decision processes, and\ncompare them with the existing methods available in the literature. 
Numerical\nexperiments demonstrate the superior performance of both the two-step\nQ-learning and its smooth variants.\n","authors":["Antony Vijesh","Shreyas S R"],"pdf_url":"https://arxiv.org/pdf/2407.02369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14325v2","updated":"2024-07-02T15:36:32Z","published":"2024-06-20T13:56:42Z","title":"Reproducibility in Machine Learning-based Research: Overview, Barriers\n and Drivers","summary":" Research in various fields is currently experiencing challenges regarding the\nreproducibility of results. This problem is also prevalent in machine learning\n(ML) research. The issue arises, for example, due to unpublished data and/or\nsource code and the sensitivity of ML training conditions. Although different\nsolutions have been proposed to address this issue, such as using ML platforms,\nthe level of reproducibility in ML-driven research remains unsatisfactory.\nTherefore, in this article, we discuss the reproducibility of ML-driven\nresearch with three main aims: (i) identifying the barriers to reproducibility\nwhen applying ML in research as well as categorizing the barriers into the different\ntypes of reproducibility (description, code, data, and experiment\nreproducibility), (ii) discussing potential drivers such as tools, practices,\nand interventions that support ML reproducibility, as well as distinguishing\nbetween technology-driven drivers, procedural drivers, and drivers related to\nawareness and education, and (iii) mapping the drivers to the barriers. With\nthis work, we hope to provide insights and to contribute to the decision-making\nprocess regarding the adoption of different solutions to support ML\nreproducibility.\n","authors":["Harald Semmelrock","Tony Ross-Hellauer","Simone Kopeinik","Dieter Theiler","Armin Haberl","Stefan Thalmann","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2406.14325v2.pdf","comment":"Pre-print of submission for the AI Magazine - comments to this\n pre-print are very welcome"},{"id":"http://arxiv.org/abs/2407.02362v1","updated":"2024-07-02T15:28:10Z","published":"2024-07-02T15:28:10Z","title":"Fast, Scalable, Energy-Efficient Non-element-wise Matrix Multiplication\n on FPGA","summary":" Modern Neural Network (NN) architectures heavily rely on vast numbers of\nmultiply-accumulate arithmetic operations, constituting the predominant\ncomputational cost. Therefore, this paper proposes a high-throughput, scalable\nand energy-efficient non-element-wise matrix multiplication unit on FPGAs as a\nbasic component of the NNs. We first streamline inter-layer and intra-layer\nredundancies of the MADDNESS algorithm, a LUT-based approximate matrix\nmultiplication, to design a fast, efficient, and scalable approximate matrix\nmultiplication module termed \"Approximate Multiplication Unit (AMU)\". The AMU\noptimizes LUT-based matrix multiplications further through dedicated memory\nmanagement and access design, decoupling computational overhead from input\nresolution and boosting FPGA-based NN accelerator efficiency significantly. The\nexperimental results show that using our AMU achieves up to 9x higher\nthroughput and 112x higher energy efficiency over the state-of-the-art\nsolutions for the FPGA-based Quantised Neural Network (QNN) accelerators.\n","authors":["Xuqi Zhu","Huaizhi Zhang","JunKyu Lee","Jiacheng Zhu","Chandrajit Pal","Sangeet Saha","Klaus D. 
McDonald-Maier","Xiaojun Zhai"],"pdf_url":"https://arxiv.org/pdf/2407.02362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02356v1","updated":"2024-07-02T15:21:11Z","published":"2024-07-02T15:21:11Z","title":"Enable the Right to be Forgotten with Federated Client Unlearning in\n Medical Imaging","summary":" The right to be forgotten, as stated in most data regulations, poses an\nunderexplored challenge in federated learning (FL), leading to the development\nof federated unlearning (FU). However, current FU approaches often face\ntrade-offs between efficiency, model performance, forgetting efficacy, and\nprivacy preservation. In this paper, we delve into the paradigm of Federated\nClient Unlearning (FCU) to guarantee a client the right to erase the\ncontribution or the influence, introducing the first FU framework in medical\nimaging. In the unlearning process of a client, the proposed model-contrastive\nunlearning marks a pioneering step towards feature-level unlearning, and\nfrequency-guided memory preservation ensures smooth forgetting of local\nknowledge while maintaining the generalizability of the trained global model,\nthus avoiding performance compromises and guaranteeing rapid post-training. We\nevaluated our FCU framework on two public medical image datasets, including\nIntracranial hemorrhage diagnosis and skin lesion diagnosis, demonstrating that\nour framework outperformed other state-of-the-art FU frameworks, with an\nexpected speed-up of 10-15 times compared with retraining from scratch. The\ncode and the organized datasets can be found at:\nhttps://github.com/dzp2095/FCU.\n","authors":["Zhipeng Deng","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01458v2","updated":"2024-07-02T15:17:50Z","published":"2024-07-01T16:53:00Z","title":"Contractual Reinforcement Learning: Pulling Arms with Invisible Hands","summary":" The agency problem emerges in today's large scale machine learning tasks,\nwhere the learners are unable to direct content creation or enforce data\ncollection. In this work, we propose a theoretical framework for aligning\neconomic interests of different stakeholders in the online learning problems\nthrough contract design. The problem, termed \\emph{contractual reinforcement\nlearning}, naturally arises from the classic model of Markov decision\nprocesses, where a learning principal seeks to optimally influence the agent's\naction policy for their common interests through a set of payment rules\ncontingent on the realization of next state. For the planning problem, we\ndesign an efficient dynamic programming algorithm to determine the optimal\ncontracts against the far-sighted agent. For the learning problem, we introduce\na generic design of no-regret learning algorithms to untangle the challenges\nfrom robust design of contracts to the balance of exploration and exploitation,\nreducing the complexity analysis to the construction of efficient search\nalgorithms. For several natural classes of problems, we design tailored search\nalgorithms that provably achieve $\\tilde{O}(\\sqrt{T})$ regret. 
We also present\nan algorithm with $\\tilde{O}(T^{2/3})$ for the general problem that improves\nthe existing analysis in online contract design with mild technical\nassumptions.\n","authors":["Jibang Wu","Siyu Chen","Mengdi Wang","Huazheng Wang","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2407.01458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02348v1","updated":"2024-07-02T15:14:12Z","published":"2024-07-02T15:14:12Z","title":"Revisiting Cascaded Ensembles for Efficient Inference","summary":" A common approach to make machine learning inference more efficient is to use\nexample-specific adaptive schemes, which route or select models for each\nexample at inference time. In this work we study a simple scheme for adaptive\ninference. We build a cascade of ensembles (CoE), beginning with\nresource-efficient models and growing to larger, more expressive models, where\nensemble agreement serves as a data-dependent routing criterion. This scheme is\neasy to incorporate into existing inference pipelines, requires no additional\ntraining, and can be used to place models across multiple resource tiers--for\ninstance, serving efficient models at the edge and invoking larger models in\nthe cloud only when necessary. In cases where parallel inference is feasible,\nwe show that CoE can improve accuracy relative to the single best model while\nreducing the average cost of inference by up to 7x, and provides\nPareto-dominate solutions in accuracy and efficiency relative to existing\nadaptive inference baselines. These savings translate to an over 3x-reduction\nin total monetary cost when performing inference using a heterogeneous cluster\nof GPUs. Finally, for edge inference scenarios where portions of the cascade\nreside at the edge vs. in the cloud, CoE can provide a 14x reduction in\ncommunication cost and inference latency without sacrificing accuracy.\n","authors":["Steven Kolawole","Don Dennis","Ameet Talwalkar","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2407.02348v1.pdf","comment":"ES-FOMO, ICML 2024"},{"id":"http://arxiv.org/abs/2407.02335v1","updated":"2024-07-02T15:05:19Z","published":"2024-07-02T15:05:19Z","title":"CALICO: Confident Active Learning with Integrated Calibration","summary":" The growing use of deep learning in safety-critical applications, such as\nmedical imaging, has raised concerns about limited labeled data, where this\ndemand is amplified as model complexity increases, posing hurdles for domain\nexperts to annotate data. In response to this, active learning (AL) is used to\nefficiently train models with limited annotation costs. In the context of deep\nneural networks (DNNs), AL often uses confidence or probability outputs as a\nscore for selecting the most informative samples. However, modern DNNs exhibit\nunreliable confidence outputs, making calibration essential. We propose an AL\nframework that self-calibrates the confidence used for sample selection during\nthe training process, referred to as Confident Active Learning with Integrated\nCalibratiOn (CALICO). CALICO incorporates the joint training of a classifier\nand an energy-based model, instead of the standard softmax-based classifier.\nThis approach allows for simultaneous estimation of the input data distribution\nand the class probabilities during training, improving calibration without\nneeding an additional labeled dataset. Experimental results showcase improved\nclassification performance compared to a softmax-based classifier with fewer\nlabeled samples. 
Furthermore, the calibration stability of the model is\nobserved to depend on the prior class distribution of the data.\n","authors":["Lorenzo S. Querol","Hajime Nagahara","Hideaki Hayashi"],"pdf_url":"https://arxiv.org/pdf/2407.02335v1.pdf","comment":"Accepted to ICANN2024"},{"id":"http://arxiv.org/abs/2407.02327v1","updated":"2024-07-02T14:56:47Z","published":"2024-07-02T14:56:47Z","title":"QSync: Quantization-Minimized Synchronous Distributed Training Across\n Hybrid Devices","summary":" A number of production deep learning clusters have attempted to explore\ninference hardware for DNN training, at the off-peak serving hours with many\ninference GPUs idling. Conducting DNN training with a combination of\nheterogeneous training and inference GPUs, known as hybrid device training,\npresents considerable challenges due to disparities in compute capability and\nsignificant differences in memory capacity. We propose QSync, a training system\nthat enables efficient synchronous data-parallel DNN training over hybrid\ndevices by strategically exploiting quantized operators. According to each\ndevice's available resource capacity, QSync selects a quantization-minimized\nsetting for operators in the distributed DNN training graph, minimizing model\naccuracy degradation but keeping the training efficiency brought by\nquantization. We carefully design a predictor with a bi-directional\nmixed-precision indicator to reflect the sensitivity of DNN layers on\nfixed-point and floating-point low-precision operators, a replayer with a\nneighborhood-aware cost mapper to accurately estimate the latency of\ndistributed hybrid mixed-precision training, and then an allocator that\nefficiently synchronizes workers with minimized model accuracy degradation.\nQSync bridges the computational graph on PyTorch to an optimized backend for\nquantization kernel performance and flexible support for various GPU\narchitectures. Extensive experiments show that QSync's predictor can accurately\nsimulate distributed mixed-precision training with <5% error, with a consistent\n0.27-1.03% accuracy improvement over the from-scratch training tasks compared\nto uniform precision.\n","authors":["Juntao Zhao","Borui Wan","Yanghua Peng","Haibin Lin","Yibo Zhu","Chuan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.02327v1.pdf","comment":"IPDPS 24"},{"id":"http://arxiv.org/abs/2407.02322v1","updated":"2024-07-02T14:52:21Z","published":"2024-07-02T14:52:21Z","title":"Stochastic Differential Equations models for Least-Squares Stochastic\n Gradient Descent","summary":" We study the dynamics of a continuous-time model of the Stochastic Gradient\nDescent (SGD) for the least-square problem. Indeed, pursuing the work of Li et\nal. (2019), we analyze Stochastic Differential Equations (SDEs) that model SGD\neither in the case of the training loss (finite samples) or the population one\n(online setting). A key qualitative feature of the dynamics is the existence of\na perfect interpolator of the data, irrespective of the sample size. In both\nscenarios, we provide precise, non-asymptotic rates of convergence to the\n(possibly degenerate) stationary distribution. 
Additionally, we describe this\nasymptotic distribution, offering estimates of its mean, deviations from it,\nand a proof of the emergence of heavy-tails related to the step-size magnitude.\nNumerical simulations supporting our findings are also presented.\n","authors":["Adrien Schertzer","Loucas Pillaud-Vivien"],"pdf_url":"https://arxiv.org/pdf/2407.02322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02309v1","updated":"2024-07-02T14:44:01Z","published":"2024-07-02T14:44:01Z","title":"Semantically Guided Representation Learning For Action Anticipation","summary":" Action anticipation is the task of forecasting future activity from a\npartially observed sequence of events. However, this task is exposed to\nintrinsic future uncertainty and the difficulty of reasoning upon\ninterconnected actions. Unlike previous works that focus on extrapolating\nbetter visual and temporal information, we concentrate on learning action\nrepresentations that are aware of their semantic interconnectivity based on\nprototypical action patterns and contextual co-occurrences. To this end, we\npropose the novel Semantically Guided Representation Learning (S-GEAR)\nframework. S-GEAR learns visual action prototypes and leverages language models\nto structure their relationship, inducing semanticity. To gather insights on\nS-GEAR's effectiveness, we test it on four action anticipation benchmarks,\nobtaining improved results compared to previous works: +3.5, +2.7, and +3.5\nabsolute points on Top-1 Accuracy on Epic-Kitchen 55, EGTEA Gaze+ and 50\nSalads, respectively, and +0.8 on Top-5 Recall on Epic-Kitchens 100. We further\nobserve that S-GEAR effectively transfers the geometric associations between\nactions from language to visual prototypes. Finally, S-GEAR opens new research\nfrontiers in anticipation tasks by demonstrating the intricate impact of action\nsemantic interconnectivity.\n","authors":["Anxhelo Diko","Danilo Avola","Bardh Prenkaj","Federico Fontana","Luigi Cinque"],"pdf_url":"https://arxiv.org/pdf/2407.02309v1.pdf","comment":"Accepted as a full paper at ECCV'24 with Paper ID #4140"},{"id":"http://arxiv.org/abs/2405.10938v2","updated":"2024-07-02T14:16:42Z","published":"2024-05-17T17:49:44Z","title":"Observational Scaling Laws and the Predictability of Language Model\n Performance","summary":" Understanding how language model performance varies with scale is critical to\nbenchmark and algorithm development. Scaling laws are one approach to building\nthis understanding, but the requirement of training models across many\ndifferent scales has limited their use. We propose an alternative,\nobservational approach that bypasses model training and instead builds scaling\nlaws from ~80 publically available models. Building a single scaling law from\nmultiple model families is challenging due to large variations in their\ntraining compute efficiencies and capabilities. However, we show that these\nvariations are consistent with a simple, generalized scaling law where language\nmodel performance is a function of a low-dimensional capability space, and\nmodel families only vary in their efficiency in converting training compute to\ncapabilities. 
Using this approach, we show the surprising predictability of\ncomplex scaling phenomena: we show that several emergent phenomena follow a\nsmooth, sigmoidal behavior and are predictable from small models; we show that\nthe agent performance of models such as GPT-4 can be precisely predicted from\nsimpler non-agentic benchmarks; and we show how to predict the impact of\npost-training interventions like Chain-of-Thought and Self-Consistency as\nlanguage model capabilities continue to improve.\n","authors":["Yangjun Ruan","Chris J. Maddison","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2405.10938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05512v3","updated":"2024-07-02T14:14:41Z","published":"2024-05-09T02:41:42Z","title":"Characteristic Learning for Provable One Step Generation","summary":" We propose the characteristic generator, a novel one-step generative model\nthat combines the efficiency of sampling in Generative Adversarial Networks\n(GANs) with the stable performance of flow-based models. Our model is driven by\ncharacteristics, along which the probability density transport can be described\nby ordinary differential equations (ODEs). Specifically, we estimate the\nvelocity field through nonparametric regression and utilize the Euler method to\nsolve the probability flow ODE, generating a series of discrete approximations\nto the characteristics. We then use a deep neural network to fit these\ncharacteristics, ensuring a one-step mapping that effectively pushes the prior\ndistribution towards the target distribution. On the theoretical side, we\nanalyze the errors in velocity matching, Euler discretization, and\ncharacteristic fitting to establish a non-asymptotic convergence rate for the\ncharacteristic generator in 2-Wasserstein distance. To the best of our\nknowledge, this is the first thorough analysis for simulation-free one step\ngenerative models. Additionally, our analysis refines the error analysis of\nflow-based generative models in prior works. We apply our method on both\nsynthetic and real datasets, and the results demonstrate that the\ncharacteristic generator achieves high generation quality with just a single\nevaluation of the neural network.\n","authors":["Zhao Ding","Chenguang Duan","Yuling Jiao","Ruoxuan Li","Jerry Zhijian Yang","Pingwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.05512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02279v1","updated":"2024-07-02T14:08:23Z","published":"2024-07-02T14:08:23Z","title":"How to Boost Any Loss Function","summary":" Boosting is a highly successful ML-born optimization setting in which one is\nrequired to computationally efficiently learn arbitrarily good models based on\nthe access to a weak learner oracle, providing classifiers performing at least\nslightly differently from random guessing. A key difference with gradient-based\noptimization is that boosting's original model does not require access to\nfirst order information about a loss, yet the decades-long history of boosting\nhas quickly evolved it into a first order optimization setting -- sometimes\neven wrongfully \textit{defining} it as such. 
Owing to recent progress\nextending gradient-based optimization to use only a loss' zeroth ($0^{th}$)\norder information to learn, this begs the question: what loss functions can be\nefficiently optimized with boosting and what is the information really needed\nfor boosting to meet the \textit{original} boosting blueprint's requirements?\n We provide a constructive formal answer essentially showing that \textit{any}\nloss function can be optimized with boosting and thus boosting can achieve a\nfeat not yet known to be possible in the classical $0^{th}$ order setting,\nsince loss functions are not required to be convex, nor differentiable or\nLipschitz -- and in fact not required to be continuous either. Some tools we\nuse are rooted in quantum calculus, the mathematical field -- not to be\nconfounded with quantum computation -- that studies calculus without passing to\nthe limit, and thus without using first order information.\n","authors":["Richard Nock","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2407.02279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02275v1","updated":"2024-07-02T14:05:10Z","published":"2024-07-02T14:05:10Z","title":"Learning Paradigms and Modelling Methodologies for Digital Twins in\n Process Industry","summary":" Central to the digital transformation of the process industry are Digital\nTwins (DTs), virtual replicas of physical manufacturing systems that combine\nsensor data with sophisticated data-based or physics-based models, or a\ncombination thereof, to tackle a variety of industrial-relevant tasks like\nprocess monitoring, predictive control or decision support. The backbone of a\nDT, i.e. the concrete modelling methodologies and architectural frameworks\nsupporting these models, are complex, diverse and evolve fast, necessitating a\nthorough understanding of the latest state-of-the-art methods and trends to\nstay on top of a highly competitive market. From a research perspective,\ndespite the high research interest in reviewing various aspects of DTs,\nstructured literature reports specifically focusing on unravelling the utilized\nlearning paradigms (e.g. self-supervised learning) for DT-creation in the\nprocess industry are a novel contribution in this field. This study aims to\naddress these gaps by (1) systematically analyzing the modelling methodologies\n(e.g. Convolutional Neural Network, Encoder-Decoder, Hidden Markov Model) and\nparadigms (e.g. data-driven, physics-based, hybrid) used for DT-creation; (2)\nassessing the utilized learning strategies (e.g. supervised, unsupervised,\nself-supervised); (3) analyzing the type of modelling task (e.g. regression,\nclassification, clustering); and (4) identifying the challenges and research\ngaps, as well as discussing the potential resolutions provided.\n","authors":["Michael Mayr","Georgios C. Chasparis","Josef Küng"],"pdf_url":"https://arxiv.org/pdf/2407.02275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02271v1","updated":"2024-07-02T13:59:09Z","published":"2024-07-02T13:59:09Z","title":"Improving Explainability of Softmax Classifiers Using a Prototype-Based\n Joint Embedding Method","summary":" We propose a prototype-based approach for improving explainability of softmax\nclassifiers that provides an understandable prediction confidence, generated\nthrough stochastic sampling of prototypes, and demonstrates potential for out\nof distribution detection (OOD). 
By modifying the model architecture and\ntraining to make predictions using similarities to any set of class examples\nfrom the training dataset, we acquire the ability to sample for prototypical\nexamples that contributed to the prediction, which provide an instance-based\nexplanation for the model's decision. Furthermore, by learning relationships\nbetween images from the training dataset through relative distances within the\nmodel's latent space, we obtain a metric for uncertainty that is better able to\ndetect out of distribution data than softmax confidence.\n","authors":["Hilarie Sit","Brendan Keith","Karianne Bergen"],"pdf_url":"https://arxiv.org/pdf/2407.02271v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.02269v1","updated":"2024-07-02T13:58:28Z","published":"2024-07-02T13:58:28Z","title":"IFTT-PIN: A Self-Calibrating PIN-Entry Method","summary":" Personalising an interface to the needs and preferences of a user often\nincurs additional interaction steps. In this paper, we demonstrate a novel\nmethod that enables the personalising of an interface without the need for\nexplicit calibration procedures, via a process we call self-calibration. A\nsecond-order effect of self-calibration is that an outside observer cannot\neasily infer what a user is trying to achieve because they cannot interpret the\nuser's actions. To explore this security angle, we developed IFTT-PIN (If This\nThen PIN) as the first self-calibrating PIN-entry method. When using IFTT-PIN,\nusers are free to choose any button for any meaning without ever explicitly\ncommunicating their choice to the machine. IFTT-PIN infers both the user's PIN\nand their preferred button mapping at the same time. This paper presents the\nconcept, implementation, and interactive demonstrations of IFTT-PIN, as well as\nan evaluation against shoulder surfing attacks. Our study (N=24) shows that by\nadding self-calibration to an existing PIN entry method, IFTT-PIN statistically\nsignificantly decreased PIN attack decoding rate by ca. 8.5 times (p=1.1e-9),\nwhile only decreasing the PIN entry encoding rate by ca. 1.4 times (p=0.02),\nleading to a positive security-usability trade-off. IFTT-PIN's entry rate\nsignificantly improved 21 days after first exposure (p=3.6e-6) to the method,\nsuggesting self-calibrating interfaces are memorable despite using an initially\nundefined user interface. Self-calibration methods might lead to novel\nopportunities for interaction that are more inclusive and versatile, a\npotentially interesting challenge for the community. A short introductory video\nis available at https://youtu.be/pP5sfniNRns.\n","authors":["Kathryn McConkey","Talha Enes Ayranci","Mohamed Khamis","Jonathan Grizou"],"pdf_url":"https://arxiv.org/pdf/2407.02269v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.09534"},{"id":"http://arxiv.org/abs/2407.02265v1","updated":"2024-07-02T13:41:59Z","published":"2024-07-02T13:41:59Z","title":"DrugCLIP: Contrastive Drug-Disease Interaction For Drug Repurposing","summary":" Bringing a novel drug from the original idea to market typically requires\nmore than ten years and billions of dollars. To alleviate the heavy burden, a\nnatural idea is to reuse the approved drug to treat new diseases. The process\nis also known as drug repurposing or drug repositioning. Machine learning\nmethods have exhibited huge potential in automating drug repurposing. However, it\nstill encounters some challenges, such as a lack of labels and multimodal feature\nrepresentation. 
To address these issues, we design DrugCLIP, a cutting-edge\ncontrastive learning method, to learn drug and disease's interaction without\nnegative labels. Additionally, we have curated a drug repurposing dataset based\non real-world clinical trial records. Thorough empirical studies are conducted\nto validate the effectiveness of the proposed DrugCLIP method.\n","authors":["Yingzhou Lu","Yaojun Hu","Chenhao Li"],"pdf_url":"https://arxiv.org/pdf/2407.02265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02263v1","updated":"2024-07-02T13:40:29Z","published":"2024-07-02T13:40:29Z","title":"FreeCG: Free the Design Space of Clebsch-Gordan Transform for machine\n learning force field","summary":" The Clebsch-Gordan Transform (CG transform) effectively encodes many-body\ninteractions. Many studies have proven its accuracy in depicting atomic\nenvironments, although this comes with high computational needs. The\ncomputational burden of this challenge is hard to reduce due to the need for\npermutation equivariance, which limits the design space of the CG transform\nlayer. We show that, implementing the CG transform layer on\npermutation-invariant inputs allows complete freedom in the design of this\nlayer without affecting symmetry. Developing further on this premise, our idea\nis to create a CG transform layer that operates on permutation-invariant\nabstract edges generated from real edge information. We bring in group CG\ntransform with sparse path, abstract edges shuffling, and attention enhancer to\nform a powerful and efficient CG transform layer. Our method, known as FreeCG,\nachieves State-of-The-Art (SoTA) results in force prediction for MD17, rMD17,\nMD22, and property prediction in QM9 datasets with notable enhancement. It\nintroduces a novel paradigm for carrying out efficient and expressive CG\ntransform in future geometric neural network designs.\n","authors":["Shihao Shao","Haoran Geng","Qinghua Cui"],"pdf_url":"https://arxiv.org/pdf/2407.02263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12484v3","updated":"2024-07-02T13:33:39Z","published":"2021-10-24T16:38:05Z","title":"Enabling Large Batch Size Training for DNN Models Beyond the Memory\n Limit While Maintaining Performance","summary":" Recent deep learning models are difficult to train using a large batch size,\nbecause commodity machines may not have enough memory to accommodate both the\nmodel and a large data batch size. The batch size is one of the\nhyper-parameters used in the training model, and it is dependent on and is\nlimited by the target machine memory capacity because the batch size can only\nfit into the remaining memory after the model is uploaded. Moreover, the data\nitem size is also an important factor because if each data item size is larger\nthen the batch size that can fit into the remaining memory becomes smaller.\nThis paper proposes a method called Micro-Batch Processing (MBP) to address\nthis problem. This method helps deep learning models to train by providing a\nbatch processing method that splits a batch into a size that can fit in the\nremaining memory and processes them sequentially. After processing the small\nbatches individually, a loss normalization algorithm based on the gradient\naccumulation is used to maintain the performance. 
The purpose of our method is\nto allow deep learning models to train using larger batch sizes that exceed the\nmemory capacity of a system without increasing the memory size or using\nmultiple devices (GPUs).\n","authors":["XinYu Piao","DoangJoo Synn","JooYoung Park","Jong-Kook Kim"],"pdf_url":"https://arxiv.org/pdf/2110.12484v3.pdf","comment":"Published in IEEE Access"},{"id":"http://arxiv.org/abs/2405.03961v2","updated":"2024-07-02T13:28:28Z","published":"2024-05-07T02:48:15Z","title":"Structure-based drug design by denoising voxel grids","summary":" We present VoxBind, a new score-based generative model for 3D molecules\nconditioned on protein structures. Our approach represents molecules as 3D\natomic density grids and leverages a 3D voxel-denoising network for learning\nand generation. We extend the neural empirical Bayes formalism (Saremi &\nHyvarinen, 2019) to the conditional setting and generate structure-conditioned\nmolecules with a two-step procedure: (i) sample noisy molecules from the\nGaussian-smoothed conditional distribution with underdamped Langevin MCMC using\nthe learned score function and (ii) estimate clean molecules from the noisy\nsamples with single-step denoising. Compared to the current state of the art,\nour model is simpler to train, significantly faster to sample from, and\nachieves better results on extensive in silico benchmarks -- the generated\nmolecules are more diverse, exhibit fewer steric clashes, and bind with higher\naffinity to protein pockets. The code is available at\nhttps://github.com/genentech/voxbind/.\n","authors":["Pedro O. Pinheiro","Arian Jamasb","Omar Mahmood","Vishnu Sresht","Saeed Saremi"],"pdf_url":"https://arxiv.org/pdf/2405.03961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02258v1","updated":"2024-07-02T13:26:16Z","published":"2024-07-02T13:26:16Z","title":"SiamTST: A Novel Representation Learning Framework for Enhanced\n Multivariate Time Series Forecasting applied to Telco Networks","summary":" We introduce SiamTST, a novel representation learning framework for\nmultivariate time series. SiamTST integrates a Siamese network with attention,\nchannel-independent patching, and normalization techniques to achieve superior\nperformance. Evaluated on a real-world industrial telecommunication dataset,\nSiamTST demonstrates significant improvements in forecasting accuracy over\nexisting methods. Notably, a simple linear network also shows competitive\nperformance, achieving the second-best results, just behind SiamTST. The code\nis available at https://github.com/simenkristoff/SiamTST.\n","authors":["Simen Kristoffersen","Peter Skaar Nordby","Sara Malacarne","Massimiliano Ruocco","Pablo Ortiz"],"pdf_url":"https://arxiv.org/pdf/2407.02258v1.pdf","comment":"14 pages, 3 figures, public codebase"},{"id":"http://arxiv.org/abs/2404.17701v3","updated":"2024-07-02T13:25:00Z","published":"2024-04-26T20:59:23Z","title":"Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning\n in Particle Detector Readout","summary":" Embedded field programmable gate array (eFPGA) technology allows the\nimplementation of reconfigurable logic within the design of an\napplication-specific integrated circuit (ASIC). This approach offers the low\npower and efficiency of an ASIC along with the ease of FPGA configuration,\nparticularly beneficial for the use case of machine learning in the data\npipeline of next-generation collider experiments. 
An open-source framework\ncalled \"FABulous\" was used to design eFPGAs using 130 nm and 28 nm CMOS\ntechnology nodes, which were subsequently fabricated and verified through\ntesting. The capability of an eFPGA to act as a front-end readout chip was\nassessed using simulation of high energy particles passing through a silicon\npixel sensor. A machine learning-based classifier, designed for reduction of\nsensor data at the source, was synthesized and configured onto the eFPGA. A\nsuccessful proof-of-concept was demonstrated through reproduction of the\nexpected algorithm result on the eFPGA with perfect accuracy. Further\ndevelopment of the eFPGA technology and its application to collider detector\nreadout is discussed.\n","authors":["Julia Gonski","Aseem Gupta","Haoyi Jia","Hyunjoon Kim","Lorenzo Rota","Larry Ruckman","Angelo Dragone","Ryan Herbst"],"pdf_url":"https://arxiv.org/pdf/2404.17701v3.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.02253v1","updated":"2024-07-02T13:18:15Z","published":"2024-07-02T13:18:15Z","title":"Parameter-Selective Continual Test-Time Adaptation","summary":" Continual Test-Time Adaptation (CTTA) aims to adapt a pretrained model to\never-changing environments during the test time under continuous domain shifts.\nMost existing CTTA approaches are based on the Mean Teacher (MT) structure,\nwhich contains a student and a teacher model, where the student is updated\nusing the pseudo-labels from the teacher model, and the teacher is then updated\nby exponential moving average strategy. However, these methods update the MT\nmodel indiscriminately on all parameters of the model. That is, some critical\nparameters involving sharing knowledge across different domains may be erased,\nintensifying error accumulation and catastrophic forgetting. In this paper, we\nintroduce Parameter-Selective Mean Teacher (PSMT) method, which is capable of\neffectively updating the critical parameters within the MT network under domain\nshifts. First, we introduce a selective distillation mechanism in the student\nmodel, which utilizes past knowledge to regularize novel knowledge, thereby\nmitigating the impact of error accumulation. Second, to avoid catastrophic\nforgetting, in the teacher model, we create a mask through Fisher information\nto selectively update parameters via exponential moving average, with\npreservation measures applied to crucial parameters. Extensive experimental\nresults verify that PSMT outperforms state-of-the-art methods across multiple\nbenchmark datasets. Our code is available at\n\\url{https://github.com/JiaxuTian/PSMT}.\n","authors":["Jiaxu Tian","Fan Lyu"],"pdf_url":"https://arxiv.org/pdf/2407.02253v1.pdf","comment":"17pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.13093v4","updated":"2024-07-02T13:05:59Z","published":"2023-03-23T08:17:10Z","title":"Type-II Saddles and Probabilistic Stability of Stochastic Gradient\n Descent","summary":" Characterizing and understanding the dynamics of stochastic gradient descent\n(SGD) around saddle points remains an open problem. We first show that saddle\npoints in neural networks can be divided into two types, among which the\nType-II saddles are especially difficult to escape from because the gradient\nnoise vanishes at the saddle. 
The dynamics of SGD around these saddles are thus\nto leading order described by a random matrix product process, and it is\nnatural to study the dynamics of SGD around these saddles using the notion of\nprobabilistic stability and the related Lyapunov exponent. Theoretically, we\nlink the study of SGD dynamics to well-known concepts in ergodic theory, which\nwe leverage to show that saddle points can be either attractive or repulsive\nfor SGD, and its dynamics can be classified into four different phases,\ndepending on the signal-to-noise ratio in the gradient close to the saddle.\n","authors":["Liu Ziyin","Botao Li","Tomer Galanti","Masahito Ueda"],"pdf_url":"https://arxiv.org/pdf/2303.13093v4.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.02240v1","updated":"2024-07-02T13:02:12Z","published":"2024-07-02T13:02:12Z","title":"MALT Powers Up Adversarial Attacks","summary":" Current adversarial attacks for multi-class classifiers choose the target\nclass for a given input naively, based on the classifier's confidence levels\nfor various target classes. We present a novel adversarial targeting method,\n\textit{MALT - Mesoscopic Almost Linearity Targeting}, based on medium-scale\nalmost linearity assumptions. Our attack wins over the current state of the art\nAutoAttack on the standard benchmark datasets CIFAR-100 and ImageNet and for a\nvariety of robust models. In particular, our attack is \emph{five times faster}\nthan AutoAttack, while successfully matching all of AutoAttack's successes and\nattacking additional samples that were previously out of reach. We then prove\nformally and demonstrate empirically that our targeting method, although\ninspired by linear predictors, also applies to standard non-linear models.\n","authors":["Odelia Melamed","Gilad Yehudai","Adi Shamir"],"pdf_url":"https://arxiv.org/pdf/2407.02240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02238v1","updated":"2024-07-02T13:00:19Z","published":"2024-07-02T13:00:19Z","title":"MIREncoder: Multi-modal IR-based Pretrained Embeddings for Performance\n Optimizations","summary":" One of the primary areas of interest in High Performance Computing is the\nimprovement of performance of parallel workloads. Nowadays, compilable source\ncode-based optimization tasks that employ deep learning often exploit LLVM\nIntermediate Representations (IRs) for extracting features from source code.\nMost such works target specific tasks, or are designed with a pre-defined set\nof heuristics. So far, pre-trained models are rare in this domain, but the\npossibilities have been widely discussed. Especially approaches mimicking\nlarge-language models (LLMs) have been proposed. But these have prohibitively\nlarge training costs. In this paper, we propose MIREncoder, a Multi-modal\nIR-based Auto-Encoder that can be pre-trained to generate a learned embedding\nspace to be used for downstream tasks by machine learning-based approaches. A\nmulti-modal approach enables us to better extract features from compilable\nprograms. It allows us to better model code syntax, semantics and structure.\nFor code-based performance optimizations, these features are very important\nwhile making optimization decisions. A pre-trained model/embedding implicitly\nenables the usage of transfer learning, and helps move away from task-specific\ntrained models. Additionally, a pre-trained model used for downstream\nperformance optimization should itself have reduced overhead, and be easily\nusable. 
These considerations have led us to propose a modeling approach that i)\nunderstands code semantics and structure, ii) enables use of transfer learning,\nand iii) is small and simple enough to be easily re-purposed or reused even\nwith low resource availability. Our evaluations will show that our proposed\napproach can outperform the state of the art while reducing overhead.\n","authors":["Akash Dutta","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2407.02238v1.pdf","comment":"12 pages, 6 figures, 9 tables, PACT '24 conference"},{"id":"http://arxiv.org/abs/2407.02233v1","updated":"2024-07-02T12:57:42Z","published":"2024-07-02T12:57:42Z","title":"Synthetic Multimodal Question Generation","summary":" Multimodal Retrieval Augmented Generation (MMRAG) is a powerful approach to\nquestion-answering over multimodal documents. A key challenge with evaluating\nMMRAG is the paucity of high-quality datasets matching the question styles and\nmodalities of interest. In light of this, we propose SMMQG, a synthetic data\ngeneration framework. SMMQG leverages interplay between a retriever, large\nlanguage model (LLM) and large multimodal model (LMM) to generate question and\nanswer pairs directly from multimodal documents, with the questions conforming\nto specified styles and modalities. We use SMMQG to generate an MMRAG dataset\nof 1024 questions over Wikipedia documents and evaluate state-of-the-art models\nusing it, revealing insights into model performance that are attainable only\nthrough style- and modality-specific evaluation data. Next, we measure the\nquality of data produced by SMMQG via a human study. We find that the quality\nof our synthetic data is on par with the quality of the crowdsourced benchmark\nMMQA and that downstream evaluation results using both datasets strongly\nconcur.\n","authors":["Ian Wu","Sravan Jayanthi","Vijay Viswanathan","Simon Rosenberg","Sina Pakazad","Tongshuang Wu","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2407.02233v1.pdf","comment":"Submitted to ARR June 2024"},{"id":"http://arxiv.org/abs/2407.02231v1","updated":"2024-07-02T12:56:17Z","published":"2024-07-02T12:56:17Z","title":"Safety-Driven Deep Reinforcement Learning Framework for Cobots: A\n Sim2Real Approach","summary":" This study presents a novel methodology incorporating safety constraints into\na robotic simulation during the training of deep reinforcement learning (DRL).\nThe framework integrates specific parts of the safety requirements, such as\nvelocity constraints, as specified by ISO 10218, directly within the DRL model\nthat becomes a part of the robot's learning algorithm. The study then evaluated\nthe efficiency of these safety constraints by subjecting the DRL model to\nvarious scenarios, including grasping tasks with and without obstacle\navoidance. The validation process involved comprehensive simulation-based\ntesting of the DRL model's responses to potential hazards and its compliance.\nAlso, the performance of the system is carried out by the functional safety\nstandards IEC 61508 to determine the safety integrity level. The study\nindicated a significant improvement in the safety performance of the robotic\nsystem. The proposed DRL model anticipates and mitigates hazards while\nmaintaining operational efficiency. This study was validated in a testbed with\na collaborative robotic arm with safety sensors and assessed with metrics such\nas the average number of safety violations, obstacle avoidance, and the number\nof successful grasps. 
The proposed approach outperforms the conventional method\nby a 16.5% average success rate on the tested scenarios in the simulations and\n2.5% in the testbed without safety violations. The project repository is\navailable at https://github.com/ammar-n-abbas/sim2real-ur-gym-gazebo.\n","authors":["Ammar N. Abbas","Shakra Mehak","Georgios C. Chasparis","John D. Kelleher","Michael Guilfoyle","Maria Chiara Leva","Aswin K Ramasubramanian"],"pdf_url":"https://arxiv.org/pdf/2407.02231v1.pdf","comment":"This paper has been accepted for publication in the proceedings of\n the IEEE/IFAC International Conference on Control, Decision, and Information\n Technologies (CoDIT), 2024"},{"id":"http://arxiv.org/abs/2405.17666v2","updated":"2024-07-02T12:55:33Z","published":"2024-05-27T21:40:31Z","title":"Structured Partial Stochasticity in Bayesian Neural Networks","summary":" Bayesian neural network posterior distributions have a great number of modes\nthat correspond to the same network function. The abundance of such modes can\nmake it difficult for approximate inference methods to do their job. Recent\nwork has demonstrated the benefits of partial stochasticity for approximate\ninference in Bayesian neural networks; inference can be less costly and\nperformance can sometimes be improved. I propose a structured way to select the\ndeterministic subset of weights that removes neuron permutation symmetries, and\ntherefore the corresponding redundant posterior modes. With a drastically\nsimplified posterior distribution, the performance of existing approximate\ninference schemes is found to be greatly improved.\n","authors":["Tommy Rochussen"],"pdf_url":"https://arxiv.org/pdf/2405.17666v2.pdf","comment":"Accepted at 6th Symposium on Advances in Approximate Bayesian\n Inference (non-archival track)"},{"id":"http://arxiv.org/abs/2407.00463v2","updated":"2024-07-02T12:53:10Z","published":"2024-06-29T15:20:11Z","title":"Open-Source Conversational AI with SpeechBrain 1.0","summary":" SpeechBrain is an open-source Conversational AI toolkit based on PyTorch,\nfocused particularly on speech processing tasks such as speech recognition,\nspeech enhancement, speaker recognition, text-to-speech, and much more. It\npromotes transparency and replicability by releasing both the pre-trained\nmodels and the complete \"recipes\" of code and algorithms required for training\nthem. This paper presents SpeechBrain 1.0, a significant milestone in the\nevolution of the toolkit, which now has over 200 recipes for speech, audio, and\nlanguage processing tasks, and more than 100 models available on Hugging Face.\nSpeechBrain 1.0 introduces new technologies to support diverse learning\nmodalities, Large Language Model (LLM) integration, and advanced decoding\nstrategies, along with novel models, tasks, and modalities. 
It also includes a\nnew benchmark repository, offering researchers a unified platform for\nevaluating models across diverse tasks\n","authors":["Mirco Ravanelli","Titouan Parcollet","Adel Moumen","Sylvain de Langen","Cem Subakan","Peter Plantinga","Yingzhi Wang","Pooneh Mousavi","Luca Della Libera","Artem Ploujnikov","Francesco Paissan","Davide Borra","Salah Zaiem","Zeyu Zhao","Shucong Zhang","Georgios Karakasidis","Sung-Lin Yeh","Aku Rouhe","Rudolf Braun","Florian Mai","Juan Zuluaga-Gomez","Seyed Mahed Mousavi","Andreas Nautsch","Xuechen Liu","Sangeet Sagar","Jarod Duret","Salima Mdhaffar","Gaelle Laperriere","Renato De Mori","Yannick Esteve"],"pdf_url":"https://arxiv.org/pdf/2407.00463v2.pdf","comment":"Submitted to JMLR (Machine Learning Open Source Software)"},{"id":"http://arxiv.org/abs/2308.11038v2","updated":"2024-07-02T12:52:11Z","published":"2023-08-18T10:28:07Z","title":"Logistics Hub Location Optimization: A K-Means and P-Median Model Hybrid\n Approach Using Road Network Distances","summary":" Logistic hubs play a pivotal role in the last-mile delivery distance; even a\nslight increment in distance negatively impacts the business of the e-commerce\nindustry while also increasing its carbon footprint. The growth of this\nindustry, particularly after Covid-19, has further intensified the need for\noptimized allocation of resources in an urban environment. In this study, we\nuse a hybrid approach to optimize the placement of logistic hubs. The approach\nsequentially employs different techniques. Initially, delivery points are\nclustered using K-Means in relation to their spatial locations. The clustering\nmethod utilizes road network distances as opposed to Euclidean distances.\nNon-road network-based approaches have been avoided since they lead to\nerroneous and misleading results. Finally, hubs are located using the P-Median\nmethod. The P-Median method also incorporates the number of deliveries and\npopulation as weights. Real-world delivery data from Muller and Phipps (M&P) is\nused to demonstrate the effectiveness of the approach. Serving deliveries from\nthe optimal hub locations results in the saving of 815 (10%) meters per\ndelivery.\n","authors":["Muhammad Abdul Rahman","Muhammad Aamir Basheer","Zubair Khalid","Muhammad Tahir","Momin Uppal"],"pdf_url":"https://arxiv.org/pdf/2308.11038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04303v2","updated":"2024-07-02T12:39:46Z","published":"2024-06-06T17:49:21Z","title":"Vision-LSTM: xLSTM as Generic Vision Backbone","summary":" Transformers are widely used as generic backbones in computer vision, despite\ninitially introduced for natural language processing. Recently, the Long\nShort-Term Memory (LSTM) has been extended to a scalable and performant\narchitecture - the xLSTM - which overcomes long-standing LSTM limitations via\nexponential gating and parallelizable matrix memory structure. In this report,\nwe introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to\ncomputer vision. ViL comprises a stack of xLSTM blocks where odd blocks process\nthe sequence of patch tokens from top to bottom while even blocks go from\nbottom to top. 
Experiments show that ViL holds promise to be further deployed\nas a new generic backbone for computer vision architectures.\n","authors":["Benedikt Alkin","Maximilian Beck","Korbinian Pöppel","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2406.04303v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01118v3","updated":"2024-07-02T12:39:41Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations today's\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to the limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. In this paper, we discuss the potential reasons for\npopularity bias and review existing approaches to detect, quantify and mitigate\npopularity bias in recommender systems. Our survey, therefore, includes both an\noverview of the computational metrics used in the literature as well as a\nreview of the main technical approaches to reduce the bias. Furthermore, we\ncritically discuss today's literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02217v1","updated":"2024-07-02T12:32:57Z","published":"2024-07-02T12:32:57Z","title":"Physics-Informed Model and Hybrid Planning for Efficient Dyna-Style\n Reinforcement Learning","summary":" Applying reinforcement learning (RL) to real-world applications requires\naddressing a trade-off between asymptotic performance, sample efficiency, and\ninference time. In this work, we demonstrate how to address this triple\nchallenge by leveraging partial physical knowledge about the system dynamics.\nOur approach involves learning a physics-informed model to boost sample\nefficiency and generating imaginary trajectories from this model to learn a\nmodel-free policy and Q-function. Furthermore, we propose a hybrid planning\nstrategy, combining the learned policy and Q-function with the learned model to\nenhance time efficiency in planning. Through practical demonstrations, we\nillustrate that our method improves the compromise between sample efficiency,\ntime efficiency, and performance over state-of-the-art methods.\n","authors":["Zakariae El Asri","Olivier Sigaud","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2407.02217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02211v1","updated":"2024-07-02T12:21:14Z","published":"2024-07-02T12:21:14Z","title":"PromptIntern: Saving Inference Costs by Internalizing Recurrent Prompt\n during Large Language Model Fine-tuning","summary":" Large language models (LLMs) have played a fundamental role in various\nnatural language processing tasks with powerful prompt techniques. 
However, in\nreal-world applications, there are often similar prompt components for repeated\nqueries, which causes significant computational burdens during inference.\nExisting prompt compression and direct fine-tuning methods aim to tackle these\nchallenges, yet they frequently struggle to strike an optimal balance between\ncost-efficiency and performance effectiveness, especially in complex tasks such\nas NL2Code. In this paper, we propose a novel method namely PromptIntern to\ninternalize the prompt knowledge into model parameters via progressive\nfine-tuning. Our method enables LLMs to emulate the human learning process for\na new task, where detailed templates and examples in a prompt are gradually\ninternalized and phased out progressively as the model grows accustomed to the\ntask. Extensive experiments demonstrate that our method reduces inference\ntokens over 90%, speedups inference by 4.2 times, and saves 88.3% monetary\ncost.\n","authors":["Jiaru Zou","Mengyu Zhou","Tao Li","Shi Han","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15897v2","updated":"2024-07-02T12:13:14Z","published":"2024-06-22T17:19:51Z","title":"Fusing Audio and Metadata Embeddings Improves Language-based Audio\n Retrieval","summary":" Matching raw audio signals with textual descriptions requires understanding\nthe audio's content and the description's semantics and then drawing\nconnections between the two modalities. This paper investigates a hybrid\nretrieval system that utilizes audio metadata as an additional clue to\nunderstand the content of audio signals before matching them with textual\nqueries. We experimented with metadata often attached to audio recordings, such\nas keywords and natural-language descriptions, and we investigated late and\nmid-level fusion strategies to merge audio and metadata. Our hybrid approach\nwith keyword metadata and late fusion improved the retrieval performance over a\ncontent-based baseline by 2.36 and 3.69 pp. mAP@10 on the ClothoV2 and\nAudioCaps benchmarks, respectively.\n","authors":["Paul Primus","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2406.15897v2.pdf","comment":"In Proceedings of the 32nd European Signal Processing Conference,\n EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2305.13865v3","updated":"2024-07-02T12:05:36Z","published":"2023-05-23T09:36:58Z","title":"Selective Pre-training for Private Fine-tuning","summary":" Text prediction models, when used in applications like email clients or word\nprocessors, must protect user data privacy and adhere to model size\nconstraints. These constraints are crucial to meet memory and inference time\nrequirements, as well as to reduce inference costs. Building small, fast, and\nprivate domain-specific language models is a thriving area of research. In this\nwork, we show that a careful pre-training on a \\emph{subset} of the public\ndataset that is guided by the private dataset is crucial to train small\nlanguage models with differential privacy. On standard benchmarks, small models\ntrained with our new framework achieve state-of-the-art performance. In\naddition to performance improvements, our results demonstrate that smaller\nmodels, through careful pre-training and private fine-tuning, can match the\nperformance of much larger models that do not have access to private data. 
This\nunderscores the potential of private learning for model compression and\nenhanced efficiency.\n","authors":["Da Yu","Sivakanth Gopi","Janardhan Kulkarni","Zinan Lin","Saurabh Naik","Tomasz Lukasz Religa","Jian Yin","Huishuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13865v3.pdf","comment":"Transactions on Machine Learning Research. Code available at\n https://github.com/dayu11/selective_pretraining_for_private_finetuning"},{"id":"http://arxiv.org/abs/2407.02191v1","updated":"2024-07-02T11:49:59Z","published":"2024-07-02T11:49:59Z","title":"Attack-Aware Noise Calibration for Differential Privacy","summary":" Differential privacy (DP) is a widely used approach for mitigating privacy\nrisks when training machine learning models on sensitive data. DP mechanisms\nadd noise during training to limit the risk of information leakage. The scale\nof the added noise is critical, as it determines the trade-off between privacy\nand utility. The standard practice is to select the noise scale in terms of a\nprivacy budget parameter $\\epsilon$. This parameter is in turn interpreted in\nterms of operational attack risk, such as accuracy, or sensitivity and\nspecificity of inference attacks against the privacy of the data. We\ndemonstrate that this two-step procedure of first calibrating the noise scale\nto a privacy budget $\\epsilon$, and then translating $\\epsilon$ to attack risk\nleads to overly conservative risk assessments and unnecessarily low utility. We\npropose methods to directly calibrate the noise scale to a desired attack risk\nlevel, bypassing the intermediate step of choosing $\\epsilon$. For a target\nattack risk, our approach significantly decreases noise scale, leading to\nincreased utility at the same level of privacy. We empirically demonstrate that\ncalibrating noise to attack sensitivity/specificity, rather than $\\epsilon$,\nwhen training privacy-preserving ML models substantially improves model\naccuracy for the same risk level. Our work provides a principled and practical\nway to improve the utility of privacy-preserving ML without compromising on\nprivacy.\n","authors":["Bogdan Kulynych","Juan Felipe Gomez","Georgios Kaissis","Flavio du Pin Calmon","Carmela Troncoso"],"pdf_url":"https://arxiv.org/pdf/2407.02191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02188v1","updated":"2024-07-02T11:46:07Z","published":"2024-07-02T11:46:07Z","title":"Structure-Aware Consensus Network on Graphs with Few Labeled Nodes","summary":" Graph node classification with few labeled nodes presents significant\nchallenges due to limited supervision. Conventional methods often exploit the\ngraph in a transductive learning manner. They fail to effectively utilize the\nabundant unlabeled data and the structural information inherent in graphs. To\naddress these issues, we introduce a Structure-Aware Consensus Network (SACN)\nfrom three perspectives. Firstly, SACN leverages a novel structure-aware\nconsensus learning strategy between two strongly augmented views. The proposed\nstrategy can fully exploit the potentially useful information of the unlabeled\nnodes and the structural information of the entire graph. Secondly, SACN\nuniquely integrates the graph's structural information to achieve\nstrong-to-strong consensus learning, improving the utilization of unlabeled\ndata while maintaining multiview learning. Thirdly, unlike two-branch graph\nneural network-based methods, SACN is designed for multiview feature learning\nwithin a single-branch architecture. 
Furthermore, a class-aware pseudolabel\nselection strategy helps address class imbalance and achieve effective\nweak-to-strong supervision. Extensive experiments on three benchmark datasets\ndemonstrate SACN's superior performance in node classification tasks,\nparticularly at very low label rates, outperforming state-of-the-art methods\nwhile maintaining computational simplicity.The source code is available at\nhttps://github.com/kunzhan/SACN\n","authors":["Shuaike Xu","Xiaolin Zhang","Peng Zhang","Kun Zhan"],"pdf_url":"https://arxiv.org/pdf/2407.02188v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2406.14953v2","updated":"2024-07-02T11:22:36Z","published":"2024-06-21T08:04:12Z","title":"Deep Imbalanced Regression to Estimate Vascular Age from PPG Data: a\n Novel Digital Biomarker for Cardiovascular Health","summary":" Photoplethysmography (PPG) is emerging as a crucial tool for monitoring human\nhemodynamics, with recent studies highlighting its potential in assessing\nvascular aging through deep learning. However, real-world age distributions are\noften imbalanced, posing significant challenges for deep learning models. In\nthis paper, we introduce a novel, simple, and effective loss function named the\nDist Loss to address deep imbalanced regression tasks. We trained a\none-dimensional convolutional neural network (Net1D) incorporating the Dist\nLoss on the extensive UK Biobank dataset (n=502,389) to estimate vascular age\nfrom PPG signals and validate its efficacy in characterizing cardiovascular\nhealth. The model's performance was validated on a 40% held-out test set,\nachieving state-of-the-art results, especially in regions with small sample\nsizes. Furthermore, we divided the population into three subgroups based on the\ndifference between predicted vascular age and chronological age: less than -10\nyears, between -10 and 10 years, and greater than 10 years. We analyzed the\nrelationship between predicted vascular age and several cardiovascular events\nover a follow-up period of up to 10 years, including death, coronary heart\ndisease, and heart failure. Our results indicate that the predicted vascular\nage has significant potential to reflect an individual's cardiovascular health\nstatus. Our code will be available at https://github.com/Ngk03/AI-vascular-age.\n","authors":["Guangkun Nie","Qinghao Zhao","Gongzheng Tang","Jun Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2406.14953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00063v2","updated":"2024-07-02T11:17:45Z","published":"2024-06-17T07:07:42Z","title":"An Interpretable Alternative to Neural Representation Learning for\n Rating Prediction -- Transparent Latent Class Modeling of User Reviews","summary":" Nowadays, neural network (NN) and deep learning (DL) techniques are widely\nadopted in many applications, including recommender systems. Given the sparse\nand stochastic nature of collaborative filtering (CF) data, recent works have\ncritically analyzed the effective improvement of neural-based approaches\ncompared to simpler and often transparent algorithms for recommendation.\nPrevious results showed that NN and DL models can be outperformed by\ntraditional algorithms in many tasks. Moreover, given the largely black-box\nnature of neural-based methods, interpretable results are not naturally\nobtained. Following on this debate, we first present a transparent\nprobabilistic model that topologically organizes user and product latent\nclasses based on the review information. 
In contrast to popular neural\ntechniques for representation learning, we readily obtain a statistical,\nvisualization-friendly tool that can be easily inspected to understand user and\nproduct characteristics from a textual-based perspective. Then, given the\nlimitations of common embedding techniques, we investigate the possibility of\nusing the estimated interpretable quantities as model input for a rating\nprediction task. To contribute to the recent debates, we evaluate our results\nin terms of both capacity for interpretability and predictive performances in\ncomparison with popular text-based neural approaches. The results demonstrate\nthat the proposed latent class representations can yield competitive predictive\nperformances, compared to popular, but difficult-to-interpret approaches.\n","authors":["Giuseppe Serra","Peter Tino","Zhao Xu","Xin Yao"],"pdf_url":"https://arxiv.org/pdf/2407.00063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.05192v2","updated":"2024-07-02T11:15:19Z","published":"2022-04-11T15:27:40Z","title":"Task-Synchronized Recurrent Neural Networks","summary":" Data are often sampled irregularly in time. Dealing with this using Recurrent\nNeural Networks (RNNs) traditionally involved ignoring the fact, feeding the\ntime differences as additional inputs, or resampling the data. All these\nmethods have their shortcomings. We propose an elegant straightforward\nalternative approach where instead the RNN is in effect resampled in time to\nmatch the time of the data or the task at hand. We use Echo State Network (ESN)\nand Gated Recurrent Unit (GRU) as the basis for our solution. Such RNNs can be\nseen as discretizations of continuous-time dynamical systems, which gives a\nsolid theoretical ground to our approach. Our Task-Synchronized ESN (TSESN) and\nGRU (TSGRU) models allow for a direct model time setting and require no\nadditional training, parameter tuning, or computation (solving differential\nequations or interpolating data) compared to their regular counterparts, thus\nretaining their original efficiency. We confirm empirically that our models can\neffectively compensate for the time-non-uniformity of the data and demonstrate\nthat they compare favorably to data resampling, classical RNN methods, and\nalternative RNN models proposed to deal with time irregularities on several\nreal-world nonuniform-time datasets. We open-source the code at\nhttps://github.com/oshapio/task-synchronized-RNNs .\n","authors":["Mantas Lukoševičius","Arnas Uselis"],"pdf_url":"https://arxiv.org/pdf/2204.05192v2.pdf","comment":"The 1st version was written in May 2019 and double-blind reviewed for\n a prominent conference. A major update. We changed the name of the article\n and methods to an arguably more precise one, and because a very similar title\n has been published in the meantime. We've rewritten much of the text,\n connected to the current literature, redone some experiments, figures,\n discussion, published source code"},{"id":"http://arxiv.org/abs/2305.17043v2","updated":"2024-07-02T10:58:23Z","published":"2023-05-26T15:52:08Z","title":"Explaining Deep Learning for ECG Analysis: Building Blocks for Auditing\n and Knowledge Discovery","summary":" Deep neural networks have become increasingly popular for analyzing ECG data\nbecause of their ability to accurately identify cardiac conditions and hidden\nclinical factors. However, the lack of transparency due to the black box nature\nof these models is a common concern. 
To address this issue, explainable AI\n(XAI) methods can be employed. In this study, we present a comprehensive\nanalysis of post-hoc XAI methods, investigating the local (attributions per\nsample) and global (based on domain expert concepts) perspectives. We have\nestablished a set of sanity checks to identify sensible attribution methods,\nand we provide quantitative evidence in accordance with expert rules. This\ndataset-wide analysis goes beyond anecdotal evidence by aggregating data across\npatient subgroups. Furthermore, we demonstrate how these XAI techniques can be\nutilized for knowledge discovery, such as identifying subtypes of myocardial\ninfarction. We believe that these proposed methods can serve as building blocks\nfor a complementary assessment of the internal validity during a certification\nprocess, as well as for knowledge discovery in the field of ECG analysis.\n","authors":["Patrick Wagner","Temesgen Mehari","Wilhelm Haverkamp","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2305.17043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02156v1","updated":"2024-07-02T10:54:23Z","published":"2024-07-02T10:54:23Z","title":"Towards Training Music Taggers on Synthetic Data","summary":" Most contemporary music tagging systems rely on large volumes of annotated\ndata. As an alternative, we investigate the extent to which synthetically\ngenerated music excerpts can improve tagging systems when only small annotated\ncollections are available. To this end, we release GTZAN-synth, a synthetic\ndataset that follows the taxonomy of the well-known GTZAN dataset while being\nten times larger in data volume. We first observe that simply adding this\nsynthetic dataset to the training split of GTZAN does not result into\nperformance improvements. We then proceed to investigating domain adaptation,\ntransfer learning and fine-tuning strategies for the task at hand and draw the\nconclusion that the last two options yield an increase in accuracy. Overall,\nthe proposed approach can be considered as a first guide in a promising field\nfor future research.\n","authors":["Nadine Kroher","Steven Manangu","Aggelos Pikrakis"],"pdf_url":"https://arxiv.org/pdf/2407.02156v1.pdf","comment":"6 pages, 3 figures, accepted to 21st International Conference on\n Content-based Multimedia Indexing (CBMI) 2024, code available\n https://github.com/NadineKroher/music-tagging-synthetic-data-cbmi-2024"},{"id":"http://arxiv.org/abs/2202.08536v3","updated":"2024-07-02T10:53:59Z","published":"2022-02-17T09:26:39Z","title":"Are There Exceptions to Goodhart's Law? On the Moral Justification of\n Fairness-Aware Machine Learning","summary":" Fairness-aware machine learning (fair-ml) techniques are algorithmic\ninterventions designed to ensure that individuals who are affected by the\npredictions of a machine learning model are treated fairly. The problem is\noften posed as an optimization problem, where the objective is to achieve high\npredictive performance under a quantitative fairness constraint. However, any\nattempt to design a fair-ml algorithm must assume a world where Goodhart's law\nhas an exception: when a fairness measure becomes an optimization constraint,\nit does not cease to be a good measure. In this paper, we argue that fairness\nmeasures are particularly sensitive to Goodhart's law. Our main contributions\nare as follows. First, we present a framework for moral reasoning about the\njustification of fairness metrics. 
In contrast to existing work, our framework\nincorporates the belief that whether a distribution of outcomes is fair,\ndepends not only on the cause of inequalities but also on what moral claims\ndecision subjects have to receive a particular benefit or avoid a burden. We\nuse the framework to distil moral and empirical assumptions under which\nparticular fairness metrics correspond to a fair distribution of outcomes.\nSecond, we explore the extent to which employing fairness metrics as a\nconstraint in a fair-ml algorithm is morally justifiable, exemplified by the\nfair-ml algorithm introduced by Hardt et al. (2016). We illustrate that\nenforcing a fairness metric through a fair-ml algorithm often does not result\nin the fair distribution of outcomes that motivated its use and can even harm\nthe individuals the intervention was intended to protect.\n","authors":["Hilde Weerts","Lambèr Royakkers","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2202.08536v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02153v1","updated":"2024-07-02T10:51:36Z","published":"2024-07-02T10:51:36Z","title":"Equidistribution-based training of Free Knot Splines and ReLU Neural\n Networks","summary":" We consider the problem of one-dimensional function approximation using\nshallow neural networks (NN) with a rectified linear unit (ReLU) activation\nfunction and compare their training with traditional methods such as univariate\nFree Knot Splines (FKS). ReLU NNs and FKS span the same function space, and\nthus have the same theoretical expressivity. In the case of ReLU NNs, we show\nthat their ill-conditioning degrades rapidly as the width of the network\nincreases. This often leads to significantly poorer approximation in contrast\nto the FKS representation, which remains well-conditioned as the number of\nknots increases. We leverage the theory of optimal piecewise linear\ninterpolants to improve the training procedure for a ReLU NN. Using the\nequidistribution principle, we propose a two-level procedure for training the\nFKS by first solving the nonlinear problem of finding the optimal knot\nlocations of the interpolating FKS. Determining the optimal knots then acts as\na good starting point for training the weights of the FKS. The training of the\nFKS gives insights into how we can train a ReLU NN effectively to give an\nequally accurate approximation. More precisely, we combine the training of the\nReLU NN with an equidistribution based loss to find the breakpoints of the ReLU\nfunctions, combined with preconditioning the ReLU NN approximation (to take an\nFKS form) to find the scalings of the ReLU functions, leads to a\nwell-conditioned and reliable method of finding an accurate ReLU NN\napproximation to a target function. We test this method on a series or regular,\nsingular, and rapidly varying target functions and obtain good results\nrealising the expressivity of the network in this case.\n","authors":["Simone Appella","Simon Arridge","Chris Budd","Teo Deveney","Lisa Maria Kreusser"],"pdf_url":"https://arxiv.org/pdf/2407.02153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02143v1","updated":"2024-07-02T10:37:54Z","published":"2024-07-02T10:37:54Z","title":"Counterfactual Data Augmentation with Denoising Diffusion for Graph\n Anomaly Detection","summary":" A critical aspect of Graph Neural Networks (GNNs) is to enhance the node\nrepresentations by aggregating node neighborhood information. 
However, when\ndetecting anomalies, the representations of abnormal nodes are prone to be\naveraged by normal neighbors, making the learned anomaly representations less\ndistinguishable. To tackle this issue, we propose CAGAD -- an unsupervised\nCounterfactual data Augmentation method for Graph Anomaly Detection -- which\nintroduces a graph pointer neural network as the heterophilic node detector to\nidentify potential anomalies whose neighborhoods are normal-node-dominant. For\neach identified potential anomaly, we design a graph-specific diffusion model\nto translate a part of its neighbors, which are probably normal, into anomalous\nones. At last, we involve these translated neighbors in GNN neighborhood\naggregation to produce counterfactual representations of anomalies. Through\naggregating the translated anomalous neighbors, counterfactual representations\nbecome more distinguishable and further advocate detection performance. The\nexperimental results on four datasets demonstrate that CAGAD significantly\noutperforms strong baselines, with an average improvement of 2.35% on F1, 2.53%\non AUC-ROC, and 2.79% on AUC-PR.\n","authors":["Chunjing Xiao","Shikang Pang","Xovee Xu","Xuan Li","Goce Trajcevski","Fan Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02143v1.pdf","comment":"Accepted by IEEE Transactions on Computational Social Systems(TCSS).\n DOI: https://doi.org/10.1109/TCSS.2024.3403503"},{"id":"http://arxiv.org/abs/2407.02138v1","updated":"2024-07-02T10:33:31Z","published":"2024-07-02T10:33:31Z","title":"Efficient Nearest Neighbor based Uncertainty Estimation for Natural\n Language Processing Tasks","summary":" Trustworthy prediction in Deep Neural Networks (DNNs), including Pre-trained\nLanguage Models (PLMs) is important for safety-critical applications in the\nreal world. However, DNNs often suffer from uncertainty estimation, such as\nmiscalibration. In particular, approaches that require multiple stochastic\ninference can mitigate this problem, but the expensive cost of inference makes\nthem impractical. In this study, we propose $k$-Nearest Neighbor Uncertainty\nEstimation ($k$NN-UE), which is an uncertainty estimation method that uses the\ndistances from the neighbors and label-existence ratio of neighbors.\nExperiments on sentiment analysis, natural language inference, and named entity\nrecognition show that our proposed method outperforms the baselines or recent\ndensity-based methods in confidence calibration, selective prediction, and\nout-of-distribution detection. Moreover, our analyses indicate that introducing\ndimension reduction or approximate nearest neighbor search inspired by recent\n$k$NN-LM studies reduces the inference overhead without significantly degrading\nestimation performance when combined them appropriately.\n","authors":["Wataru Hashimoto","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.02138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02125v1","updated":"2024-07-02T10:16:04Z","published":"2024-07-02T10:16:04Z","title":"Distributional Regression U-Nets for the Postprocessing of Precipitation\n Ensemble Forecasts","summary":" Accurate precipitation forecasts have a high socio-economic value due to\ntheir role in decision-making in various fields such as transport networks and\nfarming. We propose a global statistical postprocessing method for grid-based\nprecipitation ensemble forecasts. 
This U-Net-based distributional regression\nmethod predicts marginal distributions in the form of parametric distributions\ninferred by scoring rule minimization. Distributional regression U-Nets are\ncompared to state-of-the-art postprocessing methods for daily 21-h forecasts of\n3-h accumulated precipitation over the South of France. Training data comes\nfrom the M\\'et\\'eo-France weather model AROME-EPS and spans 3 years. A\npractical challenge appears when consistent data or reforecasts are not\navailable.\n Distributional regression U-Nets compete favorably with the raw ensemble. In\nterms of continuous ranked probability score, they reach a performance\ncomparable to quantile regression forests (QRF). However, they are unable to\nprovide calibrated forecasts in areas associated with high climatological\nprecipitation. In terms of predictive power for heavy precipitation events,\nthey outperform both QRF and semi-parametric QRF with tail extensions.\n","authors":["Romain Pic","Clément Dombry","Philippe Naveau","Maxime Taillardat"],"pdf_url":"https://arxiv.org/pdf/2407.02125v1.pdf","comment":"for associated code, see https://github.com/pic-romain/unet-pp"},{"id":"http://arxiv.org/abs/2407.02119v1","updated":"2024-07-02T10:09:19Z","published":"2024-07-02T10:09:19Z","title":"Cost-Effective Proxy Reward Model Construction with On-Policy and Active\n Learning","summary":" Reinforcement learning with human feedback (RLHF), as a widely adopted\napproach in current large language model pipelines, is \\textit{bottlenecked by\nthe size of human preference data}. While traditional methods rely on offline\npreference dataset constructions, recent approaches have shifted towards online\nsettings, where a learner uses a small amount of labeled seed data and a large\npool of unlabeled prompts to iteratively construct new preference data through\nself-generated responses and high-quality reward/preference feedback. However,\nmost current online algorithms still focus on preference labeling during policy\nmodel updating with given feedback oracles, which incurs significant expert\nquery costs. \\textit{We are the first to explore cost-effective proxy reward\noracles construction strategies for further labeling preferences or rewards\nwith extremely limited labeled data and expert query budgets}. Our approach\nintroduces two key innovations: (1) on-policy query to avoid OOD and imbalance\nissues in seed data, and (2) active learning to select the most informative\ndata for preference queries. Using these methods, we train a evaluation model\nwith minimal expert-labeled data, which then effectively labels nine times more\npreference pairs for further RLHF training. For instance, our model using\nDirect Preference Optimization (DPO) gains around over 1% average improvement\non AlpacaEval2, MMLU-5shot and MMLU-0shot, with only 1.7K query cost. 
Our\nmethodology is orthogonal to other direct expert query-based strategies and\ntherefore might be integrated with them to further reduce query costs.\n","authors":["Yifang Chen","Shuohang Wang","Ziyi Yang","Hiteshi Sharma","Nikos Karampatziakis","Donghan Yu","Kevin Jamieson","Simon Shaolei Du","Yelong Shen"],"pdf_url":"https://arxiv.org/pdf/2407.02119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10800v3","updated":"2024-07-02T10:06:43Z","published":"2024-01-19T16:36:27Z","title":"Estimation of AMOC transition probabilities using a machine learning\n based rare-event algorithm","summary":" The Atlantic Meridional Overturning Circulation (AMOC) is an important\ncomponent of the global climate, known to be a tipping element, as it could\ncollapse under global warming. The main objective of this study is to compute\nthe probability that the AMOC collapses within a specified time window, using a\nrare-event algorithm called Trajectory-Adaptive Multilevel Splitting (TAMS).\nHowever, the efficiency and accuracy of TAMS depend on the choice of the score\nfunction. Although the definition of the optimal score function, called\n``committor function\" is known, it is impossible in general to compute it a\npriori. Here, we combine TAMS with a Next-Generation Reservoir Computing\ntechnique that estimates the committor function from the data generated by the\nrare-event algorithm. We test this technique in a stochastic box model of the\nAMOC for which two types of transition exist, the so-called F(ast)-transitions\nand S(low)-transitions. Results for the F-transtions compare favorably with\nthose in the literature where a physically-informed score function was used. We\nshow that coupling a rare-event algorithm with machine learning allows for a\ncorrect estimation of transition probabilities, transition times, and even\ntransition paths for a wide range of model parameters. We then extend these\nresults to the more difficult problem of S-transitions in the same model. In\nboth cases of F-transitions and S-transitions, we also show how the\nNext-Generation Reservoir Computing technique can be interpreted to retrieve an\nanalytical estimate of the committor function.\n","authors":["Valérian Jacques-Dumas","René M. van Westen","Henk A. Dijkstra"],"pdf_url":"https://arxiv.org/pdf/2401.10800v3.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.02112v1","updated":"2024-07-02T09:54:39Z","published":"2024-07-02T09:54:39Z","title":"A Data-Centric Perspective on Evaluating Machine Learning Models for\n Tabular Data","summary":" Tabular data is prevalent in real-world machine learning applications, and\nnew models for supervised learning of tabular data are frequently proposed.\nComparative studies assessing the performance of models typically consist of\nmodel-centric evaluation setups with overly standardized data preprocessing.\nThis paper demonstrates that such model-centric evaluations are biased, as\nreal-world modeling pipelines often require dataset-specific preprocessing and\nfeature engineering. Therefore, we propose a data-centric evaluation framework.\nWe select 10 relevant datasets from Kaggle competitions and implement\nexpert-level preprocessing pipelines for each dataset. We conduct experiments\nwith different preprocessing pipelines and hyperparameter optimization (HPO)\nregimes to quantify the impact of model selection, HPO, feature engineering,\nand test-time adaptation. Our main findings are: 1. 
After dataset-specific\nfeature engineering, model rankings change considerably, performance\ndifferences decrease, and the importance of model selection reduces. 2. Recent\nmodels, despite their measurable progress, still significantly benefit from\nmanual feature engineering. This holds true for both tree-based models and\nneural networks. 3. While tabular data is typically considered static, samples\nare often collected over time, and adapting to distribution shifts can be\nimportant even in supposedly static data. These insights suggest that research\nefforts should be directed toward a data-centric perspective, acknowledging\nthat tabular data requires feature engineering and often exhibits temporal\ncharacteristics.\n","authors":["Andrej Tschalzev","Sascha Marton","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2407.02112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02106v1","updated":"2024-07-02T09:47:56Z","published":"2024-07-02T09:47:56Z","title":"Automated Knowledge Graph Learning in Industrial Processes","summary":" Industrial processes generate vast amounts of time series data, yet\nextracting meaningful relationships and insights remains challenging. This\npaper introduces a framework for automated knowledge graph learning from time\nseries data, specifically tailored for industrial applications. Our framework\naddresses the complexities inherent in industrial datasets, transforming them\ninto knowledge graphs that improve decision-making, process optimization, and\nknowledge discovery. Additionally, it employs Granger causality to identify key\nattributes that can inform the design of predictive models. To illustrate the\npractical utility of our approach, we also present a motivating use case\ndemonstrating the benefits of our framework in a real-world industrial\nscenario. Further, we demonstrate how the automated conversion of time series\ndata into knowledge graphs can identify causal influences or dependencies\nbetween important process parameters.\n","authors":["Lolitta Ammann","Jorge Martinez-Gil","Michael Mayr","Georgios C. Chasparis"],"pdf_url":"https://arxiv.org/pdf/2407.02106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01210v5","updated":"2024-07-02T09:31:04Z","published":"2023-10-02T13:55:06Z","title":"Towards Robust Cardiac Segmentation using Graph Convolutional Networks","summary":" Fully automatic cardiac segmentation can be a fast and reproducible method to\nextract clinical measurements from an echocardiography examination. The U-Net\narchitecture is the current state-of-the-art deep learning architecture for\nmedical segmentation and can segment cardiac structures in real-time with\naverage errors comparable to inter-observer variability. However, this\narchitecture still generates large outliers that are often anatomically\nincorrect. This work uses the concept of graph convolutional neural networks\nthat predict the contour points of the structures of interest instead of\nlabeling each pixel. We propose a graph architecture that uses two\nconvolutional rings based on cardiac anatomy and show that this eliminates\nanatomical incorrect multi-structure segmentations on the publicly available\nCAMUS dataset. Additionally, this work contributes with an ablation study on\nthe graph convolutional architecture and an evaluation of clinical measurements\non the clinical HUNT4 dataset. 
Finally, we propose to use the inter-model\nagreement of the U-Net and the graph network as a predictor of both the input\nand segmentation quality. We show this predictor can detect out-of-distribution\nand unsuitable input images in real-time. Source code is available online:\nhttps://github.com/gillesvntnu/GCN_multistructure\n","authors":["Gilles Van De Vyver","Sarina Thomas","Guy Ben-Yosef","Sindre Hellum Olaisen","Håvard Dalen","Lasse Løvstakken","Erik Smistad"],"pdf_url":"https://arxiv.org/pdf/2310.01210v5.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.02091v1","updated":"2024-07-02T09:26:38Z","published":"2024-07-02T09:26:38Z","title":"Efficient Bit Labeling in Factorization Machines with Annealing for\n Traveling Salesman Problem","summary":" To efficiently find an optimum parameter combination in a large-scale\nproblem, it is a key to convert the parameters into available variables in\nactual machines. Specifically, quadratic unconstrained binary optimization\nproblems are solved with the help of machine learning, e.g., factorization\nmachines with annealing, which convert a raw parameter to binary variables.\nThis work investigates the dependence of the convergence speed and the accuracy\non binary labeling method, which can influence the cost function shape and thus\nthe probability of being captured at a local minimum solution. By exemplifying\ntraveling salesman problem, we propose and evaluate Gray labeling, which\ncorrelates the Hamming distance in binary labels with the traveling distance.\nThrough numerical simulation of traveling salesman problem up to 15 cities at a\nlimited number of iterations, the Gray labeling shows less local minima\npercentages and shorter traveling distances compared with natural labeling.\n","authors":["Shota Koshikawa","Aruto Hosaka","Tsuyoshi Yoshida"],"pdf_url":"https://arxiv.org/pdf/2407.02091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02089v1","updated":"2024-07-02T09:25:58Z","published":"2024-07-02T09:25:58Z","title":"GPTCast: a weather language model for precipitation nowcasting","summary":" This work introduces GPTCast, a generative deep-learning method for ensemble\nnowcast of radar-based precipitation, inspired by advancements in large\nlanguage models (LLMs). We employ a GPT model as a forecaster to learn\nspatiotemporal precipitation dynamics using tokenized radar images. The\ntokenizer is based on a Quantized Variational Autoencoder featuring a novel\nreconstruction loss tailored for the skewed distribution of precipitation that\npromotes faithful reconstruction of high rainfall rates. The approach produces\nrealistic ensemble forecasts and provides probabilistic outputs with accurate\nuncertainty estimation. The model is trained without resorting to randomness,\nall variability is learned solely from the data and exposed by model at\ninference for ensemble generation. 
We train and test GPTCast using a 6-year\nradar dataset over the Emilia-Romagna region in Northern Italy, showing\nsuperior results compared to state-of-the-art ensemble extrapolation methods.\n","authors":["Gabriele Franch","Elena Tomasi","Rishabh Wanjari","Virginia Poli","Chiara Cardinali","Pier Paolo Alberoni","Marco Cristoforetti"],"pdf_url":"https://arxiv.org/pdf/2407.02089v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.12381v2","updated":"2024-07-02T09:21:03Z","published":"2024-06-18T08:09:58Z","title":"QOG:Question and Options Generation based on Language Model","summary":" Question-Options Generation (QOG) is a task that involves generating a set of\nquestion-options pairs given context. This task has various applications,\nincluding fine-tuning large models, information retrieval, and automated\nmultiple-choice question generation for education. In this paper, we develop\nQOG models using three different methods based on fine-tuning\nsequence-to-sequence language models (LMs). Experiments demonstrate that the\nend-to-end QOG model is computationally efficient and stable during both\ntraining and inference, outperforming other methods. Furthermore, our analysis\nindicates that our QOG models are competitive on the QOG task compared to the\nlarge language model Llama 3-8B.\n","authors":["Jincheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.12381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00592v3","updated":"2024-07-02T09:09:19Z","published":"2023-12-01T13:56:28Z","title":"Tracking Object Positions in Reinforcement Learning: A Metric for\n Keypoint Detection (extended version)","summary":" Reinforcement learning (RL) for robot control typically requires a detailed\nrepresentation of the environment state, including information about\ntask-relevant objects not directly measurable. Keypoint detectors, such as\nspatial autoencoders (SAEs), are a common approach to extracting a\nlow-dimensional representation from high-dimensional image data. SAEs aim at\nspatial features such as object positions, which are often useful\nrepresentations in robotic RL. However, whether an SAE is actually able to\ntrack objects in the scene and thus yields a spatial state representation well\nsuited for RL tasks has rarely been examined due to a lack of established\nmetrics. In this paper, we propose to assess the performance of an SAE instance\nby measuring how well keypoints track ground truth objects in images. We\npresent a computationally lightweight metric and use it to evaluate common\nbaseline SAE architectures on image data from a simulated robot task. We find\nthat common SAEs differ substantially in their spatial extraction capability.\nFurthermore, we validate that SAEs that perform well in our metric achieve\nsuperior performance when used in downstream RL. Thus, our metric is an\neffective and lightweight indicator of RL performance before executing\nexpensive RL training. 
Building on these insights, we identify three key\nmodifications of SAE architectures to improve tracking performance.\n","authors":["Emma Cramer","Jonas Reiher","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2312.00592v3.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.02073v1","updated":"2024-07-02T09:05:43Z","published":"2024-07-02T09:05:43Z","title":"Contribution Evaluation of Heterogeneous Participants in Federated\n Learning via Prototypical Representations","summary":" Contribution evaluation in federated learning (FL) has become a pivotal\nresearch area due to its applicability across various domains, such as\ndetecting low-quality datasets, enhancing model robustness, and designing\nincentive mechanisms. Existing contribution evaluation methods, which primarily\nrely on data volume, model similarity, and auxiliary test datasets, have shown\nsuccess in diverse scenarios. However, their effectiveness often diminishes due\nto the heterogeneity of data distributions, presenting a significant challenge\nto their applicability. In response, this paper explores contribution\nevaluation in FL from an entirely new perspective of representation. In this\nwork, we propose a new method for the contribution evaluation of heterogeneous\nparticipants in federated learning (FLCE), which introduces a novel indicator\n\\emph{class contribution momentum} to conduct refined contribution evaluation.\nOur core idea is the construction and application of the class contribution\nmomentum indicator from individual, relative, and holistic perspectives,\nthereby achieving an effective and efficient contribution evaluation of\nheterogeneous participants without relying on an auxiliary test dataset.\nExtensive experimental results demonstrate the superiority of our method in\nterms of fidelity, effectiveness, efficiency, and heterogeneity across various\nscenarios.\n","authors":["Qi Guo","Minghao Yao","Zhen Tian","Saiyu Qi","Yong Qi","Yun Lin","Jin Song Dong"],"pdf_url":"https://arxiv.org/pdf/2407.02073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07719v5","updated":"2024-07-02T09:03:26Z","published":"2024-05-13T13:08:02Z","title":"USP: A Unified Sequence Parallelism Approach for Long Context Generative\n AI","summary":" Sequence parallelism (SP), which divides the sequence dimension of input\ntensors across multiple computational devices, is becoming key to unlocking the\nlong-context capabilities of generative AI models. This paper investigates the\nstate-of-the-art SP approaches, i.e. DeepSpeed-Ulysses and Ring-Attention, and\nproposes a unified SP approach, which is more robust to transformer model\narchitectures and network hardware topology. This paper compares the\ncommunication and memory cost of SP and existing parallelism, including\ndata/tensor/zero/pipeline parallelism, and discusses the best practices for\ndesigning hybrid 4D parallelism involving SP. We achieved 47% MFU on two 8xA800\nnodes using SP for the LLAMA3-8B model training using sequence length 208K. 
Our\ncode is publicly available at\nhttps://github.com/feifeibear/long-context-attention.\n","authors":["Jiarui Fang","Shangchun Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.07719v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02070v1","updated":"2024-07-02T08:59:24Z","published":"2024-07-02T08:59:24Z","title":"Latent Diffusion Model for Generating Ensembles of Climate Simulations","summary":" Obtaining accurate estimates of uncertainty in climate scenarios often\nrequires generating large ensembles of high-resolution climate simulations, a\ncomputationally expensive and memory intensive process. To address this\nchallenge, we train a novel generative deep learning approach on extensive sets\nof climate simulations. The model consists of two components: a variational\nautoencoder for dimensionality reduction and a denoising diffusion\nprobabilistic model that generates multiple ensemble members. We validate our\nmodel on the Max Planck Institute Grand Ensemble and show that it achieves good\nagreement with the original ensemble in terms of variability. By leveraging the\nlatent space representation, our model can rapidly generate large ensembles\non-the-fly with minimal memory requirements, which can significantly improve\nthe efficiency of uncertainty quantification in climate simulations.\n","authors":["Johannes Meuer","Maximilian Witte","Claudia Timmreck","Thomas Ludwig","Christopher Kadow"],"pdf_url":"https://arxiv.org/pdf/2407.02070v1.pdf","comment":"8 pages, 7 figures, Accepted at the ICML 2024 Machine Learning for\n Earth System Modeling workshop"},{"id":"http://arxiv.org/abs/2406.18382v2","updated":"2024-07-02T08:56:48Z","published":"2024-06-26T14:24:51Z","title":"Adversarial Search Engine Optimization for Large Language Models","summary":" Large Language Models (LLMs) are increasingly used in applications where the\nmodel selects from competing third-party content, such as in LLM-powered search\nengines or chatbot plugins. In this paper, we introduce Preference Manipulation\nAttacks, a new class of attacks that manipulate an LLM's selections to favor\nthe attacker. We demonstrate that carefully crafted website content or plugin\ndocumentations can trick an LLM to promote the attacker products and discredit\ncompetitors, thereby increasing user traffic and monetization. We show this\nleads to a prisoner's dilemma, where all parties are incentivized to launch\nattacks, but the collective effect degrades the LLM's outputs for everyone. We\ndemonstrate our attacks on production LLM search engines (Bing and Perplexity)\nand plugin APIs (for GPT-4 and Claude). As LLMs are increasingly used to rank\nthird-party content, we expect Preference Manipulation Attacks to emerge as a\nsignificant threat.\n","authors":["Fredrik Nestaas","Edoardo Debenedetti","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2406.18382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16958v2","updated":"2024-07-02T08:53:09Z","published":"2024-04-25T18:12:43Z","title":"A Closer Look at Classification Evaluation Metrics and a Critical\n Reflection of Common Evaluation Practice","summary":" Classification systems are evaluated in a countless number of papers.\nHowever, we find that evaluation practice is often nebulous. Frequently,\nmetrics are selected without arguments, and blurry terminology invites\nmisconceptions. For instance, many works use so-called 'macro' metrics to rank\nsystems (e.g., 'macro F1') but do not clearly specify what they would expect\nfrom such a `macro' metric. 
This is problematic, since picking a metric can\naffect research findings, and thus any clarity in the process should be\nmaximized.\n Starting from the intuitive concepts of bias and prevalence, we perform an\nanalysis of common evaluation metrics. The analysis helps us understand the\nmetrics' underlying properties, and how they align with expectations as found\nexpressed in papers. Then we reflect on the practical situation in the field,\nand survey evaluation practice in recent shared tasks. We find that metric\nselection is often not supported with convincing arguments, an issue that can\nmake a system ranking seem arbitrary. Our work aims at providing overview and\nguidance for more informed and transparent metric selection, fostering\nmeaningful evaluation.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2404.16958v2.pdf","comment":"appeared in TACL journal. MIT press publication available at\n https://doi.org/10.1162/tacl_a_00675"},{"id":"http://arxiv.org/abs/2106.15775v2","updated":"2024-07-02T08:53:08Z","published":"2021-06-30T02:07:39Z","title":"Koopman Spectrum Nonlinear Regulators and Efficient Online Learning","summary":" Most modern reinforcement learning algorithms optimize a cumulative\nsingle-step cost along a trajectory. The optimized motions are often\n'unnatural', representing, for example, behaviors with sudden accelerations\nthat waste energy and lack predictability. In this work, we present a novel\nparadigm of controlling nonlinear systems via the minimization of the Koopman\nspectrum cost: a cost over the Koopman operator of the controlled dynamics.\nThis induces a broader class of dynamical behaviors that evolve over stable\nmanifolds such as nonlinear oscillators, closed loops, and smooth movements. We\ndemonstrate that some dynamics characterizations that are not possible with a\ncumulative cost are feasible in this paradigm, which generalizes the classical\neigenstructure and pole assignments to nonlinear decision making. Moreover, we\npresent a sample efficient online learning algorithm for our problem that\nenjoys a sub-linear regret bound under some structural assumptions.\n","authors":["Motoya Ohnishi","Isao Ishikawa","Kendall Lowrey","Masahiro Ikeda","Sham Kakade","Yoshinobu Kawahara"],"pdf_url":"https://arxiv.org/pdf/2106.15775v2.pdf","comment":"41 pages, 21 figures"},{"id":"http://arxiv.org/abs/2407.02062v1","updated":"2024-07-02T08:49:43Z","published":"2024-07-02T08:49:43Z","title":"Are Data Augmentation Methods in Named Entity Recognition Applicable for\n Uncertainty Estimation?","summary":" This work investigates the impact of data augmentation on confidence\ncalibration and uncertainty estimation in Named Entity Recognition (NER) tasks.\nFor the future advance of NER in safety-critical fields like healthcare and\nfinance, it is essential to achieve accurate predictions with calibrated\nconfidence when applying Deep Neural Networks (DNNs), including Pre-trained\nLanguage Models (PLMs), as a real-world application. However, DNNs are prone to\nmiscalibration, which limits their applicability. Moreover, existing methods\nfor calibration and uncertainty estimation are computational expensive. Our\ninvestigation in NER found that data augmentation improves calibration and\nuncertainty in cross-genre and cross-lingual setting, especially in-domain\nsetting. 
Furthermore, we showed that the calibration for NER tends to be more\neffective when the perplexity of the sentences generated by data augmentation\nis lower, and that increasing the size of the augmentation further improves\ncalibration and uncertainty.\n","authors":["Wataru Hashimoto","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.02062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02060v1","updated":"2024-07-02T08:45:38Z","published":"2024-07-02T08:45:38Z","title":"Terminating Differentiable Tree Experts","summary":" We advance the recently proposed neuro-symbolic Differentiable Tree Machine,\nwhich learns tree operations using a combination of transformers and Tensor\nProduct Representations. We investigate the architecture and propose two key\ncomponents. We first remove a series of different transformer layers that are\nused in every step by introducing a mixture of experts. This results in a\nDifferentiable Tree Experts model with a constant number of parameters for any\narbitrary number of steps in the computation, compared to the previous method\nin the Differentiable Tree Machine with a linear growth. Given this flexibility\nin the number of steps, we additionally propose a new termination algorithm to\nprovide the model the power to choose how many steps to make automatically. The\nresulting Terminating Differentiable Tree Experts model sluggishly learns to\npredict the number of steps without an oracle. It can do so while maintaining\nthe learning capabilities of the model, converging to the optimal amount of\nsteps.\n","authors":["Jonathan Thomm","Michael Hersche","Giacomo Camposampiero","Aleksandar Terzić","Bernhard Schölkopf","Abbas Rahimi"],"pdf_url":"https://arxiv.org/pdf/2407.02060v1.pdf","comment":"Accepted at the 18th International Conference on Neural-Symbolic\n Learning and Reasoning (NeSy) 2024"},{"id":"http://arxiv.org/abs/2407.02057v1","updated":"2024-07-02T08:38:32Z","published":"2024-07-02T08:38:32Z","title":"HC-GLAD: Dual Hyperbolic Contrastive Learning for Unsupervised\n Graph-Level Anomaly Detection","summary":" Unsupervised graph-level anomaly detection (UGAD) has garnered increasing\nattention in recent years due to its significance. However, most existing\nmethods only rely on traditional graph neural networks to explore pairwise\nrelationships but such kind of pairwise edges are not enough to describe\nmultifaceted relationships involving anomaly. There is an emergency need to\nexploit node group information which plays a crucial role in UGAD. In addition,\nmost previous works ignore the global underlying properties (e.g., hierarchy\nand power-law structure) which are common in real-world graph datasets and\ntherefore are indispensable factors on UGAD task. In this paper, we propose a\nnovel Dual Hyperbolic Contrastive Learning for Unsupervised Graph-Level Anomaly\nDetection (HC-GLAD in short). To exploit node group connections, we construct\nhypergraphs based on gold motifs and subsequently perform hypergraph\nconvolution. Furthermore, to preserve the hierarchy of real-world graphs, we\nintroduce hyperbolic geometry into this field and conduct both graph and\nhypergraph embedding learning in hyperbolic space with hyperboloid model. To\nthe best of our knowledge, this is the first work to simultaneously apply\nhypergraph with node group connections and hyperbolic geometry into this field.\nExtensive experiments on several real world datasets of different fields\ndemonstrate the superiority of HC-GLAD on UGAD task. 
The code is available at\nhttps://github.com/Yali-F/HC-GLAD.\n","authors":["Yali Fu","Jindong Li","Jiahong Liu","Qianli Xing","Qi Wang","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2407.02057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19765v2","updated":"2024-07-02T08:29:56Z","published":"2024-06-28T09:10:23Z","title":"Systematic Literature Review on Application of Learning-based Approaches\n in Continuous Integration","summary":" Context: Machine learning (ML) and deep learning (DL) analyze raw data to\nextract valuable insights in specific phases. The rise of continuous practices\nin software projects emphasizes automating Continuous Integration (CI) with\nthese learning-based methods, while the growing adoption of such approaches\nunderscores the need for systematizing knowledge. Objective: Our objective is\nto comprehensively review and analyze existing literature concerning\nlearning-based methods within the CI domain. We endeavour to identify and\nanalyse various techniques documented in the literature, emphasizing the\nfundamental attributes of training phases within learning-based solutions in\nthe context of CI. Method: We conducted a Systematic Literature Review (SLR)\ninvolving 52 primary studies. Through statistical and thematic analyses, we\nexplored the correlations between CI tasks and the training phases of\nlearning-based methodologies across the selected studies, encompassing a\nspectrum from data engineering techniques to evaluation metrics. Results: This\npaper presents an analysis of the automation of CI tasks utilizing\nlearning-based methods. We identify and analyze nine types of data sources,\nfour steps in data preparation, four feature types, nine subsets of data\nfeatures, five approaches for hyperparameter selection and tuning, and fifteen\nevaluation metrics. Furthermore, we discuss the latest techniques employed,\nexisting gaps in CI task automation, and the characteristics of the utilized\nlearning-based techniques. Conclusion: This study provides a comprehensive\noverview of learning-based methods in CI, offering valuable insights for\nresearchers and practitioners developing CI task automation. It also highlights\nthe need for further research to advance these methods in CI.\n","authors":["Ali Kazemi Arani","Triet Huynh Minh Le","Mansooreh Zahedi","M. Ali Babar"],"pdf_url":"https://arxiv.org/pdf/2406.19765v2.pdf","comment":"This paper has been accepted to be published in IEEE Access"},{"id":"http://arxiv.org/abs/2406.17745v2","updated":"2024-07-02T08:05:55Z","published":"2024-06-25T17:31:04Z","title":"Light-weight End-to-End Graph Interest Network for CTR Prediction in\n E-commerce Search","summary":" Click-through-rate (CTR) prediction has an essential impact on improving user\nexperience and revenue in e-commerce search. With the development of deep\nlearning, graph-based methods are well exploited to utilize graph structure\nextracted from user behaviors and other information to help embedding learning.\nHowever, most of the previous graph-based methods mainly focus on\nrecommendation scenarios, and therefore their graph structures highly depend on\nitem's sequential information from user behaviors, ignoring query's sequential\nsignal and query-item correlation. In this paper, we propose a new approach\nnamed Light-weight End-to-End Graph Interest Network (EGIN) to effectively mine\nusers' search interests and tackle previous challenges. 
(i) EGIN utilizes query\nand item's correlation and sequential information from the search system to\nbuild a heterogeneous graph for better CTR prediction in e-commerce search.\n(ii) EGIN's graph embedding learning shares the same training input and is\njointly trained with CTR prediction, making the end-to-end framework effortless\nto deploy in large-scale search systems. The proposed EGIN is composed of three\nparts: query-item heterogeneous graph, light-weight graph sampling, and\nmulti-interest network. The query-item heterogeneous graph captures correlation\nand sequential information of query and item efficiently by the proposed\nlight-weight graph sampling. The multi-interest network is well designed to\nutilize graph embedding to capture various similarity relationships between\nquery and item to enhance the final CTR prediction. We conduct extensive\nexperiments on both public and industrial datasets to demonstrate the\neffectiveness of the proposed EGIN. At the same time, the training cost of\ngraph learning is relatively low compared with the main CTR prediction task,\nensuring efficiency in practical applications.\n","authors":["Pai Peng","Yunqing Jia","Ziqiang Zhou","Shuang Hong","Zichong Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.17745v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.02031v1","updated":"2024-07-02T07:59:08Z","published":"2024-07-02T07:59:08Z","title":"SwiftDiffusion: Efficient Diffusion Model Serving with Add-on Modules","summary":" This paper documents our characterization study and practices for serving\ntext-to-image requests with stable diffusion models in production. We first\ncomprehensively analyze inference request traces for commercial text-to-image\napplications. It commences with our observation that add-on modules, i.e.,\nControlNets and LoRAs, that augment the base stable diffusion models, are\nubiquitous in generating images for commercial applications. Despite their\nefficacy, these add-on modules incur high loading overhead, prolong the serving\nlatency, and swallow up expensive GPU resources. Driven by our characterization\nstudy, we present SwiftDiffusion, a system that efficiently generates\nhigh-quality images using stable diffusion models and add-on modules. To\nachieve this, SwiftDiffusion reconstructs the existing text-to-image serving\nworkflow by identifying the opportunities for parallel computation and\ndistributing ControlNet computations across multiple GPUs. Further,\nSwiftDiffusion thoroughly analyzes the dynamics of image generation and\ndevelops techniques to eliminate the overhead associated with LoRA loading and\npatching while preserving the image quality. Last, SwiftDiffusion proposes\nspecialized optimizations in the backbone architecture of the stable diffusion\nmodels, which are also compatible with the efficient serving of add-on modules.\nCompared to state-of-the-art text-to-image serving systems, SwiftDiffusion\nreduces serving latency by up to 5x and improves serving throughput by up to 2x\nwithout compromising image quality.\n","authors":["Suyi Li","Lingyun Yang","Xiaoxiao Jiang","Hanfeng Lu","Zhipeng Di","Weiyi Lu","Jiawei Chen","Kan Liu","Yinghao Yu","Tao Lan","Guodong Yang","Lin Qu","Liping Zhang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02028v1","updated":"2024-07-02T07:52:30Z","published":"2024-07-02T07:52:30Z","title":"Why does in-context learning fail sometimes? 
Evaluating in-context\n learning on open and closed questions","summary":" We measure the performance of in-context learning as a function of task\nnovelty and difficulty for open and closed questions. For that purpose, we\ncreated a novel benchmark consisting of hard scientific questions, each paired\nwith a context of various relevancy. We show that counter-intuitively, a\ncontext that is more aligned with the topic does not always help more than a\nless relevant context. This effect is especially visible for open questions and\nquestions of high difficulty or novelty. This result reveals a fundamental\ndifference between the treatment of close-form and open-form questions by\nlarge-language models and shows a need for a more robust evaluation of\nin-context learning on the variety of different types of questions. It also\nposes a new question of how to optimally select a context for large language\nmodels, especially in the context of Retrieval Augmented Generation (RAG)\nsystems. Our results suggest that the answer to this question can be highly\napplication-dependent and might be contingent on factors including the format\nof the question, the perceived difficulty level of the questions, and the\nnovelty or popularity of the information we seek.\n","authors":["Xiang Li","Haoran Tang","Siyu Chen","Ziwei Wang","Ryan Chen","Marcin Abram"],"pdf_url":"https://arxiv.org/pdf/2407.02028v1.pdf","comment":"8 pages plus references, 4 main figures, 6 pages of supplementary\n material"},{"id":"http://arxiv.org/abs/2407.02025v1","updated":"2024-07-02T07:48:22Z","published":"2024-07-02T07:48:22Z","title":"On the Expressive Power of Sparse Geometric MPNNs","summary":" Motivated by applications in chemistry and other sciences, we study the\nexpressive power of message-passing neural networks for geometric graphs, whose\nnode features correspond to 3-dimensional positions. Recent work has shown that\nsuch models can separate generic pairs of non-equivalent geometric graphs,\nthough they may fail to separate some rare and complicated instances. However,\nthese results assume a fully connected graph, where each node possesses\ncomplete knowledge of all other nodes. In contrast, often, in application,\nevery node only possesses knowledge of a small number of nearest neighbors.\nThis paper shows that generic pairs of non-equivalent geometric graphs can be\nseparated by message-passing networks with rotation equivariant features as\nlong as the underlying graph is connected. When only invariant intermediate\nfeatures are allowed, generic separation is guaranteed for generically globally\nrigid graphs. We introduce a simple architecture, EGENNET, which achieves our\ntheoretical guarantees and compares favorably with alternative architecture on\nsynthetic and chemical benchmarks.\n","authors":["Yonatan Sverdlov","Nadav Dym"],"pdf_url":"https://arxiv.org/pdf/2407.02025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02013v1","updated":"2024-07-02T07:33:40Z","published":"2024-07-02T07:33:40Z","title":"DiGRAF: Diffeomorphic Graph-Adaptive Activation Function","summary":" In this paper, we propose a novel activation function tailored specifically\nfor graph data in Graph Neural Networks (GNNs). Motivated by the need for\ngraph-adaptive and flexible activation functions, we introduce DiGRAF,\nleveraging Continuous Piecewise-Affine Based (CPAB) transformations, which we\naugment with an additional GNN to learn a graph-adaptive diffeomorphic\nactivation function in an end-to-end manner. 
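The sparse geometric MPNN entry above argues that message passing over a k-nearest-neighbour graph with invariant (distance-based) features can already separate generic geometric graphs. The snippet below is an illustrative toy round of such distance-weighted message passing, not the EGENNET architecture; the radial weighting and feature sizes are arbitrary choices.

```python
# Illustrative sketch: one round of rotation/translation-invariant message passing
# on a k-nearest-neighbour graph built from 3-D node positions.
import numpy as np

def knn_edges(pos: np.ndarray, k: int = 3):
    """Return (i, j) index pairs connecting each node to its k nearest neighbours."""
    d = np.linalg.norm(pos[:, None, :] - pos[None, :, :], axis=-1)
    np.fill_diagonal(d, np.inf)
    nbrs = np.argsort(d, axis=1)[:, :k]
    return [(i, j) for i in range(len(pos)) for j in nbrs[i]]

def invariant_message_passing(pos: np.ndarray, feats: np.ndarray, k: int = 3):
    """Aggregate neighbour features weighted by a radial function of the edge length."""
    new_feats = feats.copy()
    for i, j in knn_edges(pos, k):
        dist = np.linalg.norm(pos[i] - pos[j])        # invariant under rotations/translations
        new_feats[i] += np.exp(-dist ** 2) * feats[j]
    return new_feats

pos = np.random.randn(10, 3)        # 3-D node positions
feats = np.random.randn(10, 8)      # node features
print(invariant_message_passing(pos, feats).shape)
```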
In addition to its\ngraph-adaptivity and flexibility, DiGRAF also possesses properties that are\nwidely recognized as desirable for activation functions, such as\ndifferentiability, boundness within the domain and computational efficiency. We\nconduct an extensive set of experiments across diverse datasets and tasks,\ndemonstrating a consistent and superior performance of DiGRAF compared to\ntraditional and graph-specific activation functions, highlighting its\neffectiveness as an activation function for GNNs.\n","authors":["Krishna Sri Ipsit Mantri","Xinzhi Wang","Carola-Bibiane Schönlieb","Bruno Ribeiro","Beatrice Bevilacqua","Moshe Eliasof"],"pdf_url":"https://arxiv.org/pdf/2407.02013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10020v2","updated":"2024-07-02T07:29:04Z","published":"2024-05-16T12:02:02Z","title":"Natural Language Can Help Bridge the Sim2Real Gap","summary":" The main challenge in learning image-conditioned robotic policies is\nacquiring a visual representation conducive to low-level control. Due to the\nhigh dimensionality of the image space, learning a good visual representation\nrequires a considerable amount of visual data. However, when learning in the\nreal world, data is expensive. Sim2Real is a promising paradigm for overcoming\ndata scarcity in the real-world target domain by using a simulator to collect\nlarge amounts of cheap data closely related to the target task. However, it is\ndifficult to transfer an image-conditioned policy from sim to real when the\ndomains are very visually dissimilar. To bridge the sim2real visual gap, we\npropose using natural language descriptions of images as a unifying signal\nacross domains that captures the underlying task-relevant semantics. Our key\ninsight is that if two image observations from different domains are labeled\nwith similar language, the policy should predict similar action distributions\nfor both images. We demonstrate that training the image encoder to predict the\nlanguage description or the distance between descriptions of a sim or real\nimage serves as a useful, data-efficient pretraining step that helps learn a\ndomain-invariant image representation. We can then use this image encoder as\nthe backbone of an IL policy trained simultaneously on a large amount of\nsimulated and a handful of real demonstrations. Our approach outperforms widely\nused prior sim2real methods and strong vision-language pretraining baselines\nlike CLIP and R3M by 25 to 40%. See additional videos and materials at\nhttps://robin-lab.cs.utexas.edu/lang4sim2real/.\n","authors":["Albert Yu","Adeline Foote","Raymond Mooney","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2405.10020v2.pdf","comment":"To appear in RSS 2024. Project website at\n https://robin-lab.cs.utexas.edu/lang4sim2real/"},{"id":"http://arxiv.org/abs/2407.02010v1","updated":"2024-07-02T07:29:02Z","published":"2024-07-02T07:29:02Z","title":"Feynman-Kac Operator Expectation Estimator","summary":" The Feynman-Kac Operator Expectation Estimator (FKEE) is an innovative method\nfor estimating the target Mathematical Expectation $\\mathbb{E}_{X\\sim P}[f(X)]$\nwithout relying on a large number of samples, in contrast to the commonly used\nMarkov Chain Monte Carlo (MCMC) Expectation Estimator. FKEE comprises diffusion\nbridge models and approximation of the Feynman-Kac operator. The key idea is to\nuse the solution to the Feynmann-Kac equation at the initial time\n$u(x_0,0)=\\mathbb{E}[f(X_T)|X_0=x_0]$. 
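The sim2real entry above proposes pretraining the image encoder so that relations between image embeddings mirror relations between embeddings of their language descriptions. A minimal sketch of one such objective (matching pairwise embedding distances) follows; the encoder, batch construction, and distance choice are assumptions for illustration only, not the paper's implementation.

```python
# Rough sketch: pre-train an image encoder so that distances between image embeddings
# match distances between the embeddings of the images' language descriptions.
import torch
import torch.nn as nn

image_encoder = nn.Sequential(
    nn.Flatten(), nn.Linear(3 * 32 * 32, 256), nn.ReLU(), nn.Linear(256, 64)
)

def language_distance_loss(images, text_embeddings):
    """Match pairwise distances in image space to pairwise distances in text space."""
    z = image_encoder(images)                              # (B, 64) image embeddings
    d_img = torch.cdist(z, z)                              # (B, B) image-image distances
    d_txt = torch.cdist(text_embeddings, text_embeddings)  # (B, B) description distances
    return ((d_img - d_txt) ** 2).mean()

images = torch.randn(8, 3, 32, 32)       # sim and real frames mixed in one batch
text_embeddings = torch.randn(8, 64)     # embeddings of their language descriptions
loss = language_distance_loss(images, text_embeddings)
loss.backward()
print(float(loss))
```

The intent is that sim and real images with similar descriptions end up close in embedding space, giving a domain-invariant backbone for imitation learning.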
We use Physically Informed Neural\nNetworks (PINN) to approximate the Feynman-Kac operator, which enables the\nincorporation of diffusion bridge models into the expectation estimator and\nsignificantly improves the efficiency of using data while substantially\nreducing the variance. Diffusion Bridge Model is a more general MCMC method. In\norder to incorporate extensive MCMC algorithms, we propose a new diffusion\nbridge model based on the Minimum Wasserstein distance. This diffusion bridge\nmodel is universal and reduces the training time of the PINN. FKEE also reduces\nthe adverse impact of the curse of dimensionality and weakens the assumptions\non the distribution of $X$ and performance function $f$ in the general MCMC\nexpectation estimator. The theoretical properties of this universal diffusion\nbridge model are also shown. Finally, we demonstrate the advantages and\npotential applications of this method through various concrete experiments,\nincluding the challenging task of approximating the partition function in the\nrandom graph model such as the Ising model.\n","authors":["Jingyuan Li","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05564v4","updated":"2024-07-02T07:27:27Z","published":"2023-08-10T13:24:45Z","title":"Large Skew-t Copula Models and Asymmetric Dependence in Intraday Equity\n Returns","summary":" Skew-t copula models are attractive for the modeling of financial data\nbecause they allow for asymmetric and extreme tail dependence. We show that the\ncopula implicit in the skew-t distribution of Azzalini and Capitanio (2003)\nallows for a higher level of pairwise asymmetric dependence than two popular\nalternative skew-t copulas. Estimation of this copula in high dimensions is\nchallenging, and we propose a fast and accurate Bayesian variational inference\n(VI) approach to do so. The method uses a generative representation of the\nskew-t distribution to define an augmented posterior that can be approximated\naccurately. A stochastic gradient ascent algorithm is used to solve the\nvariational optimization. The methodology is used to estimate skew-t factor\ncopula models with up to 15 factors for intraday returns from 2017 to 2021 on\n93 U.S. equities. The copula captures substantial heterogeneity in asymmetric\ndependence over equity pairs, in addition to the variability in pairwise\ncorrelations. In a moving window study we show that the asymmetric dependencies\nalso vary over time, and that intraday predictive densities from the skew-t\ncopula are more accurate than those from benchmark copula models. 
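For the skew-t copula entry above, a generative representation of a skew-t distribution can be simulated directly, which makes asymmetric tail dependence easy to see empirically. The toy below draws from a simplified bivariate skew-t-style construction and compares lower- and upper-tail co-exceedance rates; the parameters are arbitrary and this is not the paper's estimator or exact parameterisation.

```python
# Toy illustration: sample a heavy-tailed, skewed bivariate distribution and compare
# empirical lower- vs upper-tail co-exceedance probabilities (asymmetric tail dependence).
import numpy as np

rng = np.random.default_rng(0)
n, nu, rho, delta = 200_000, 5.0, 0.6, -0.7       # sample size, dof, correlation, skewness

cov = np.array([[1.0, rho], [rho, 1.0]])
u1 = rng.multivariate_normal(np.zeros(2), cov, size=n)   # correlated Gaussian part
u0 = np.abs(rng.standard_normal(n))[:, None]             # shared half-normal skewing factor
skew_normal = delta * u0 + np.sqrt(1 - delta ** 2) * u1
w = rng.chisquare(nu, size=n)[:, None]
x = skew_normal / np.sqrt(w / nu)                        # heavy-tailed, skewed sample

def co_exceedance(sample, q, lower=True):
    """P(both components beyond their q-tail) / q, an empirical tail-dependence proxy."""
    thr = np.quantile(sample, q if lower else 1 - q, axis=0)
    hit = (sample <= thr) if lower else (sample >= thr)
    return np.mean(hit.all(axis=1)) / q

q = 0.01
print("lower-tail dependence ~", co_exceedance(x, q, lower=True))
print("upper-tail dependence ~", co_exceedance(x, q, lower=False))
```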
Portfolio\nselection strategies based on the estimated pairwise asymmetric dependencies\nimprove performance relative to the index.\n","authors":["Lin Deng","Michael Stanley Smith","Worapree Maneesoonthorn"],"pdf_url":"https://arxiv.org/pdf/2308.05564v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01991v1","updated":"2024-07-02T07:06:49Z","published":"2024-07-02T07:06:49Z","title":"Generation of Geodesics with Actor-Critic Reinforcement Learning to\n Predict Midpoints","summary":" To find the shortest paths for all pairs on continuous manifolds with\ninfinitesimally defined metrics, we propose to generate them by predicting\nmidpoints recursively and an actor-critic method to learn midpoint prediction.\nWe prove the soundness of our approach and show experimentally that the\nproposed method outperforms existing methods on both local and global path\nplanning tasks.\n","authors":["Kazumi Kasaura"],"pdf_url":"https://arxiv.org/pdf/2407.01991v1.pdf","comment":"15 pages with 6 pages of appendices and references, 8 figures"},{"id":"http://arxiv.org/abs/2405.05714v2","updated":"2024-07-02T07:06:15Z","published":"2024-05-08T12:13:40Z","title":"Estimating Noisy Class Posterior with Part-level Labels for Noisy Label\n Learning","summary":" In noisy label learning, estimating noisy class posteriors plays a\nfundamental role for developing consistent classifiers, as it forms the basis\nfor estimating clean class posteriors and the transition matrix. Existing\nmethods typically learn noisy class posteriors by training a classification\nmodel with noisy labels. However, when labels are incorrect, these models may\nbe misled to overemphasize the feature parts that do not reflect the instance\ncharacteristics, resulting in significant errors in estimating noisy class\nposteriors. To address this issue, this paper proposes to augment the\nsupervised information with part-level labels, encouraging the model to focus\non and integrate richer information from various parts. Specifically, our\nmethod first partitions features into distinct parts by cropping instances,\nyielding part-level labels associated with these various parts. Subsequently,\nwe introduce a novel single-to-multiple transition matrix to model the\nrelationship between the noisy and part-level labels, which incorporates\npart-level labels into a classifier-consistent framework. Utilizing this\nframework with part-level labels, we can learn the noisy class posteriors more\nprecisely by guiding the model to integrate information from various parts,\nultimately improving the classification performance. Our method is\ntheoretically sound, while experiments show that it is empirically effective in\nsynthetic and real-world noisy benchmarks.\n","authors":["Rui Zhao","Bin Shi","Jianfei Ruan","Tianze Pan","Bo Dong"],"pdf_url":"https://arxiv.org/pdf/2405.05714v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2205.11168v4","updated":"2024-07-02T06:57:21Z","published":"2022-05-23T10:15:00Z","title":"Logarithmic regret bounds for continuous-time average-reward Markov\n decision processes","summary":" We consider reinforcement learning for continuous-time Markov decision\nprocesses (MDPs) in the infinite-horizon, average-reward setting. In contrast\nto discrete-time MDPs, a continuous-time process moves to a state and stays\nthere for a random holding time after an action is taken. 
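The geodesic-generation entry above builds shortest paths by recursively predicting midpoints. The sketch below shows only the recursion scaffold with a placeholder Euclidean midpoint; in the paper's setting the predictor would be the learned actor-critic model on the manifold.

```python
# Minimal sketch of recursive midpoint subdivision: given any midpoint predictor,
# a full path between two points is generated by recursively inserting midpoints.
import numpy as np

def predict_midpoint(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    return 0.5 * (a + b)          # placeholder for a learned midpoint predictor

def generate_path(a, b, depth=3):
    """Recursively insert predicted midpoints; returns 2**depth + 1 waypoints."""
    if depth == 0:
        return [a, b]
    m = predict_midpoint(a, b)
    left = generate_path(a, m, depth - 1)
    return left[:-1] + generate_path(m, b, depth - 1)

start, goal = np.array([0.0, 0.0]), np.array([1.0, 2.0])
print(np.round(generate_path(start, goal, depth=2), 3))
```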
With unknown\ntransition probabilities and rates of exponential holding times, we derive\ninstance-dependent regret lower bounds that are logarithmic in the time\nhorizon. Moreover, we design a learning algorithm and establish a finite-time\nregret bound that achieves the logarithmic growth rate. Our analysis builds\nupon upper confidence reinforcement learning, a delicate estimation of the mean\nholding times, and stochastic comparison of point processes.\n","authors":["Xuefeng Gao","Xun Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2205.11168v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03780v2","updated":"2024-07-02T06:54:56Z","published":"2023-11-07T07:53:06Z","title":"DynaSemble: Dynamic Ensembling of Textual and Structure-Based Models for\n Knowledge Graph Completion","summary":" We consider two popular approaches to Knowledge Graph Completion (KGC):\ntextual models that rely on textual entity descriptions, and structure-based\nmodels that exploit the connectivity structure of the Knowledge Graph (KG).\nPreliminary experiments show that these approaches have complementary\nstrengths: structure-based models perform exceptionally well when the gold\nanswer is easily reachable from the query head in the KG, while textual models\nexploit descriptions to give good performance even when the gold answer is not\neasily reachable. In response, we propose DynaSemble, a novel method for\nlearning query-dependent ensemble weights to combine these approaches by using\nthe distributions of scores assigned by the models in the ensemble to all\ncandidate entities. DynaSemble achieves state-of-the-art results on three\nstandard KGC datasets, with up to 6.8 pt MRR and 8.3 pt Hits@1 gains over the\nbest baseline model for the WN18RR dataset.\n","authors":["Ananjan Nandi","Navdeep Kaur","Parag Singla"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2311.03780v2.pdf","comment":"12 pages, 2 figures, 15 tables Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2407.01985v1","updated":"2024-07-02T06:54:46Z","published":"2024-07-02T06:54:46Z","title":"The Epistemic Uncertainty Hole: an issue of Bayesian Neural Networks","summary":" Bayesian Deep Learning (BDL) gives access not only to aleatoric uncertainty,\nas standard neural networks already do, but also to epistemic uncertainty, a\nmeasure of confidence a model has in its own predictions. In this article, we\nshow through experiments that the evolution of epistemic uncertainty metrics\nregarding the model size and the size of the training set, goes against\ntheoretical expectations. More precisely, we observe that the epistemic\nuncertainty collapses literally in the presence of large models and sometimes\nalso of little training data, while we expect the exact opposite behaviour.\nThis phenomenon, which we call \"epistemic uncertainty hole\", is all the more\nproblematic as it undermines the entire applicative potential of BDL, which is\nbased precisely on the use of epistemic uncertainty. 
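The DynaSemble entry above combines textual and structure-based KGC models with query-dependent weights computed from the models' score distributions. The snippet below is a loose, hypothetical illustration of that idea using two hand-picked score statistics; the actual features and weighting scheme in the paper differ.

```python
# Hedged sketch: mix two models' candidate scores with weights derived, per query,
# from simple statistics of each model's score distribution.
import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def ensemble_scores(text_scores, struct_scores, w):
    """w maps per-model score statistics to mixing weights for this query."""
    feats = np.array([[text_scores.std(), text_scores.max()],
                      [struct_scores.std(), struct_scores.max()]])
    mix = softmax(feats @ w)                  # query-dependent weights, sum to 1
    return mix[0] * text_scores + mix[1] * struct_scores

rng = np.random.default_rng(1)
text_scores = rng.random(50)                  # scores over 50 candidate entities
struct_scores = rng.random(50)
w = np.array([1.0, 0.5])                      # would be learned on validation queries
print(ensemble_scores(text_scores, struct_scores, w).argmax())
```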
As an example, we evaluate\nthe practical consequences of this uncertainty hole on one of the main\napplications of BDL, namely the detection of out-of-distribution samples\n","authors":["Mohammed Fellaji","Frédéric Pennerath"],"pdf_url":"https://arxiv.org/pdf/2407.01985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00116v2","updated":"2024-07-02T06:51:09Z","published":"2024-06-27T14:00:11Z","title":"Generative AI for Synthetic Data Across Multiple Medical Modalities: A\n Systematic Review of Recent Developments and Challenges","summary":" This paper presents a comprehensive systematic review of generative models\n(GANs, VAEs, DMs, and LLMs) used to synthesize various medical data types,\nincluding imaging (dermoscopic, mammographic, ultrasound, CT, MRI, and X-ray),\ntext, time-series, and tabular data (EHR). Unlike previous narrowly focused\nreviews, our study encompasses a broad array of medical data modalities and\nexplores various generative models. Our search strategy queries databases such\nas Scopus, PubMed, and ArXiv, focusing on recent works from January 2021 to\nNovember 2023, excluding reviews and perspectives. This period emphasizes\nrecent advancements beyond GANs, which have been extensively covered\npreviously.\n The survey reveals insights from three key aspects: (1) Synthesis\napplications and purpose of synthesis, (2) generation techniques, and (3)\nevaluation methods. It highlights clinically valid synthesis applications,\ndemonstrating the potential of synthetic data to tackle diverse clinical\nrequirements. While conditional models incorporating class labels, segmentation\nmasks and image translations are prevalent, there is a gap in utilizing prior\nclinical knowledge and patient-specific context, suggesting a need for more\npersonalized synthesis approaches and emphasizing the importance of tailoring\ngenerative approaches to the unique characteristics of medical data.\nAdditionally, there is a significant gap in using synthetic data beyond\naugmentation, such as for validation and evaluation of downstream medical AI\nmodels. The survey uncovers that the lack of standardized evaluation\nmethodologies tailored to medical images is a barrier to clinical application,\nunderscoring the need for in-depth evaluation approaches, benchmarking, and\ncomparative studies to promote openness and collaboration.\n","authors":["Mahmoud Ibrahim","Yasmina Al Khalil","Sina Amirrajab","Chang Sun","Marcel Breeuwer","Josien Pluim","Bart Elen","Gokhan Ertaylan","Michel Dumontier"],"pdf_url":"https://arxiv.org/pdf/2407.00116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01979v1","updated":"2024-07-02T06:31:13Z","published":"2024-07-02T06:31:13Z","title":"Unveiling Global Interactive Patterns across Graphs: Towards\n Interpretable Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have emerged as a prominent framework for graph\nmining, leading to significant advances across various domains. Stemmed from\nthe node-wise representations of GNNs, existing explanation studies have\nembraced the subgraph-specific viewpoint that attributes the decision results\nto the salient features and local structures of nodes. However, graph-level\ntasks necessitate long-range dependencies and global interactions for advanced\nGNNs, deviating significantly from subgraph-specific explanations. 
To bridge\nthis gap, this paper proposes a novel intrinsically interpretable scheme for\ngraph classification, termed as Global Interactive Pattern (GIP) learning,\nwhich introduces learnable global interactive patterns to explicitly interpret\ndecisions. GIP first tackles the complexity of interpretation by clustering\nnumerous nodes using a constrained graph clustering module. Then, it matches\nthe coarsened global interactive instance with a batch of self-interpretable\ngraph prototypes, thereby facilitating a transparent graph-level reasoning\nprocess. Extensive experiments conducted on both synthetic and real-world\nbenchmarks demonstrate that the proposed GIP yields significantly superior\ninterpretability and competitive performance to~the state-of-the-art\ncounterparts. Our code will be made publicly available.\n","authors":["Yuwen Wang","Shunyu Liu","Tongya Zheng","Kaixuan Chen","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2407.01979v1.pdf","comment":"Accepted in KDD2024"},{"id":"http://arxiv.org/abs/2402.15350v2","updated":"2024-07-02T06:12:05Z","published":"2024-02-23T14:38:05Z","title":"Farsight: Fostering Responsible AI Awareness During AI Application\n Prototyping","summary":" Prompt-based interfaces for Large Language Models (LLMs) have made\nprototyping and building AI-powered applications easier than ever before.\nHowever, identifying potential harms that may arise from AI applications\nremains a challenge, particularly during prompt-based prototyping. To address\nthis, we present Farsight, a novel in situ interactive tool that helps people\nidentify potential harms from the AI applications they are prototyping. Based\non a user's prompt, Farsight highlights news articles about relevant AI\nincidents and allows users to explore and edit LLM-generated use cases,\nstakeholders, and harms. We report design insights from a co-design study with\n10 AI prototypers and findings from a user study with 42 AI prototypers. After\nusing Farsight, AI prototypers in our user study are better able to\nindependently identify potential harms associated with a prompt and find our\ntool more useful and usable than existing resources. Their qualitative feedback\nalso highlights that Farsight encourages them to focus on end-users and think\nbeyond immediate harms. We discuss these findings and reflect on their\nimplications for designing AI prototyping experiences that meaningfully engage\nwith AI harms. Farsight is publicly accessible at:\nhttps://PAIR-code.github.io/farsight.\n","authors":["Zijie J. Wang","Chinmay Kulkarni","Lauren Wilcox","Michael Terry","Michael Madaio"],"pdf_url":"https://arxiv.org/pdf/2402.15350v2.pdf","comment":"Accepted to CHI 2024 (Best Paper, Honorable Mention). 40 pages, 19\n figures, 5 tables. For a demo video, see https://youtu.be/BlSFbGkOlHk. For a\n live demo, visit https://PAIR-code.github.io/farsight. The source code is\n available at https://github.com/PAIR-code/farsight"},{"id":"http://arxiv.org/abs/2407.01972v1","updated":"2024-07-02T06:08:55Z","published":"2024-07-02T06:08:55Z","title":"MeMemo: On-device Retrieval Augmentation for Private and Personalized\n Text Generation","summary":" Retrieval-augmented text generation (RAG) addresses the common limitations of\nlarge language models (LLMs), such as hallucination, by retrieving information\nfrom an updatable external knowledge base. 
However, existing approaches often\nrequire dedicated backend servers for data storage and retrieval, thereby\nlimiting their applicability in use cases that require strict data privacy,\nsuch as personal finance, education, and medicine. To address the pressing need\nfor client-side dense retrieval, we introduce MeMemo, the first open-source\nJavaScript toolkit that adapts the state-of-the-art approximate nearest\nneighbor search technique HNSW to browser environments. Developed with modern\nand native Web technologies, such as IndexedDB and Web Workers, our toolkit\nleverages client-side hardware capabilities to enable researchers and\ndevelopers to efficiently search through millions of high-dimensional vectors\nin the browser. MeMemo enables exciting new design and research opportunities,\nsuch as private and personalized content creation and interactive prototyping,\nas demonstrated in our example application RAG Playground. Reflecting on our\nwork, we discuss the opportunities and challenges for on-device dense\nretrieval. MeMemo is available at https://github.com/poloclub/mememo.\n","authors":["Zijie J. Wang","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2407.01972v1.pdf","comment":"Accepted to SIGIR 2024. 6 pages, 2 figures. For a live demo, visit\n https://poloclub.github.io/mememo/. Code is open-source at\n https://github.com/poloclub/mememo"},{"id":"http://arxiv.org/abs/2404.02180v3","updated":"2024-07-02T05:52:15Z","published":"2024-04-02T09:15:32Z","title":"Remote sensing framework for geological mapping via stacked autoencoders\n and clustering","summary":" Supervised machine learning methods for geological mapping via remote sensing\nface limitations due to the scarcity of accurately labelled training data that\ncan be addressed by unsupervised learning, such as dimensionality reduction and\nclustering. Dimensionality reduction methods have the potential to play a\ncrucial role in improving the accuracy of geological maps. Although\nconventional dimensionality reduction methods may struggle with nonlinear data,\nunsupervised deep learning models such as autoencoders can model non-linear\nrelationships. Stacked autoencoders feature multiple interconnected layers to\ncapture hierarchical data representations useful for remote sensing data. This\nstudy presents an unsupervised machine learning-based framework for processing\nremote sensing data using stacked autoencoders for dimensionality reduction and\nk-means clustering for mapping geological units. We use Landsat 8, ASTER, and\nSentinel-2 datasets to evaluate the framework for geological mapping of the\nMutawintji region in Western New South Wales, Australia. We also compare\nstacked autoencoders with principal component analysis and canonical\nautoencoders. Our results reveal that the framework produces accurate and\ninterpretable geological maps, efficiently discriminating rock units. We find\nthat the accuracy of stacked autoencoders ranges from 86.6 % to 90 %, depending\non the remote sensing data type, which is superior to their counterparts. 
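The geological-mapping entry above reduces multispectral pixels with a stacked autoencoder and then clusters the latent codes with k-means. A compact sketch of that recipe on synthetic data follows; the band count, layer sizes, and training budget are placeholders.

```python
# Sketch of the recipe: stacked autoencoder for dimensionality reduction,
# then k-means clustering on the latent codes to produce map units.
import torch
import torch.nn as nn
from sklearn.cluster import KMeans

bands = 12                                  # e.g. number of spectral bands per pixel
x = torch.rand(2000, bands)                 # stand-in for remote-sensing pixels

autoencoder = nn.Sequential(                # stacked encoder-decoder
    nn.Linear(bands, 32), nn.ReLU(),
    nn.Linear(32, 8), nn.ReLU(),            # 8-D latent representation
    nn.Linear(8, 32), nn.ReLU(),
    nn.Linear(32, bands),
)
opt = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)
for _ in range(200):                        # short reconstruction training loop
    opt.zero_grad()
    loss = ((autoencoder(x) - x) ** 2).mean()
    loss.backward()
    opt.step()

with torch.no_grad():                       # latent codes = output of the encoder half
    latent = x
    for layer in list(autoencoder)[:4]:
        latent = layer(latent)

labels = KMeans(n_clusters=6, n_init=10).fit_predict(latent.numpy())
print("cluster sizes:", [int((labels == k).sum()) for k in range(6)])
```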
We\nalso find that the generated maps align with prior geological knowledge of the\nstudy area while providing novel insights into geological structures.\n","authors":["Sandeep Nagar","Ehsan Farahbakhsh","Joseph Awange","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2404.02180v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01960v1","updated":"2024-07-02T05:31:59Z","published":"2024-07-02T05:31:59Z","title":"Zero-shot Video Restoration and Enhancement Using Pre-Trained Image\n Diffusion Model","summary":" Diffusion-based zero-shot image restoration and enhancement models have\nachieved great success in various image restoration and enhancement tasks\nwithout training. However, directly applying them to video restoration and\nenhancement results in severe temporal flickering artifacts. In this paper, we\npropose the first framework for zero-shot video restoration and enhancement\nbased on a pre-trained image diffusion model. By replacing the self-attention\nlayer with the proposed cross-previous-frame attention layer, the pre-trained\nimage diffusion model can take advantage of the temporal correlation between\nneighboring frames. We further propose temporal consistency guidance,\nspatial-temporal noise sharing, and an early stopping sampling strategy for\nbetter temporally consistent sampling. Our method is a plug-and-play module\nthat can be inserted into any diffusion-based zero-shot image restoration or\nenhancement methods to further improve their performance. Experimental results\ndemonstrate the superiority of our proposed method in producing temporally\nconsistent videos with better fidelity.\n","authors":["Cong Cao","Huanjing Yue","Xin Liu","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01960v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2310.04218v4","updated":"2024-07-02T05:19:32Z","published":"2023-10-06T13:05:07Z","title":"A Fixed-Parameter Tractable Algorithm for Counting Markov Equivalence\n Classes with the same Skeleton","summary":" Causal DAGs (also known as Bayesian networks) are a popular tool for encoding\nconditional dependencies between random variables. In a causal DAG, the random\nvariables are modeled as vertices in the DAG, and it is stipulated that every\nrandom variable is independent of its ancestors conditioned on its parents. It\nis possible, however, for two different causal DAGs on the same set of random\nvariables to encode exactly the same set of conditional dependencies. Such\ncausal DAGs are said to be Markov equivalent, and equivalence classes of Markov\nequivalent DAGs are known as Markov Equivalent Classes (MECs). Beautiful\ncombinatorial characterizations of MECs have been developed in the past few\ndecades, and it is known, in particular that all DAGs in the same MEC must have\nthe same \"skeleton\" (underlying undirected graph) and v-structures (induced\nsubgraph of the form $a\\rightarrow b \\leftarrow c$).\n These combinatorial characterizations also suggest several natural\nalgorithmic questions. One of these is: given an undirected graph $G$ as input,\nhow many distinct Markov equivalence classes have the skeleton $G$? Much work\nhas been devoted in the last few years to this and other closely related\nproblems. 
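The zero-shot video restoration entry above replaces self-attention with attention whose keys and values come from the previous frame. The toy function below illustrates that cross-previous-frame attention pattern in isolation; it is not the diffusion model's actual layer, and the shapes and projections are arbitrary.

```python
# Illustrative sketch: queries come from the current frame, keys/values from the
# previous frame, so the two frames' features are tied together during denoising.
import numpy as np

def softmax(a, axis=-1):
    e = np.exp(a - a.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def cross_previous_frame_attention(curr, prev, wq, wk, wv):
    """curr/prev: (tokens, dim) feature maps of the current and previous frame."""
    q, k, v = curr @ wq, prev @ wk, prev @ wv
    attn = softmax(q @ k.T / np.sqrt(k.shape[-1]))
    return attn @ v                               # current frame attends to previous frame

dim, tokens = 16, 64
rng = np.random.default_rng(0)
wq, wk, wv = (rng.standard_normal((dim, dim)) * 0.1 for _ in range(3))
curr, prev = rng.standard_normal((tokens, dim)), rng.standard_normal((tokens, dim))
print(cross_previous_frame_attention(curr, prev, wq, wk, wv).shape)
```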
However, to the best of our knowledge, a polynomial time algorithm\nfor the problem remains unknown.\n In this paper, we make progress towards this goal by giving a fixed parameter\ntractable algorithm for the above problem, with the parameters being the\ntreewidth and the maximum degree of the input graph $G$. The main technical\ningredient in our work is a construction we refer to as shadow, which lets us\ncreate a \"local description\" of long-range constraints imposed by the\ncombinatorial characterizations of MECs.\n","authors":["Vidya Sagar Sharma"],"pdf_url":"https://arxiv.org/pdf/2310.04218v4.pdf","comment":"75 pages, 2 Figures"},{"id":"http://arxiv.org/abs/2405.13937v4","updated":"2024-07-02T05:14:10Z","published":"2024-05-22T19:10:24Z","title":"DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs","summary":" Dynamic graphs are pervasive in the real world, modeling dynamic relations\nbetween objects across various fields. For dynamic graph modeling, dynamic\ngraph neural networks (DGNNs) have emerged as a mainstream technique, which are\ngenerally pre-trained on the link prediction task, leaving a significant gap\nfrom the objectives of downstream tasks such as node classification. To bridge\nthe gap, prompt-based learning has gained traction on graphs. However, existing\nefforts focus on static graphs, neglecting the evolution of dynamic graphs. In\nthis paper, we propose DyGPrompt, a novel pre-training and prompting framework\nfor dynamic graph modeling. First, we design dual prompts to address the gap in\nboth task objectives and dynamic variations across pre-training and downstream\ntasks. Second, we recognize that node and time features mutually characterize\neach other, and propose dual condition-nets to model the evolving node-time\npatterns in downstream tasks. Finally, we thoroughly evaluate and analyze\nDyGPrompt through extensive experiments on three public datasets.\n","authors":["Xingtong Yu","Zhenghao Liu","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.13937v4.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.01953v1","updated":"2024-07-02T05:04:13Z","published":"2024-07-02T05:04:13Z","title":"CatMemo at the FinLLM Challenge Task: Fine-Tuning Large Language Models\n using Data Fusion in Financial Applications","summary":" The integration of Large Language Models (LLMs) into financial analysis has\ngarnered significant attention in the NLP community. This paper presents our\nsolution to IJCAI-2024 FinLLM challenge, investigating the capabilities of LLMs\nwithin three critical areas of financial tasks: financial classification,\nfinancial text summarization, and single stock trading. We adopted Llama3-8B\nand Mistral-7B as base models, fine-tuning them through Parameter Efficient\nFine-Tuning (PEFT) and Low-Rank Adaptation (LoRA) approaches. To enhance model\nperformance, we combine datasets from task 1 and task 2 for data fusion. 
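The FinLLM challenge entry above fine-tunes Llama3-8B and Mistral-7B with PEFT and LoRA. The snippet below shows a generic LoRA setup with the `peft` library, using `gpt2` as a small stand-in model so it runs locally; the rank, alpha, and target modules are illustrative rather than the team's configuration (Llama/Mistral-style models would typically target `q_proj`/`v_proj`).

```python
# Generic LoRA/PEFT setup sketch. Requires the `transformers` and `peft` packages.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

base = AutoModelForCausalLM.from_pretrained("gpt2")   # small stand-in base model

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # low-rank update dimension (illustrative)
    lora_alpha=32,           # scaling of the LoRA update (illustrative)
    lora_dropout=0.05,
    target_modules=["c_attn"],   # GPT-2 attention projection; model-dependent in general
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()   # only the LoRA adapters are trainable

# ...fine-tune `model` on the fused instruction data with a standard causal-LM loss
# (e.g. via transformers.Trainer), then save or merge the adapters.
```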
Our\napproach aims to tackle these diverse tasks in a comprehensive and integrated\nmanner, showcasing LLMs' capacity to address diverse and complex financial\ntasks with improved accuracy and decision-making capabilities.\n","authors":["Yupeng Cao","Zhiyuan Yao","Zhi Chen","Zhiyang Deng"],"pdf_url":"https://arxiv.org/pdf/2407.01953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01948v1","updated":"2024-07-02T04:39:19Z","published":"2024-07-02T04:39:19Z","title":"Extracting and Encoding: Leveraging Large Language Models and Medical\n Knowledge to Enhance Radiological Text Representation","summary":" Advancing representation learning in specialized fields like medicine remains\nchallenging due to the scarcity of expert annotations for text and images. To\ntackle this issue, we present a novel two-stage framework designed to extract\nhigh-quality factual statements from free-text radiology reports in order to\nimprove the representations of text encoders and, consequently, their\nperformance on various downstream tasks. In the first stage, we propose a\n\\textit{Fact Extractor} that leverages large language models (LLMs) to identify\nfactual statements from well-curated domain-specific datasets. In the second\nstage, we introduce a \\textit{Fact Encoder} (CXRFE) based on a BERT model\nfine-tuned with objective functions designed to improve its representations\nusing the extracted factual data. Our framework also includes a new\nembedding-based metric (CXRFEScore) for evaluating chest X-ray text generation\nsystems, leveraging both stages of our approach. Extensive evaluations show\nthat our fact extractor and encoder outperform current state-of-the-art methods\nin tasks such as sentence ranking, natural language inference, and label\nextraction from radiology reports. Additionally, our metric proves to be more\nrobust and effective than existing metrics commonly used in the radiology\nreport generation literature. The code of this project is available at\n\\url{https://github.com/PabloMessina/CXR-Fact-Encoder}.\n","authors":["Pablo Messina","René Vidal","Denis Parra","Álvaro Soto","Vladimir Araujo"],"pdf_url":"https://arxiv.org/pdf/2407.01948v1.pdf","comment":"Accepted to ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2202.08433v3","updated":"2024-07-02T04:06:53Z","published":"2022-02-17T03:29:20Z","title":"ADD 2022: the First Audio Deep Synthesis Detection Challenge","summary":" Audio deepfake detection is an emerging topic, which was included in the\nASVspoof 2021. However, the recent shared tasks have not covered many real-life\nand challenging scenarios. The first Audio Deep synthesis Detection challenge\n(ADD) was motivated to fill in the gap. The ADD 2022 includes three tracks:\nlow-quality fake audio detection (LF), partially fake audio detection (PF) and\naudio fake game (FG). The LF track focuses on dealing with bona fide and fully\nfake utterances with various real-world noises etc. The PF track aims to\ndistinguish the partially fake audio from the real. The FG track is a rivalry\ngame, which includes two tasks: an audio generation task and an audio fake\ndetection task. In this paper, we describe the datasets, evaluation metrics,\nand protocols. 
We also report major findings that reflect the recent advances\nin audio deepfake detection tasks.\n","authors":["Jiangyan Yi","Ruibo Fu","Jianhua Tao","Shuai Nie","Haoxin Ma","Chenglong Wang","Tao Wang","Zhengkun Tian","Xiaohui Zhang","Ye Bai","Cunhang Fan","Shan Liang","Shiming Wang","Shuai Zhang","Xinrui Yan","Le Xu","Zhengqi Wen","Haizhou Li","Zheng Lian","Bin Liu"],"pdf_url":"https://arxiv.org/pdf/2202.08433v3.pdf","comment":"Accepted by ICASSP 2022"},{"id":"http://arxiv.org/abs/2407.01258v2","updated":"2024-07-02T03:55:11Z","published":"2024-07-01T13:08:09Z","title":"Introducing a Physics-informed Deep Learning Framework for Bridge Scour\n Prediction","summary":" This paper introduces scour physics-informed neural networks (SPINNs), a\nhybrid physics-data-driven framework for bridge scour prediction using deep\nlearning. SPINNs are developed based on historical scour monitoring data and\nintegrate physics-based empirical equations into neural networks as\nsupplementary loss components. We incorporated three architectures: LSTM, CNN,\nand NLinear as the base data-driven model. Despite varying performance across\ndifferent base models and bridges, SPINNs overall outperformed pure data-driven\nmodels. In some bridge cases, SPINN reduced forecasting errors by up to 50\npercent. In this study, we also explored general models for bridge clusters,\ntrained by aggregating datasets across multiple bridges in a region. The pure\ndata-driven models mostly benefited from this approach, in particular bridges\nwith limited data. However, bridge-specific SPINNs provided more accurate\npredictions than general SPINNs for almost all case studies. Also, the\ntime-dependent empirical equations derived from SPINNs showed reasonable\naccuracy in estimating maximum scour depth, providing more accurate predictions\ncompared to HEC-18. Comparing both SPINNs and pure deep learning models with\ntraditional HEC-18 equation indicates substantial improvements in scour\nprediction accuracy. This study can pave the way for hybrid physics-machine\nlearning methodologies to be implemented for bridge scour design and\nmaintenance.\n","authors":["Negin Yousefpour","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2407.01258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01012v2","updated":"2024-07-02T03:48:06Z","published":"2024-07-01T06:52:34Z","title":"Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural\n Network Performance","summary":" We propose the Swish-T family, an enhancement of the existing non-monotonic\nactivation function Swish. Swish-T is defined by adding a Tanh bias to the\noriginal Swish function. This modification creates a family of Swish-T\nvariants, each designed to excel in different tasks, showcasing specific\nadvantages depending on the application context. The Tanh bias allows for\nbroader acceptance of negative values during initial training stages, offering\na smoother non-monotonic curve than the original Swish. We ultimately propose\nthe Swish-T$_{\\textbf{C}}$ function, while Swish-T and Swish-T$_{\\textbf{B}}$,\nbyproducts of Swish-T$_{\\textbf{C}}$, also demonstrate satisfactory\nperformance. Furthermore, our ablation study shows that using\nSwish-T$_{\\textbf{C}}$ as a non-parametric function can still achieve high\nperformance. The superiority of the Swish-T family has been empirically\ndemonstrated across various models and benchmark datasets, including MNIST,\nFashion MNIST, SVHN, CIFAR-10, and CIFAR-100. 
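The SPINN entry above augments a data-driven scour model with a loss term tied to physics-based empirical equations. Below is a conceptual sketch of such a composite loss; the network, the placeholder "empirical" relation, and the weighting are assumptions, not HEC-18 or the paper's formulation.

```python
# Conceptual sketch: data-fitting loss plus a penalty keeping predictions close to an
# empirical (physics-based) relation, in the spirit of physics-informed training.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))   # inputs -> scour depth

def empirical_scour(features: torch.Tensor) -> torch.Tensor:
    """Placeholder empirical relation (stand-in for a HEC-18-style equation)."""
    velocity, depth = features[:, 0:1], features[:, 1:2]
    return 0.5 * velocity ** 0.6 * depth ** 0.4

def physics_informed_loss(features, observed, lam=0.1):
    pred = model(features)
    data_term = ((pred - observed) ** 2).mean()                        # fit monitoring data
    physics_term = ((pred - empirical_scour(features)) ** 2).mean()    # stay near the empirical law
    return data_term + lam * physics_term

features = torch.rand(64, 4)
observed = torch.rand(64, 1)
loss = physics_informed_loss(features, observed)
loss.backward()
print(float(loss))
```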
The code is publicly available at\n\"https://github.com/ictseoyoungmin/Swish-T-pytorch\".\n","authors":["Youngmin Seo","Jinha Kim","Unsang Park"],"pdf_url":"https://arxiv.org/pdf/2407.01012v2.pdf","comment":"11 pages, 6 figures Revised the derivative of the sigmoid function\n from 1-sigmoid to sigmoid(1-sigmoid) for correctness.Updated related\n equations in Section 3.2 Conclusions to Conclusion in Section 6"},{"id":"http://arxiv.org/abs/2403.13107v2","updated":"2024-07-02T03:35:53Z","published":"2024-03-19T19:15:13Z","title":"Towards Unsupervised Question Answering System with Multi-level\n Summarization for Legal Text","summary":" This paper summarizes Team SCaLAR's work on SemEval-2024 Task 5: Legal\nArgument Reasoning in Civil Procedure. To address this Binary Classification\ntask, which was daunting due to the complexity of the Legal Texts involved, we\npropose a simple yet novel similarity and distance-based unsupervised approach\nto generate labels. Further, we explore the Multi-level fusion of Legal-Bert\nembeddings using ensemble features, including CNN, GRU, and LSTM. To address\nthe lengthy nature of Legal explanation in the dataset, we introduce T5-based\nsegment-wise summarization, which successfully retained crucial information,\nenhancing the model's performance. Our unsupervised system witnessed a 20-point\nincrease in macro F1-score on the development set and a 10-point increase on\nthe test set, which is promising given its uncomplicated architecture.\n","authors":["M Manvith Prabhu","Haricharana Srinivasa","Anand Kumar M"],"pdf_url":"https://arxiv.org/pdf/2403.13107v2.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.01920v1","updated":"2024-07-02T03:34:16Z","published":"2024-07-02T03:34:16Z","title":"To Forget or Not? Towards Practical Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) trained on extensive corpora inevitably retain\nsensitive data, such as personal privacy information and copyrighted material.\nRecent advancements in knowledge unlearning involve updating LLM parameters to\nerase specific knowledge. However, current unlearning paradigms are mired in\nvague forgetting boundaries, often erasing knowledge indiscriminately. In this\nwork, we introduce KnowUnDo, a benchmark containing copyrighted content and\nuser privacy domains to evaluate if the unlearning process inadvertently erases\nessential knowledge. Our findings indicate that existing unlearning methods\noften suffer from excessive unlearning. To address this, we propose a simple\nyet effective method, MemFlex, which utilizes gradient information to precisely\ntarget and unlearn sensitive parameters. Experimental results show that MemFlex\nis superior to existing methods in both precise knowledge unlearning and\ngeneral knowledge retaining of LLMs. 
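The Swish-T entry above defines the activation as Swish plus a Tanh bias, but the abstract does not give the exact parameterisation, so the alpha/beta form below is purely an assumed illustration of the stated idea.

```python
# Quick sketch of "Swish plus a Tanh bias"; the exact Swish-T parameterisation is assumed.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def swish(x, beta=1.0):
    return x * sigmoid(beta * x)

def swish_t_like(x, beta=1.0, alpha=0.1):
    """Swish with an additive tanh term that lets small negative inputs pass through."""
    return swish(x, beta) + alpha * np.tanh(x)

x = np.linspace(-4, 4, 9)
print(np.round(swish(x), 3))
print(np.round(swish_t_like(x), 3))
```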
Code and dataset will be released at\nhttps://github.com/zjunlp/KnowUnDo.\n","authors":["Bozhong Tian","Xiaozhuan Liang","Siyuan Cheng","Qingbin Liu","Mengru Wang","Dianbo Sui","Xi Chen","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01920v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.15274v2","updated":"2024-07-02T03:31:16Z","published":"2024-04-23T17:59:12Z","title":"Metric-guided Image Reconstruction Bounds via Conformal Prediction","summary":" Recent advancements in machine learning have led to the development of novel\nmedical imaging systems and algorithms that address ill-posed problems.\nAssessing their trustworthiness and understanding how to deploy them safely at\ntest time remains an important and open problem. In this work, we propose using\nconformal prediction to compute valid and distribution-free bounds on\ndownstream metrics given reconstructions generated by one algorithm, and\nretrieve upper/lower bounds and inlier/outlier reconstructions according to the\nadjusted bounds. Our work offers 1) test time image reconstruction evaluation\nwithout ground truth, 2) downstream performance guarantees, 3) meaningful\nupper/lower bound reconstructions, and 4) meaningful statistical\ninliers/outlier reconstructions. We demonstrate our method on post-mastectomy\nradiotherapy planning using 3D breast CT reconstructions, and show 1) that\nmetric-guided bounds have valid coverage for downstream metrics while\nconventional pixel-wise bounds do not and 2) anatomical differences of\nupper/lower bounds between metric-guided and pixel-wise methods. Our work paves\nway for more meaningful and trustworthy test-time evaluation of medical image\nreconstructions. Code available at\nhttps://github.com/matthewyccheung/conformal-metric\n","authors":["Matt Y Cheung","Tucker J Netherton","Laurence E Court","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2404.15274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11469v2","updated":"2024-07-02T03:29:11Z","published":"2024-02-18T05:58:25Z","title":"A Curious Case of Searching for the Correlation between Training Data\n and Adversarial Robustness of Transformer Textual Models","summary":" Existing works have shown that fine-tuned textual transformer models achieve\nstate-of-the-art prediction performances but are also vulnerable to adversarial\ntext perturbations. Traditional adversarial evaluation is often done\n\\textit{only after} fine-tuning the models and ignoring the training data. In\nthis paper, we want to prove that there is also a strong correlation between\ntraining data and model robustness. To this end, we extract 13 different\nfeatures representing a wide range of input fine-tuning corpora properties and\nuse them to predict the adversarial robustness of the fine-tuned models.\nFocusing mostly on encoder-only transformer models BERT and RoBERTa with\nadditional results for BART, ELECTRA, and GPT2, we provide diverse evidence to\nsupport our argument. 
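The conformal-prediction entry above computes distribution-free bounds on downstream metrics from a calibration set. The snippet below is a generic split-conformal quantile calculation on synthetic scores, showing the mechanism only; it is not the paper's metric-guided procedure.

```python
# Generic split-conformal sketch: calibrate a quantile of nonconformity scores, then
# use it as a distribution-free bound for a new case.
import numpy as np

rng = np.random.default_rng(0)
alpha = 0.1                                   # target miscoverage level

cal_scores = rng.gamma(2.0, 1.0, size=500)    # nonconformity scores on a calibration set
n = len(cal_scores)
# Finite-sample-adjusted quantile level used by split conformal prediction.
q_level = np.ceil((n + 1) * (1 - alpha)) / n
q_hat = np.quantile(cal_scores, min(q_level, 1.0), method="higher")

test_estimate = 3.2                           # model-predicted downstream metric for a new case
print(f"with ~{1 - alpha:.0%} coverage, true metric <= {test_estimate + q_hat:.2f}")
```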
First, empirical analyses show that (a) extracted\nfeatures can be used with a lightweight classifier such as Random Forest to\npredict the attack success rate effectively, and (b) features with the most\ninfluence on the model robustness have a clear correlation with the robustness.\nSecond, our framework can be used as a fast and effective additional tool for\nrobustness evaluation since it (a) saves 30x-193x runtime compared to the\ntraditional technique, (b) is transferable across models, (c) can be used under\nadversarial training, and (d) robust to statistical randomness. Our code is\npublicly available at \\url{https://github.com/CaptainCuong/RobustText_ACL2024}.\n","authors":["Cuong Dang","Dung D. Le","Thai Le"],"pdf_url":"https://arxiv.org/pdf/2402.11469v2.pdf","comment":"Accepted to ACL Findings 2024"},{"id":"http://arxiv.org/abs/2407.00382v2","updated":"2024-07-02T03:22:04Z","published":"2024-06-29T09:35:12Z","title":"Towards Universal Mesh Movement Networks","summary":" Solving complex Partial Differential Equations (PDEs) accurately and\nefficiently is an essential and challenging problem in all scientific and\nengineering disciplines. Mesh movement methods provide the capability to\nimprove the accuracy of the numerical solution without increasing the overall\nmesh degree of freedom count. Conventional sophisticated mesh movement methods\nare extremely expensive and struggle to handle scenarios with complex boundary\ngeometries. However, existing learning-based methods require re-training from\nscratch given a different PDE type or boundary geometry, which limits their\napplicability, and also often suffer from robustness issues in the form of\ninverted elements. In this paper, we introduce the Universal Mesh Movement\nNetwork (UM2N), which -- once trained -- can be applied in a non-intrusive,\nzero-shot manner to move meshes with different size distributions and\nstructures, for solvers applicable to different PDE types and boundary\ngeometries. UM2N consists of a Graph Transformer (GT) encoder for extracting\nfeatures and a Graph Attention Network (GAT) based decoder for moving the mesh.\nWe evaluate our method on advection and Navier-Stokes based examples, as well\nas a real-world tsunami simulation case. Our method outperforms existing\nlearning-based mesh movement methods in terms of the benchmarks described\nabove. In comparison to the conventional sophisticated Monge-Amp\\`ere\nPDE-solver based method, our approach not only significantly accelerates mesh\nmovement, but also proves effective in scenarios where the conventional method\nfails. Our project page is at https://erizmr.github.io/UM2N/.\n","authors":["Mingrui Zhang","Chunyang Wang","Stephan Kramer","Joseph G. Wallwork","Siyi Li","Jiancheng Liu","Xiang Chen","Matthew D. Piggott"],"pdf_url":"https://arxiv.org/pdf/2407.00382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01910v1","updated":"2024-07-02T03:21:24Z","published":"2024-07-02T03:21:24Z","title":"MG-Verilog: Multi-grained Dataset Towards Enhanced LLM-assisted Verilog\n Generation","summary":" Large Language Models (LLMs) have recently shown promise in streamlining\nhardware design processes by encapsulating vast amounts of domain-specific\ndata. In addition, they allow users to interact with the design processes\nthrough natural language instructions, thus making hardware design more\naccessible to developers. 
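The robustness-prediction entry above fits a lightweight model such as a Random Forest to map corpus-level features of the fine-tuning data to attack success rate, then inspects feature influence. A minimal sketch with synthetic features follows; the 13 real features and measured targets are described in the paper.

```python
# Small sketch: Random Forest mapping corpus features -> attack success rate, plus
# feature importances. Data here is synthetic for illustration.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
n_corpora, n_features = 60, 13
X = rng.random((n_corpora, n_features))                 # corpus properties (toy values)
attack_success_rate = 0.7 * X[:, 0] - 0.4 * X[:, 3] + 0.05 * rng.standard_normal(n_corpora)

model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, attack_success_rate)
print("held-in R^2:", round(model.score(X, attack_success_rate), 3))
print("most influential feature index:", int(np.argmax(model.feature_importances_)))
```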
However, effectively leveraging LLMs in hardware\ndesign necessitates providing domain-specific data during inference (e.g.,\nthrough in-context learning), fine-tuning, or pre-training. Unfortunately,\nexisting publicly available hardware datasets are often limited in size,\ncomplexity, or detail, which hinders the effectiveness of LLMs in hardware\ndesign tasks. To address this issue, we first propose a set of criteria for\ncreating high-quality hardware datasets that can effectively enhance\nLLM-assisted hardware design. Based on these criteria, we propose a\nMulti-Grained-Verilog (MG-Verilog) dataset, which encompasses descriptions at\nvarious levels of detail and corresponding code samples. To benefit the broader\nhardware design community, we have developed an open-source infrastructure that\nfacilitates easy access, integration, and extension of the dataset to meet\nspecific project needs. Furthermore, to fully exploit the potential of the\nMG-Verilog dataset, which varies in complexity and detail, we introduce a\nbalanced fine-tuning scheme. This scheme serves as a unique use case to\nleverage the diverse levels of detail provided by the dataset. Extensive\nexperiments demonstrate that the proposed dataset and fine-tuning scheme\nconsistently improve the performance of LLMs in hardware design tasks.\n","authors":["Yongan Zhang","Zhongzhi Yu","Yonggan Fu","Cheng Wan"," Yingyan"," Lin"],"pdf_url":"https://arxiv.org/pdf/2407.01910v1.pdf","comment":"Accepted in ISLAD 2024"},{"id":"http://arxiv.org/abs/2407.01907v1","updated":"2024-07-02T03:13:27Z","published":"2024-07-02T03:13:27Z","title":"The Solution for the ICCV 2023 Perception Test Challenge 2023 -- Task 6\n -- Grounded videoQA","summary":" In this paper, we introduce a grounded video question-answering solution. Our\nresearch reveals that the fixed official baseline method for video question\nanswering involves two main steps: visual grounding and object tracking.\nHowever, a significant challenge emerges during the initial step, where\nselected frames may lack clearly identifiable target objects. Furthermore,\nsingle images cannot address questions like \"Track the container from which the\nperson pours the first time.\" To tackle this issue, we propose an alternative\ntwo-stage approach:(1) First, we leverage the VALOR model to answer questions\nbased on video information.(2) concatenate the answered questions with their\nrespective answers. Finally, we employ TubeDETR to generate bounding boxes for\nthe targets.\n","authors":["Hailiang Zhang","Dian Chao","Zhihao Guan","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01906v1","updated":"2024-07-02T03:11:13Z","published":"2024-07-02T03:11:13Z","title":"Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for\n Sparse Architectural Large Language Models","summary":" Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large\nLanguage Models (LLMs) with constrained resources. Although there have been\nvarious PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture\nLLMs is still underexplored. 
In this work, we study the PEFT method for LLMs\nwith the Mixture-of-Experts (MoE) architecture and the contents of this work\nare mainly threefold: (1) We investigate the dispersion degree of the activated\nexperts in customized tasks, and found that the routing distribution for a\nspecific task tends to be highly concentrated, while the distribution of\nactivated experts varies significantly across different tasks. (2) We propose\nExpert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant\nto downstream tasks while freezing the other experts and modules; experimental\nresults demonstrate that our method not only improves the tuning efficiency,\nbut also matches or even surpasses the performance of full-parameter\nfine-tuning. (3) We further analyze the impact of the MoE architecture on\nexpert-specialized fine-tuning. We find that MoE models with finer-grained\nexperts are more advantageous in selecting the combination of experts that are\nmost relevant to downstream tasks, thereby enhancing both the training\nefficiency and effectiveness.\n","authors":["Zihan Wang","Deli Chen","Damai Dai","Runxin Xu","Zhuoshu Li","Y. Wu"],"pdf_url":"https://arxiv.org/pdf/2407.01906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01903v1","updated":"2024-07-02T03:08:20Z","published":"2024-07-02T03:08:20Z","title":"Text-Aware Diffusion for Policy Learning","summary":" Training an agent to achieve particular goals or perform desired behaviors is\noften accomplished through reinforcement learning, especially in the absence of\nexpert demonstrations. However, supporting novel goals or behaviors through\nreinforcement learning requires the ad-hoc design of appropriate reward\nfunctions, which quickly becomes intractable. To address this challenge, we\npropose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a\npretrained, frozen text-conditioned diffusion model to compute dense zero-shot\nreward signals for text-aligned policy learning. We hypothesize that\nlarge-scale pretrained generative models encode rich priors that can supervise\na policy to behave not only in a text-aligned manner, but also in alignment\nwith a notion of naturalness summarized from internet-scale training data. In\nour experiments, we demonstrate that TADPoLe is able to learn policies for\nnovel goal-achievement and continuous locomotion behaviors specified by natural\nlanguage, in both Humanoid and Dog environments. The behaviors are learned\nzero-shot without ground-truth rewards or expert demonstrations, and are\nqualitatively more natural according to human evaluation. We further show that\nTADPoLe performs competitively when applied to robotic manipulation tasks in\nthe Meta-World environment.\n","authors":["Calvin Luo","Mandy He","Zilai Zeng","Chen Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00568v2","updated":"2024-07-02T02:54:49Z","published":"2024-06-30T02:50:28Z","title":"Divide And Conquer: Learning Chaotic Dynamical Systems With Multistep\n Penalty Neural Ordinary Differential Equations","summary":" Forecasting high-dimensional dynamical systems is a fundamental challenge in\nvarious fields, such as the geosciences and engineering. Neural Ordinary\nDifferential Equations (NODEs), which combine the power of neural networks and\nnumerical solvers, have emerged as a promising algorithm for forecasting\ncomplex nonlinear dynamical systems. 
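The ESFT entry above tunes only the experts most relevant to a downstream task and freezes the rest. The toy below illustrates that selection-and-freeze pattern on a stand-in MoE layer; the routing statistic and the top-k rule are assumptions for illustration, not the paper's procedure.

```python
# Rough sketch: measure expert usage on task data, then enable gradients only for the
# most-used experts and freeze everything else.
import torch
import torch.nn as nn

class ToyMoE(nn.Module):
    def __init__(self, dim=16, num_experts=8):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_experts)])

    def routing_counts(self, x):
        top1 = self.router(x).argmax(dim=-1)                  # which expert each token uses
        return torch.bincount(top1, minlength=len(self.experts))

moe = ToyMoE()
task_tokens = torch.randn(512, 16)
counts = moe.routing_counts(task_tokens)
relevant = set(torch.topk(counts.float(), k=2).indices.tolist())   # keep the 2 most-used experts

for idx, expert in enumerate(moe.experts):
    for p in expert.parameters():
        p.requires_grad = idx in relevant                      # freeze all other experts
print("trainable experts:", sorted(relevant))
```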
However, classical techniques used for\nNODE training are ineffective for learning chaotic dynamical systems. In this\nwork, we propose a novel NODE-training approach that allows for robust learning\nof chaotic dynamical systems. Our method addresses the challenges of\nnon-convexity and exploding gradients associated with underlying chaotic\ndynamics. Training data trajectories from such systems are split into multiple,\nnon-overlapping time windows. In addition to the deviation from the training\ndata, the optimization loss term further penalizes the discontinuities of the\npredicted trajectory between the time windows. The window size is selected\nbased on the fastest Lyapunov time scale of the system. The multistep penalty (MP)\nmethod is first demonstrated on the Lorenz equations to illustrate how it improves\nthe loss landscape and thereby accelerates the optimization convergence. The MP\nmethod can optimize chaotic systems in a manner similar to least-squares\nshadowing with significantly lower computational costs. Our proposed algorithm,\ndenoted the Multistep Penalty NODE (MP-NODE), is applied to chaotic systems such\nas the Kuramoto-Sivashinsky equation and the two-dimensional Kolmogorov flow.\nIt is observed that MP-NODE provides viable performance for such chaotic\nsystems, not only for short-term trajectory predictions but also for invariant\nstatistics that are hallmarks of the chaotic nature of these dynamics.\n","authors":["Dibyajyoti Chakraborty","Seung Whan Chung","Romit Maulik"],"pdf_url":"https://arxiv.org/pdf/2407.00568v2.pdf","comment":"20 pages, 10 Figures, submitted to Journal of Computational Physics"},{"id":"http://arxiv.org/abs/2406.10787v2","updated":"2024-07-02T02:38:45Z","published":"2024-06-16T03:00:16Z","title":"Evidential Uncertainty Sets in Deep Classifiers Using Conformal\n Prediction","summary":" In this paper, we propose the Evidential Conformal Prediction (ECP) method for\nimage classifiers to generate the conformal prediction sets. Our method is\ndesigned based on a non-conformity score function that has its roots in\nEvidential Deep Learning (EDL) as a method of quantifying model (epistemic)\nuncertainty in DNN classifiers. We use evidence derived from the logit\nvalues of target labels to compute the components of our non-conformity score\nfunction: the heuristic notion of uncertainty in CP, uncertainty surprisal, and\nexpected utility. Our extensive experimental evaluation demonstrates that ECP\noutperforms three state-of-the-art methods for generating CP sets, in terms of\ntheir set sizes and adaptivity while maintaining the coverage of true labels.\n","authors":["Hamed Karimi","Reza Samavi"],"pdf_url":"https://arxiv.org/pdf/2406.10787v2.pdf","comment":"Accepted in 13th Symposium on Conformal and Probabilistic Prediction\n with Applications (COPA2024). To be published in the Proceedings of Machine\n Learning Research (PMLR), vol. 230, 2024 (24 Pages)"},{"id":"http://arxiv.org/abs/2407.01887v1","updated":"2024-07-02T02:18:14Z","published":"2024-07-02T02:18:14Z","title":"Beyond Numeric Awards: In-Context Dueling Bandits with LLM Agents","summary":" In-context decision-making is an important capability of artificial general\nintelligence, which Large Language Models (LLMs) have effectively demonstrated\nin various scenarios. However, LLMs often face challenges when dealing with\nnumerical contexts, and limited attention has been paid to evaluating their\nperformance through preference feedback generated by the environment. 
This\npaper investigates the performance of LLMs as decision-makers in the context of\nDueling Bandits (DB). We first evaluate the performance of LLMs by comparing\nGPT-3.5-Turbo, GPT-4, and GPT-4-Turbo against established DB algorithms. Our\nresults reveal that LLMs, particularly GPT-4 Turbo, quickly identify the\nCondorcet winner, thus outperforming existing state-of-the-art algorithms in\nterms of weak regret. Nevertheless, LLMs struggle to converge even when\nexplicitly prompted to do so, and are sensitive to prompt variations. To\novercome these issues, we introduce an LLM-augmented algorithm, IF-Enhanced\nLLM, which takes advantage of both in-context decision-making capabilities of\nLLMs and theoretical guarantees inherited from classic DB algorithms. The\ndesign of such an algorithm sheds light on how to enhance trustworthiness for\nLLMs used in decision-making tasks where performance robustness matters. We\nshow that IF-Enhanced LLM has theoretical guarantees on both weak and strong\nregret. Our experimental results validate that IF-Enhanced LLM is robust even\nwith noisy and adversarial prompts.\n","authors":["Fanzeng Xia","Hao Liu","Yisong Yue","Tongxin Li"],"pdf_url":"https://arxiv.org/pdf/2407.01887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01886v1","updated":"2024-07-02T02:16:43Z","published":"2024-07-02T02:16:43Z","title":"Core Knowledge Learning Framework for Graph Adaptation and Scalability\n Learning","summary":" Graph classification is a pivotal challenge in machine learning, especially\nwithin the realm of graph-based data, given its importance in numerous\nreal-world applications such as social network analysis, recommendation\nsystems, and bioinformatics. Despite its significance, graph classification\nfaces several hurdles, including adapting to diverse prediction tasks, training\nacross multiple target domains, and handling small-sample prediction scenarios.\nCurrent methods often tackle these challenges individually, leading to\nfragmented solutions that lack a holistic approach to the overarching problem.\nIn this paper, we propose an algorithm aimed at addressing the aforementioned\nchallenges. By incorporating insights from various types of tasks, our method\naims to enhance adaptability, scalability, and generalizability in graph\nclassification. Motivated by the recognition that the underlying subgraph plays\na crucial role in GNN prediction, while the remainder is task-irrelevant, we\nintroduce the Core Knowledge Learning (\\method{}) framework for graph\nadaptation and scalability learning. \\method{} comprises several key modules,\nincluding the core subgraph knowledge submodule, graph domain adaptation\nmodule, and few-shot learning module for downstream tasks. Each module is\ntailored to tackle specific challenges in graph classification, such as domain\nshift, label inconsistencies, and data scarcity. By learning the core subgraph\nof the entire graph, we focus on the most pertinent features for task\nrelevance. Consequently, our method offers benefits such as improved model\nperformance, increased domain adaptability, and enhanced robustness to domain\nvariations. 
Experimental results demonstrate significant performance\nenhancements achieved by our method compared to state-of-the-art approaches.\n","authors":["Bowen Zhang","Zhichao Huang","Genan Dai","Guangning Xu","Xiaomao Fan","Hu Huang"],"pdf_url":"https://arxiv.org/pdf/2407.01886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08787v4","updated":"2024-07-02T02:15:50Z","published":"2024-02-13T20:51:58Z","title":"Rethinking Machine Unlearning for Large Language Models","summary":" We explore machine unlearning (MU) in the domain of large language models\n(LLMs), referred to as LLM unlearning. This initiative aims to eliminate\nundesirable data influence (e.g., sensitive or illegal information) and the\nassociated model capabilities, while maintaining the integrity of essential\nknowledge generation and not affecting causally unrelated information. We\nenvision LLM unlearning becoming a pivotal element in the life-cycle management\nof LLMs, potentially standing as an essential foundation for developing\ngenerative AI that is not only safe, secure, and trustworthy, but also\nresource-efficient without the need of full retraining. We navigate the\nunlearning landscape in LLMs from conceptual formulation, methodologies,\nmetrics, and applications. In particular, we highlight the often-overlooked\naspects of existing LLM unlearning research, e.g., unlearning scope, data-model\ninteraction, and multifaceted efficacy assessment. We also draw connections\nbetween LLM unlearning and related areas such as model editing, influence\nfunctions, model explanation, adversarial training, and reinforcement learning.\nFurthermore, we outline an effective assessment framework for LLM unlearning\nand explore its applications in copyright and privacy safeguards and\nsociotechnical harm reduction.\n","authors":["Sijia Liu","Yuanshun Yao","Jinghan Jia","Stephen Casper","Nathalie Baracaldo","Peter Hase","Yuguang Yao","Chris Yuhao Liu","Xiaojun Xu","Hang Li","Kush R. Varshney","Mohit Bansal","Sanmi Koyejo","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.08787v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15638v2","updated":"2024-07-02T02:09:32Z","published":"2024-02-23T22:46:14Z","title":"Fair Resource Allocation in Multi-Task Learning","summary":" By jointly learning multiple tasks, multi-task learning (MTL) can leverage\nthe shared knowledge across tasks, resulting in improved data efficiency and\ngeneralization performance. However, a major challenge in MTL lies in the\npresence of conflicting gradients, which can hinder the fair optimization of\nsome tasks and subsequently impede MTL's ability to achieve better overall\nperformance. Inspired by fair resource allocation in communication networks, we\nformulate the optimization of MTL as a utility maximization problem, where the\nloss decreases across tasks are maximized under different fairness\nmeasurements. To solve this problem, we propose FairGrad, a novel MTL\noptimization method. FairGrad not only enables flexible emphasis on certain\ntasks but also achieves a theoretical convergence guarantee. Extensive\nexperiments demonstrate that our method can achieve state-of-the-art\nperformance among gradient manipulation methods on a suite of multi-task\nbenchmarks in supervised learning and reinforcement learning. Furthermore, we\nincorporate the idea of $\\alpha$-fairness into loss functions of various MTL\nmethods. Extensive empirical studies demonstrate that their performance can be\nsignificantly enhanced. 
Code is provided at\n\\url{https://github.com/OptMN-Lab/fairgrad}.\n","authors":["Hao Ban","Kaiyi Ji"],"pdf_url":"https://arxiv.org/pdf/2402.15638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00077v2","updated":"2024-07-02T02:06:17Z","published":"2024-06-22T15:32:53Z","title":"Differentially Private Graph Diffusion with Applications in Personalized\n PageRanks","summary":" Graph diffusion, which iteratively propagates real-valued substances among\nthe graph, is used in numerous graph/network-involved applications. However,\nreleasing diffusion vectors may reveal sensitive linking information in the\ndata such as transaction information in financial network data. However,\nprotecting the privacy of graph data is challenging due to its interconnected\nnature. This work proposes a novel graph diffusion framework with edge-level\ndifferential privacy guarantees by using noisy diffusion iterates. The\nalgorithm injects Laplace noise per diffusion iteration and adopts a\ndegree-based thresholding function to mitigate the high sensitivity induced by\nlow-degree nodes. Our privacy loss analysis is based on Privacy Amplification\nby Iteration (PABI), which to our best knowledge, is the first effort that\nanalyzes PABI with Laplace noise and provides relevant applications. We also\nintroduce a novel Infinity-Wasserstein distance tracking method, which tightens\nthe analysis of privacy leakage and makes PABI more applicable in practice. We\nevaluate this framework by applying it to Personalized Pagerank computation for\nranking tasks. Experiments on real-world network data demonstrate the\nsuperiority of our method under stringent privacy conditions.\n","authors":["Rongzhe Wei","Eli Chien","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2407.00077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12963v2","updated":"2024-07-02T01:52:26Z","published":"2024-01-23T18:45:54Z","title":"AutoRT: Embodied Foundation Models for Large Scale Orchestration of\n Robotic Agents","summary":" Foundation models that incorporate language, vision, and more recently\nactions have revolutionized the ability to harness internet scale data to\nreason about useful tasks. However, one of the key challenges of training\nembodied foundation models is the lack of data grounded in the physical world.\nIn this paper, we propose AutoRT, a system that leverages existing foundation\nmodels to scale up the deployment of operational robots in completely unseen\nscenarios with minimal human supervision. AutoRT leverages vision-language\nmodels (VLMs) for scene understanding and grounding, and further uses large\nlanguage models (LLMs) for proposing diverse and novel instructions to be\nperformed by a fleet of robots. Guiding data collection by tapping into the\nknowledge of foundation models enables AutoRT to effectively reason about\nautonomy tradeoffs and safety while significantly scaling up data collection\nfor robot learning. We demonstrate AutoRT proposing instructions to over 20\nrobots across multiple buildings and collecting 77k real robot episodes via\nboth teleoperation and autonomous robot policies. 
We experimentally show that\nsuch \"in-the-wild\" data collected by AutoRT is significantly more diverse, and\nthat AutoRT's use of LLMs allows for instruction following data collection\nrobots that can align to human preferences.\n","authors":["Michael Ahn","Debidatta Dwibedi","Chelsea Finn","Montse Gonzalez Arenas","Keerthana Gopalakrishnan","Karol Hausman","Brian Ichter","Alex Irpan","Nikhil Joshi","Ryan Julian","Sean Kirmani","Isabel Leal","Edward Lee","Sergey Levine","Yao Lu","Isabel Leal","Sharath Maddineni","Kanishka Rao","Dorsa Sadigh","Pannag Sanketi","Pierre Sermanet","Quan Vuong","Stefan Welker","Fei Xia","Ted Xiao","Peng Xu","Steve Xu","Zhuo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.12963v2.pdf","comment":"26 pages, 9 figures, ICRA 2024 VLMNM Workshop"},{"id":"http://arxiv.org/abs/2312.02327v2","updated":"2024-07-02T01:37:34Z","published":"2023-12-04T20:24:09Z","title":"FLea: Addressing Data Scarcity and Label Skew in Federated Learning via\n Privacy-preserving Feature Augmentation","summary":" Federated Learning (FL) enables model development by leveraging data\ndistributed across numerous edge devices without transferring local data to a\ncentral server. However, existing FL methods still face challenges when dealing\nwith scarce and label-skewed data across devices, resulting in local model\noverfitting and drift, consequently hindering the performance of the global\nmodel. In response to these challenges, we propose a pioneering framework\ncalled \\textit{FLea}, incorporating the following key components: \\textit{i)} A\nglobal feature buffer that stores activation-target pairs shared from multiple\nclients to support local training. This design mitigates local model drift\ncaused by the absence of certain classes; \\textit{ii)} A feature augmentation\napproach based on local and global activation mix-ups for local training. This\nstrategy enlarges the training samples, thereby reducing the risk of local\noverfitting; \\textit{iii)} An obfuscation method to minimize the correlation\nbetween intermediate activations and the source data, enhancing the privacy of\nshared features. To verify the superiority of \\textit{FLea}, we conduct\nextensive experiments using a wide range of data modalities, simulating\ndifferent levels of local data scarcity and label skew. The results demonstrate\nthat \\textit{FLea} consistently outperforms state-of-the-art FL counterparts\n(among 13 of the experimented 18 settings, the improvement is over $5\\%$) while\nconcurrently mitigating the privacy vulnerabilities associated with shared\nfeatures. Code is available at https://github.com/XTxiatong/FLea.git\n","authors":["Tong Xia","Abhirup Ghosh","Xinchi Qiu","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2312.02327v2.pdf","comment":"This paper has been acceped by KDD'24"},{"id":"http://arxiv.org/abs/2407.01873v1","updated":"2024-07-02T01:17:01Z","published":"2024-07-02T01:17:01Z","title":"Automated Text Scoring in the Age of Generative AI for the GPU-poor","summary":" Current research on generative language models (GLMs) for automated text\nscoring (ATS) has focused almost exclusively on querying proprietary models via\nApplication Programming Interfaces (APIs). Yet such practices raise issues\naround transparency and security, and these methods offer little in the way of\nefficiency or customizability. 
With the recent proliferation of smaller,\nopen-source models, there is the option to explore GLMs with computers equipped\nwith modest, consumer-grade hardware, that is, for the \"GPU poor.\" In this\nstudy, we analyze the performance and efficiency of open-source, small-scale\nGLMs for ATS. Results show that GLMs can be fine-tuned to achieve adequate,\nthough not state-of-the-art, performance. In addition to ATS, we take small\nsteps towards analyzing models' capacity for generating feedback by prompting\nGLMs to explain their scores. Model-generated feedback shows promise, but\nrequires more rigorous evaluation focused on targeted use cases.\n","authors":["Christopher Michael Ormerod","Alexander Kwako"],"pdf_url":"https://arxiv.org/pdf/2407.01873v1.pdf","comment":"21 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.01869v1","updated":"2024-07-02T01:05:35Z","published":"2024-07-02T01:05:35Z","title":"Let it shine: Autofluorescence of Papanicolaou-stain improves AI-based\n cytological oral cancer detection","summary":" Oral cancer is a global health challenge. It is treatable if detected early,\nbut it is often fatal in late stages. There is a shift from the invasive and\ntime-consuming tissue sampling and histological examination, toward\nnon-invasive brush biopsies and cytological examination. Reliable\ncomputer-assisted methods are essential for cost-effective and accurate\ncytological analysis, but the lack of detailed cell-level annotations impairs\nmodel effectiveness. This study aims to improve AI-based oral cancer detection\nusing multimodal imaging and deep fusion. We combine brightfield and\nfluorescence whole slide microscopy imaging to analyze Papanicolaou-stained\nliquid-based cytology slides of brush biopsies collected from both healthy and\ncancer patients. Due to limited cytological annotations, we utilize a weakly\nsupervised deep learning approach using only patient-level labels. We evaluate\nvarious multimodal fusion strategies, including early, late, and three recent\nintermediate fusion methods. Our results show: (i) fluorescence imaging of\nPapanicolaou-stained samples provides substantial diagnostic information; (ii)\nmultimodal fusion enhances classification and cancer detection accuracy over\nsingle-modality methods. Intermediate fusion is the leading method among the\nstudied approaches. Specifically, the Co-Attention Fusion Network (CAFNet)\nmodel excels with an F1 score of 83.34% and accuracy of 91.79%, surpassing\nhuman performance on the task. Additional tests highlight the need for precise\nimage registration to optimize multimodal analysis benefits. This study\nadvances cytopathology by combining deep learning and multimodal imaging to\nenhance early, non-invasive detection of oral cancer, improving diagnostic\naccuracy and streamlining clinical workflows. The developed pipeline is also\napplicable in other cytological settings. 
Our codes and dataset are available\nonline for further research.\n","authors":["Wenyi Lian","Joakim Lindblad","Christina Runow Stark","Jan-Michaél Hirsch","Nataša Sladoje"],"pdf_url":"https://arxiv.org/pdf/2407.01869v1.pdf","comment":"16 pages, 12 figures, 11 tables"},{"id":"http://arxiv.org/abs/2407.01864v1","updated":"2024-07-02T00:43:41Z","published":"2024-07-02T00:43:41Z","title":"Research on target detection method of distracted driving behavior based\n on improved YOLOv8","summary":" With the development of deep learning technology, the detection and\nclassification of distracted driving behaviour requires higher accuracy.\nExisting deep learning-based methods are computationally intensive and\nparameter redundant, limiting the efficiency and accuracy in practical\napplications. To solve this problem, this study proposes an improved YOLOv8\ndetection method based on the original YOLOv8 model by integrating the BoTNet\nmodule, GAM attention mechanism and EIoU loss function. By optimising the\nfeature extraction and multi-scale feature fusion strategies, the training and\ninference processes are simplified, and the detection accuracy and efficiency\nare significantly improved. Experimental results show that the improved model\nperforms well in both detection speed and accuracy, with an accuracy rate of\n99.4%, and the model is smaller and easy to deploy, which is able to identify\nand classify distracted driving behaviours in real time, provide timely\nwarnings, and enhance driving safety.\n","authors":["Shiquan Shen","Zhizhong Wu","Pan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18237v3","updated":"2024-07-02T00:22:16Z","published":"2023-11-30T04:07:44Z","title":"Knowledge Transfer from Vision Foundation Models for Efficient Training\n of Small Task-specific Models","summary":" Vision Foundation Models (VFMs) pretrained on massive datasets exhibit\nimpressive performance on various downstream tasks, especially with limited\nlabeled target data. However, due to their high inference compute cost, these\nmodels cannot be deployed for many real-world applications. Motivated by this,\nwe ask the following important question, \"How can we leverage the knowledge\nfrom a large VFM to train a small task-specific model for a new target task\nwith limited labeled training data?\", and propose a simple task-oriented\nknowledge transfer approach as a highly effective solution to this problem. Our\nexperimental results on five target tasks show that the proposed approach\noutperforms task-agnostic VFM distillation, web-scale CLIP pretraining,\nsupervised ImageNet pretraining, and self-supervised DINO pretraining by up to\n11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed\napproach also demonstrates up to 9x, 4x and 15x reduction in pretraining\ncompute cost when compared to task-agnostic VFM distillation, ImageNet\npretraining and DINO pretraining, respectively, while outperforming them. 
We\nalso show that the dataset used for transferring knowledge has a significant\neffect on the final target task performance, and introduce a\nretrieval-augmented knowledge transfer strategy that uses web-scale image\nretrieval to curate effective transfer sets.\n","authors":["Raviteja Vemulapalli","Hadi Pouransari","Fartash Faghri","Sachin Mehta","Mehrdad Farajtabar","Mohammad Rastegari","Oncel Tuzel"],"pdf_url":"https://arxiv.org/pdf/2311.18237v3.pdf","comment":"International Conference on Machine Learning, 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.02457v1","updated":"2024-07-02T17:33:37Z","published":"2024-07-02T17:33:37Z","title":"Volume Tracking Based Reference Mesh Extraction for Time-Varying Mesh\n Compression","summary":" Time-Varying meshes (TVMs), characterized by their varying connectivity and\nnumber of vertices, hold significant potential in immersive media and other\nvarious applications. However, their practical utilization is challenging due\nto their time-varying features and large file sizes. Creating a reference mesh\nthat contains the most essential information is a promising approach to\nutilizing shared information within TVMs to reduce storage and transmission\ncosts. We propose a novel method that employs volume tracking to extract\nreference meshes. First, we adopt as-rigid-as-possible (ARAP) volume tracking\non TVMs to get the volume centers for each mesh. Then, we use multidimensional\nscaling (MDS) to get reference centers that ensure the reference mesh avoids\nself-contact regions. Finally, we map the vertices of the meshes to reference\ncenters and extract the reference mesh. Our approach offers a feasible solution\nfor extracting reference meshes that can serve multiple purposes such as\nestablishing surface correspondence, deforming the reference mesh to different\nshapes for I-frame based mesh compression, or defining the global shape of the\nTVMs.\n","authors":["Guodong Chen","Libor Vasa","Fulin Wang","Mallesham Dasari"],"pdf_url":"https://arxiv.org/pdf/2407.02457v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2407.02411v1","updated":"2024-07-02T16:34:14Z","published":"2024-07-02T16:34:14Z","title":"Video Watermarking: Safeguarding Your Video from (Unauthorized)\n Annotations by Video-based LLMs","summary":" The advent of video-based Large Language Models (LLMs) has significantly\nenhanced video understanding. However, it has also raised some safety concerns\nregarding data protection, as videos can be more easily annotated, even without\nauthorization. This paper introduces Video Watermarking, a novel technique to\nprotect videos from unauthorized annotations by such video-based LLMs,\nespecially concerning the video content and description, in response to\nspecific queries. By imperceptibly embedding watermarks into key video frames\nwith multi-modal flow-based losses, our method preserves the viewing experience\nwhile preventing misuse by video-based LLMs. Extensive experiments show that\nVideo Watermarking significantly reduces the comprehensibility of videos with\nvarious video-based LLMs, demonstrating both stealth and robustness. 
In\nessence, our method provides a solution for securing video content, ensuring\nits integrity and confidentiality in the face of evolving video-based LLMs\ntechnologies.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2407.02411v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.13507"},{"id":"http://arxiv.org/abs/2407.02389v1","updated":"2024-07-02T16:02:25Z","published":"2024-07-02T16:02:25Z","title":"SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring\n Expression Segmentation","summary":" Referring Expression Segmentation (RES) aims to provide a segmentation mask\nof the target object in an image referred to by the text (i.e., referring\nexpression). Existing methods require large-scale mask annotations. Moreover,\nsuch approaches do not generalize well to unseen/zero-shot scenarios. To\naddress the aforementioned issues, we propose a weakly-supervised bootstrapping\narchitecture for RES with several new algorithmic innovations. To the best of\nour knowledge, ours is the first approach that considers only a fraction of\nboth mask and box annotations (shown in Figure 1 and Table 1) for training. To\nenable principled training of models in such low-annotation settings, improve\nimage-text region-level alignment, and further enhance spatial localization of\nthe target object in the image, we propose Cross-modal Fusion with Attention\nConsistency module. For automatic pseudo-labeling of unlabeled samples, we\nintroduce a novel Mask Validity Filtering routine based on a spatially aware\nzero-shot proposal scoring approach. Extensive experiments show that with just\n30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to\n58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR\nrespectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also\noutperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a\nfully-supervised setting and demonstrates strong generalization capabilities in\nunseen/zero-shot tasks.\n","authors":["Sayan Nag","Koustava Goswami","Srikrishna Karanam"],"pdf_url":"https://arxiv.org/pdf/2407.02389v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2401.09759v2","updated":"2024-07-02T13:43:59Z","published":"2024-01-18T07:19:10Z","title":"SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech\n Recognition","summary":" Audio-visual speech recognition (AVSR) is a multimodal extension of automatic\nspeech recognition (ASR), using video as a complement to audio. In AVSR,\nconsiderable efforts have been directed at datasets for facial features such as\nlip-readings, while they often fall short in evaluating the image comprehension\ncapabilities in broader contexts. In this paper, we construct SlideAVSR, an\nAVSR dataset using scientific paper explanation videos. SlideAVSR provides a\nnew benchmark where models transcribe speech utterances with texts on the\nslides on the presentation recordings. As technical terminologies that are\nfrequent in paper explanations are notoriously challenging to transcribe\nwithout reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR\nproblems. 
As a simple yet effective baseline, we propose DocWhisper, an AVSR\nmodel that can refer to textual information from slides, and confirm its\neffectiveness on SlideAVSR.\n","authors":["Hao Wang","Shuhei Kurita","Shuichiro Shimizu","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2401.09759v2.pdf","comment":"3rd Workshop on Advances in Language and Vision Research (ALVR 2024)"},{"id":"http://arxiv.org/abs/2407.02104v1","updated":"2024-07-02T09:43:47Z","published":"2024-07-02T09:43:47Z","title":"Joint-Dataset Learning and Cross-Consistent Regularization for\n Text-to-Motion Retrieval","summary":" Pose-estimation methods enable extracting human motion from common videos in\nthe structured form of 3D skeleton sequences. Despite great application\nopportunities, effective content-based access to such spatio-temporal motion\ndata is a challenging problem. In this paper, we focus on the recently\nintroduced text-motion retrieval tasks, which aim to search for database\nmotions that are the most relevant to a specified natural-language textual\ndescription (text-to-motion) and vice-versa (motion-to-text). Despite recent\nefforts to explore these promising avenues, a primary challenge remains the\ninsufficient data available to train robust text-motion models effectively. To\naddress this issue, we propose to investigate joint-dataset learning - where we\ntrain on multiple text-motion datasets simultaneously - together with the\nintroduction of a Cross-Consistent Contrastive Loss function (CCCL), which\nregularizes the learned text-motion common space by imposing uni-modal\nconstraints that augment the representation ability of the trained network. To\nlearn a proper motion representation, we also introduce a transformer-based\nmotion encoder, called MoT++, which employs spatio-temporal attention to\nprocess sequences of skeleton data. We demonstrate the benefits of the proposed\napproaches on the widely-used KIT Motion-Language and HumanML3D datasets. We\nperform detailed experimentation on joint-dataset learning and cross-dataset\nscenarios, showing the effectiveness of each introduced module in a carefully\nconducted ablation study and, in turn, pointing out the limitations of\nstate-of-the-art methods.\n","authors":["Nicola Messina","Jan Sedmidubsky","Fabrizio Falchi","Tomáš Rebok"],"pdf_url":"https://arxiv.org/pdf/2407.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02040v1","updated":"2024-07-02T08:12:14Z","published":"2024-07-02T08:12:14Z","title":"ScaleDreamer: Scalable Text-to-3D Synthesis with Asynchronous Score\n Distillation","summary":" By leveraging the text-to-image diffusion priors, score distillation can\nsynthesize 3D contents without paired text-3D training data. Instead of\nspending hours of online optimization per text prompt, recent studies have been\nfocused on learning a text-to-3D generative network for amortizing multiple\ntext-3D relations, which can synthesize 3D contents in seconds. However,\nexisting score distillation methods are hard to scale up to a large amount of\ntext prompts due to the difficulties in aligning pretrained diffusion prior\nwith the distribution of rendered images from various text prompts. Current\nstate-of-the-arts such as Variational Score Distillation finetune the\npretrained diffusion model to minimize the noise prediction error so as to\nalign the distributions, which are however unstable to train and will impair\nthe model's comprehension capability to numerous text prompts. 
Based on the\nobservation that the diffusion models tend to have lower noise prediction\nerrors at earlier timesteps, we propose Asynchronous Score Distillation (ASD),\nwhich minimizes the noise prediction error by shifting the diffusion timestep\nto earlier ones. ASD is stable to train and can scale up to 100k prompts. It\nreduces the noise prediction error without changing the weights of pre-trained\ndiffusion model, thus keeping its strong comprehension capability to prompts.\nWe conduct extensive experiments across different 2D diffusion models,\nincluding Stable Diffusion and MVDream, and text-to-3D generators, including\nHyper-iNGP, 3DConv-Net and Triplane-Transformer. The results demonstrate ASD's\neffectiveness in stable 3D generator training, high-quality 3D content\nsynthesis, and its superior prompt-consistency, especially under large prompt\ncorpus.\n","authors":["Zhiyuan Ma","Yuxiang Wei","Yabin Zhang","Xiangyu Zhu","Zhen Lei","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02040v1.pdf","comment":"Accepted by ECCV 2024. Code available at\n https://github.com/theEricMa/ScaleDreamer"},{"id":"http://arxiv.org/abs/2407.01976v1","updated":"2024-07-02T06:29:05Z","published":"2024-07-02T06:29:05Z","title":"A Bounding Box is Worth One Token: Interleaving Layout and Text in a\n Large Language Model for Document Understanding","summary":" Recently, many studies have demonstrated that exclusively incorporating\nOCR-derived text and spatial layouts with large language models (LLMs) can be\nhighly effective for document understanding tasks. However, existing methods\nthat integrate spatial layouts with text have limitations, such as producing\noverly long text sequences or failing to fully leverage the autoregressive\ntraits of LLMs. In this work, we introduce Interleaving Layout and Text in a\nLarge Language Model (LayTextLLM)} for document understanding. In particular,\nLayTextLLM projects each bounding box to a single embedding and interleaves it\nwith text, efficiently avoiding long sequence issues while leveraging\nautoregressive traits of LLMs. LayTextLLM not only streamlines the interaction\nof layout and textual data but also shows enhanced performance in Key\nInformation Extraction (KIE) and Visual Question Answering (VQA). Comprehensive\nbenchmark evaluations reveal significant improvements, with a 27.0% increase on\nKIE tasks and 24.1% on VQA tasks compared to previous state-of-the-art document\nunderstanding MLLMs, as well as a 15.5% improvement over other SOTA OCR-based\nLLMs on KIE tasks.\n","authors":["Jinghui Lu","Haiyang Yu","Yanjie Wang","Yongjie Ye","Jingqun Tang","Ziwei Yang","Binghong Wu","Qi Liu","Hao Feng","Han Wang","Hao Liu","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2407.01976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01920v1","updated":"2024-07-02T03:34:16Z","published":"2024-07-02T03:34:16Z","title":"To Forget or Not? Towards Practical Knowledge Unlearning for Large\n Language Models","summary":" Large Language Models (LLMs) trained on extensive corpora inevitably retain\nsensitive data, such as personal privacy information and copyrighted material.\nRecent advancements in knowledge unlearning involve updating LLM parameters to\nerase specific knowledge. However, current unlearning paradigms are mired in\nvague forgetting boundaries, often erasing knowledge indiscriminately. 
In this\nwork, we introduce KnowUnDo, a benchmark containing copyrighted content and\nuser privacy domains to evaluate if the unlearning process inadvertently erases\nessential knowledge. Our findings indicate that existing unlearning methods\noften suffer from excessive unlearning. To address this, we propose a simple\nyet effective method, MemFlex, which utilizes gradient information to precisely\ntarget and unlearn sensitive parameters. Experimental results show that MemFlex\nis superior to existing methods in both precise knowledge unlearning and\ngeneral knowledge retaining of LLMs. Code and dataset will be released at\nhttps://github.com/zjunlp/KnowUnDo.\n","authors":["Bozhong Tian","Xiaozhuan Liang","Siyuan Cheng","Qingbin Liu","Mengru Wang","Dianbo Sui","Xi Chen","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01920v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2404.06022v2","updated":"2024-07-02T01:19:01Z","published":"2024-04-09T05:11:28Z","title":"Band-Attention Modulated RetNet for Face Forgery Detection","summary":" The transformer networks are extensively utilized in face forgery detection\ndue to their scalability across large datasets. Despite their success,\ntransformers face challenges in balancing the capture of global context, which\nis crucial for unveiling forgery clues, with computational complexity. To\nmitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a\nlightweight network designed to efficiently process extensive visual contexts\nwhile avoiding catastrophic forgetting. Our approach empowers the target token\nto perceive global information by assigning differential attention levels to\ntokens at varying distances. We implement self-attention along both spatial\naxes, thereby maintaining spatial priors and easing the computational\nburden. Moreover, we present the adaptive frequency Band-Attention Modulation\nmechanism, which treats the entire Discrete Cosine Transform spectrogram as a\nseries of frequency bands with learnable weights. Together, BAR-Net achieves\nfavorable performance on several face forgery datasets, outperforming current\nstate-of-the-art methods.\n","authors":["Zhida Zhang","Jie Cao","Wenkui Yang","Qihang Fan","Kai Zhou","Ran He"],"pdf_url":"https://arxiv.org/pdf/2404.06022v2.pdf","comment":"The essay is poorly expressed in writing and will be re-optimised"},{"id":"http://arxiv.org/abs/2407.03321v1","updated":"2024-07-03T17:59:53Z","published":"2024-07-03T17:59:53Z","title":"Planetarium: A Rigorous Benchmark for Translating Text to Structured\n Planning Languages","summary":" Many recent works have explored using language models for planning problems.\nOne line of research focuses on translating natural language descriptions of\nplanning tasks into structured planning languages, such as the planning domain\ndefinition language (PDDL). While this approach is promising, accurately\nmeasuring the quality of generated PDDL code continues to pose significant\nchallenges. First, generated PDDL code is typically evaluated using planning\nvalidators that check whether the problem can be solved with a planner. This\nmethod is insufficient because a language model might generate valid PDDL code\nthat does not align with the natural language description of the task. Second,\nexisting evaluation sets often have natural language descriptions of the\nplanning task that closely resemble the ground truth PDDL, reducing the\nchallenge of the task. 
To bridge this gap, we introduce \\benchmarkName, a\nbenchmark designed to evaluate language models' ability to generate PDDL code\nfrom natural language descriptions of planning tasks. We begin by creating a\nPDDL equivalence algorithm that rigorously evaluates the correctness of PDDL\ncode generated by language models by flexibly comparing it against a ground\ntruth PDDL. Then, we present a dataset of $132,037$ text-to-PDDL pairs across\n13 different tasks, with varying levels of difficulty. Finally, we evaluate\nseveral API-access and open-weight language models that reveal this task's\ncomplexity. For example, $87.6\\%$ of the PDDL problem descriptions generated by\nGPT-4o are syntactically parseable, $82.2\\%$ are valid, solve-able problems,\nbut only $35.1\\%$ are semantically correct, highlighting the need for a more\nrigorous benchmark for this problem.\n","authors":["Max Zuo","Francisco Piedrahita Velez","Xiaochen Li","Michael L. Littman","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2407.03321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03320v1","updated":"2024-07-03T17:59:21Z","published":"2024-07-03T17:59:21Z","title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model\n Supporting Long-Contextual Input and Output","summary":" We present InternLM-XComposer-2.5 (IXC-2.5), a versatile large-vision\nlanguage model that supports long-contextual input and output. IXC-2.5 excels\nin various text-image comprehension and composition applications, achieving\nGPT-4V level capabilities with merely 7B LLM backend. Trained with 24K\ninterleaved image-text contexts, it can seamlessly extend to 96K long contexts\nvia RoPE extrapolation. This long-context capability allows IXC-2.5 to excel in\ntasks requiring extensive input and output contexts. Compared to its previous\n2.0 version, InternLM-XComposer-2.5 features three major upgrades in\nvision-language comprehension: (1) Ultra-High Resolution Understanding, (2)\nFine-Grained Video Understanding, and (3) Multi-Turn Multi-Image Dialogue. In\naddition to comprehension, IXC-2.5 extends to two compelling applications using\nextra LoRA parameters for text-image composition: (1) Crafting Webpages and (2)\nComposing High-Quality Text-Image Articles. IXC-2.5 has been evaluated on 28\nbenchmarks, outperforming existing open-source state-of-the-art models on 16\nbenchmarks. It also surpasses or competes closely with GPT-4V and Gemini Pro on\n16 key tasks. The InternLM-XComposer-2.5 is publicly available at\nhttps://github.com/InternLM/InternLM-XComposer.\n","authors":["Pan Zhang","Xiaoyi Dong","Yuhang Zang","Yuhang Cao","Rui Qian","Lin Chen","Qipeng Guo","Haodong Duan","Bin Wang","Linke Ouyang","Songyang Zhang","Wenwei Zhang","Yining Li","Yang Gao","Peng Sun","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Hang Yan","Conghui He","Xingcheng Zhang","Kai Chen","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03320v1.pdf","comment":"Technical Report. https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2407.03314v1","updated":"2024-07-03T17:55:27Z","published":"2024-07-03T17:55:27Z","title":"BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate\n Hallucinations","summary":" This paper presents Bag-of-Concept Graph (BACON) to gift models with limited\nlinguistic abilities to taste the privilege of Vision Language Models (VLMs)\nand boost downstream tasks such as detection, visual question answering (VQA),\nand image generation. 
Since the visual scenes in physical worlds are structured\nwith complex relations between objects, BACON breaks down annotations into\nbasic minimum elements and presents them in a graph structure. Element-wise\nstyle enables easy understanding, and structural composition liberates\ndifficult locating. Careful prompt design births the BACON captions with the\nhelp of publicly available VLMs and segmentation methods. In this way, we gather\na dataset with 100K annotated images, which endows VLMs with remarkable\ncapabilities, such as accurately generating BACON, transforming prompts into\nBACON format, envisioning scenarios in the style of BACON, and dynamically\nmodifying elements within BACON through interactive dialogue and more. Wide-ranging\nrepresentative experiments, including detection, VQA, and image generation\ntasks, show that BACON serves as a lifeline for achieving previously out-of-reach\ntasks or excelling over current cutting-edge solutions.\n","authors":["Zhantao Yang","Ruili Feng","Keyu Yan","Huangji Wang","Zhicai Wang","Shangwen Zhu","Han Zhang","Jie Xiao","Pingyu Wu","Kai Zhu","Jixuan Chen","Chen-Wei Xie","Chaojie Mao","Yue Yang","Hongyang Zhang","Yu Liu","Fan Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.03314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05880v2","updated":"2024-07-03T17:52:00Z","published":"2024-04-08T21:26:22Z","title":"Eraser: Jailbreaking Defense in Large Language Models via Unlearning\n Harmful Knowledge","summary":" Jailbreaking attacks can enable Large Language Models (LLMs) to bypass the\nsafeguard and generate harmful content. Existing jailbreaking defense methods\nhave failed to address the fundamental issue that harmful knowledge resides\nwithin the model, leading to potential jailbreak risks for LLMs. In this paper,\nwe propose a novel defense method called Eraser, which mainly includes three\ngoals: unlearning harmful knowledge, retaining general knowledge, and\nmaintaining safety alignment. The intuition is that if an LLM forgets the\nspecific knowledge required to answer a harmful question, it will no longer\nhave the ability to answer harmful questions. The training of Eraser does not\nactually require the model's own harmful knowledge, and it can benefit from\nunlearning general answers related to harmful queries, which means it does not\nneed assistance from the red team. The experimental results show that Eraser\ncan significantly reduce the jailbreaking success rate for various attacks\nwithout compromising the general capabilities of the model. Our codes are\navailable at https://github.com/ZeroNLP/Eraser.\n","authors":["Weikai Lu","Ziqian Zeng","Jianwei Wang","Zhengdong Lu","Zelin Chen","Huiping Zhuang","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03302v1","updated":"2024-07-03T17:43:54Z","published":"2024-07-03T17:43:54Z","title":"A Review of the Applications of Deep Learning-Based Emergent\n Communication","summary":" Emergent communication, or emergent language, is the field of research which\nstudies how human language-like communication systems emerge de novo in deep\nmulti-agent reinforcement learning environments. The possibilities of\nreplicating the emergence of a complex behavior like language have strong\nintuitive appeal, yet it is necessary to complement this with clear notions of\nhow such research can be applicable to other fields of science, technology, and\nengineering. 
This paper comprehensively reviews the applications of emergent\ncommunication research across machine learning, natural language processing,\nlinguistics, and cognitive science. Each application is illustrated with a\ndescription of its scope, an explication of emergent communication's unique\nrole in addressing it, a summary of the extant literature working towards the\napplication, and brief recommendations for near-term research directions.\n","authors":["Brendon Boldt","David Mortensen"],"pdf_url":"https://arxiv.org/pdf/2407.03302v1.pdf","comment":"49 pages, 15 figures"},{"id":"http://arxiv.org/abs/2406.16008v2","updated":"2024-07-03T17:40:00Z","published":"2024-06-23T04:35:42Z","title":"Found in the Middle: Calibrating Positional Attention Bias Improves Long\n Context Utilization","summary":" Large language models (LLMs), even when specifically trained to process long\ninput contexts, struggle to capture relevant information located in the middle\nof their input. This phenomenon has been known as the lost-in-the-middle\nproblem. In this work, we make three contributions. First, we set out to\nunderstand the factors that cause this phenomenon. In doing so, we establish a\nconnection between lost-in-the-middle to LLMs' intrinsic attention bias: LLMs\nexhibit a U-shaped attention bias where the tokens at the beginning and at the\nend of its input receive higher attention, regardless of their relevance.\nSecond, we mitigate this positional bias through a calibration mechanism,\nfound-in-the-middle, that allows the model to attend to contexts faithfully\naccording to their relevance, even though when they are in the middle. Third,\nwe show found-in-the-middle not only achieves better performance in locating\nrelevant information within a long context, but also eventually leads to\nimproved retrieval-augmented generation (RAG) performance across various tasks,\noutperforming existing methods by up to 15 percentage points. These findings\nopen up future directions in understanding LLM attention bias and its potential\nconsequences.\n","authors":["Cheng-Yu Hsieh","Yung-Sung Chuang","Chun-Liang Li","Zifeng Wang","Long T. Le","Abhishek Kumar","James Glass","Alexander Ratner","Chen-Yu Lee","Ranjay Krishna","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2406.16008v2.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2405.15232v2","updated":"2024-07-03T17:30:53Z","published":"2024-05-24T05:46:04Z","title":"DEEM: Diffusion Models Serve as the Eyes of Large Language Models for\n Image Perception","summary":" The development of large language models (LLMs) has significantly advanced\nthe emergence of large multimodal models (LMMs). While LMMs have achieved\ntremendous success by promoting the synergy between multimodal comprehension\nand creation, they often face challenges when confronted with\nout-of-distribution data. This is primarily due to their reliance on image\nencoders trained to encode images into task-relevant features, which may lead\nthem to disregard irrelevant details. Delving into the modeling capabilities of\ndiffusion models for images naturally prompts the question: Can diffusion\nmodels serve as the eyes of large language models for image perception? In this\npaper, we propose DEEM, a simple and effective approach that utilizes the\ngenerative feedback of diffusion models to align the semantic distributions of\nthe image encoder. 
This addresses the drawbacks of previous methods that solely\nrelied on image encoders like ViT, thereby enhancing the model's resilience\nagainst out-of-distribution samples and reducing visual hallucinations.\nImportantly, this is achieved without requiring additional training modules and\nwith fewer training parameters. We extensively evaluated DEEM on both our newly\nconstructed RobustVQA benchmark and another well-known benchmark, POPE, for\nobject hallucination. Compared to the state-of-the-art interleaved content\ngeneration models, DEEM exhibits enhanced robustness and a superior capacity to\nalleviate model hallucinations while utilizing fewer trainable parameters, less\npre-training data (10%), and a smaller base model size.\n","authors":["Run Luo","Yunshui Li","Longze Chen","Wanwei He","Ting-En Lin","Ziqiang Liu","Lei Zhang","Zikai Song","Xiaobo Xia","Tongliang Liu","Min Yang","Binyuan Hui"],"pdf_url":"https://arxiv.org/pdf/2405.15232v2.pdf","comment":"25 pages. arXiv admin note: text overlap with arXiv:2401.10208 by\n other authors"},{"id":"http://arxiv.org/abs/2407.03282v1","updated":"2024-07-03T17:08:52Z","published":"2024-07-03T17:08:52Z","title":"LLM Internal States Reveal Hallucination Risk Faced With a Query","summary":" The hallucination problem of Large Language Models (LLMs) significantly\nlimits their reliability and trustworthiness. Humans have a self-awareness\nprocess that allows us to recognize what we don't know when faced with queries.\nInspired by this, our paper investigates whether LLMs can estimate their own\nhallucination risk before response generation. We analyze the internal\nmechanisms of LLMs broadly both in terms of training data sources and across 15\ndiverse Natural Language Generation (NLG) tasks, spanning over 700 datasets.\nOur empirical analysis reveals two key insights: (1) LLM internal states\nindicate whether they have seen the query in training data or not; and (2) LLM\ninternal states show they are likely to hallucinate or not regarding the query.\nOur study explores particular neurons, activation layers, and tokens that play\na crucial role in the LLM perception of uncertainty and hallucination risk. By\na probing estimator, we leverage LLM self-assessment, achieving an average\nhallucination estimation accuracy of 84.32\\% at run time.\n","authors":["Ziwei Ji","Delong Chen","Etsuko Ishii","Samuel Cahyawijaya","Yejin Bang","Bryan Wilie","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2407.03282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03277v1","updated":"2024-07-03T17:04:17Z","published":"2024-07-03T17:04:17Z","title":"Evaluating Automatic Metrics with Incremental Machine Translation\n Systems","summary":" We introduce a dataset comprising commercial machine translations, gathered\nweekly over six years across 12 translation directions. Since human A/B testing\nis commonly used, we assume commercial systems improve over time, which enables\nus to evaluate machine translation (MT) metrics based on their preference for\nmore recent translations. Our study confirms several previous findings in MT\nmetrics research and demonstrates the dataset's value as a testbed for metric\nevaluation. We release our code at https://github.com/gjwubyron/Evo\n","authors":["Guojun Wu","Shay B. 
Cohen","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2407.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03255v1","updated":"2024-07-03T16:36:26Z","published":"2024-07-03T16:36:26Z","title":"How Similar Are Elected Politicians and Their Constituents? Quantitative\n Evidence From Online Social Network","summary":" How similar are politicians to those who vote for them? This is a critical\nquestion at the heart of democratic representation and particularly relevant at\ntimes when political dissatisfaction and populism are on the rise. To answer\nthis question we compare the online discourse of elected politicians and their\nconstituents. We collect a two and a half years (September 2020 - February\n2023) constituency-level dataset for USA and UK that includes: (i) the Twitter\ntimelines (5.6 Million tweets) of elected political representatives (595 UK\nMembers of Parliament and 433 USA Representatives), (ii) the Nextdoor posts\n(21.8 Million posts) of the constituency (98.4% USA and 91.5% UK\nconstituencies). We find that elected politicians tend to be equally similar to\ntheir constituents in terms of content and style regardless of whether a\nconstituency elects a right or left-wing politician. The size of the electoral\nvictory and the level of income of a constituency shows a nuanced picture. The\nnarrower the electoral victory, the more similar the style and the more\ndissimilar the content is. The lower the income of a constituency, the more\nsimilar the content is. In terms of style, poorer constituencies tend to have a\nmore similar sentiment and more dissimilar psychological text traits (i.e.\nmeasured with LIWC categories).\n","authors":["Waleed Iqbal","Gareth Tyson","Ignacio Castro"],"pdf_url":"https://arxiv.org/pdf/2407.03255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03253v1","updated":"2024-07-03T16:34:56Z","published":"2024-07-03T16:34:56Z","title":"STF: Sentence Transformer Fine-Tuning For Topic Categorization With\n Limited Data","summary":" Nowadays, topic classification from tweets attracts considerable research\nattention. Different classification systems have been suggested thanks to these\nresearch efforts. Nevertheless, they face major challenges owing to low\nperformance metrics due to the limited amount of labeled data. We propose\nSentence Transformers Fine-tuning (STF), a topic detection system that\nleverages pretrained Sentence Transformers models and fine-tuning to classify\ntopics from tweets accurately. Moreover, extensive parameter sensitivity\nanalyses were conducted to finetune STF parameters for our topic classification\ntask to achieve the best performance results. Experiments on two benchmark\ndatasets demonstrated that (1) the proposed STF can be effectively used for\nclassifying tweet topics and outperforms the latest state-of-the-art\napproaches, and (2) the proposed STF does not require a huge amount of labeled\ntweets to achieve good accuracy, which is a limitation of many state-of-the-art\napproaches. 
Our main contribution is the achievement of promising results in\ntweet topic classification by applying pretrained sentence transformers\nlanguage models.\n","authors":["Kheir Eddine Daouadi","Yaakoub Boualleg","Oussama Guehairia"],"pdf_url":"https://arxiv.org/pdf/2407.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17761v2","updated":"2024-07-03T16:33:55Z","published":"2024-06-25T17:45:26Z","title":"CaLMQA: Exploring culturally specific long-form question answering\n across 23 languages","summary":" Large language models (LLMs) are used for long-form question answering\n(LFQA), which requires them to generate paragraph-length answers to complex\nquestions. While LFQA has been well-studied in English, this research has not\nbeen extended to other languages. To bridge this gap, we introduce CaLMQA, a\ncollection of 1.5K complex culturally specific questions spanning 23 languages\nand 51 culturally agnostic questions translated from English into 22 other\nlanguages. We define culturally specific questions as those uniquely or more\nlikely to be asked by people from cultures associated with the question's\nlanguage. We collect naturally-occurring questions from community web forums\nand hire native speakers to write questions to cover under-resourced,\nrarely-studied languages such as Fijian and Kirundi. Our dataset contains\ndiverse, complex questions that reflect cultural topics (e.g. traditions, laws,\nnews) and the language usage of native speakers. We automatically evaluate a\nsuite of open- and closed-source models on CaLMQA by detecting incorrect\nlanguage and token repetitions in answers, and observe that the quality of\nLLM-generated answers degrades significantly for some low-resource languages.\nLastly, we perform human evaluation on a subset of models and languages. Manual\nevaluation reveals that model performance is significantly worse for culturally\nspecific questions than for culturally agnostic questions. Our findings\nhighlight the need for further research in non-English LFQA and provide an\nevaluation framework.\n","authors":["Shane Arora","Marzena Karpinska","Hung-Ting Chen","Ipsita Bhattacharjee","Mohit Iyyer","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2406.17761v2.pdf","comment":"39 pages, 17 figures. Code and data available at\n https://github.com/2015aroras/CaLMQA. Revised argument in section 4, results\n unchanged"},{"id":"http://arxiv.org/abs/2407.01334v2","updated":"2024-07-03T16:31:52Z","published":"2024-07-01T14:41:59Z","title":"Protecting Privacy in Classifiers by Token Manipulation","summary":" Using language models as a remote service entails sending private information\nto an untrusted provider. In addition, potential eavesdroppers can intercept\nthe messages, thereby exposing the information. In this work, we explore the\nprospects of avoiding such data exposure at the level of text manipulation. We\nfocus on text classification models, examining various token mapping and\ncontextualized manipulation functions in order to see whether classifier\naccuracy may be maintained while keeping the original text unrecoverable. We\nfind that although some token mapping functions are easy and straightforward to\nimplement, they heavily influence performance on the downstream task, and via a\nsophisticated attacker can be reconstructed. 
In comparison, the contextualized\nmanipulation provides an improvement in performance.\n","authors":["Re'em Harel","Yair Elboher","Yuval Pinter"],"pdf_url":"https://arxiv.org/pdf/2407.01334v2.pdf","comment":"PrivateNLP@ACL 2024"},{"id":"http://arxiv.org/abs/2311.09325v2","updated":"2024-07-03T16:12:32Z","published":"2023-11-15T19:34:06Z","title":"Temperature-scaling surprisal estimates improve fit to human reading\n times -- but does it do so for the \"right reasons\"?","summary":" A wide body of evidence shows that human language processing difficulty is\npredicted by the information-theoretic measure surprisal, a word's negative log\nprobability in context. However, it is still unclear how to best estimate these\nprobabilities needed for predicting human processing difficulty -- while a\nlong-standing belief held that models with lower perplexity would provide more\naccurate estimates of word predictability, and therefore lead to better reading\ntime predictions, recent work has shown that for very large models,\npsycholinguistic predictive power decreases. One reason could be that language\nmodels might be more confident of their predictions than humans, because they\nhave had exposure to several magnitudes more data. In this paper, we test what\neffect temperature-scaling of large language model (LLM) predictions has on\nsurprisal estimates and their predictive power of reading times of English\ntexts. Firstly, we show that calibration of large language models typically\nimproves with model size, i.e. poorer calibration cannot account for poorer fit\nto reading times. Secondly, we find that temperature-scaling probabilities lead\nto a systematically better fit to reading times (up to 89% improvement in delta\nlog likelihood), across several reading time corpora. Finally, we show that\nthis improvement in fit is chiefly driven by words that are composed of\nmultiple subword tokens.\n","authors":["Tong Liu","Iza Škrjanec","Vera Demberg"],"pdf_url":"https://arxiv.org/pdf/2311.09325v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.03236v1","updated":"2024-07-03T16:05:20Z","published":"2024-07-03T16:05:20Z","title":"CATT: Character-based Arabic Tashkeel Transformer","summary":" Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the\ncomprehension of Arabic text by removing ambiguity and minimizing the risk of\nmisinterpretations caused by its absence. It plays a crucial role in improving\nArabic text processing, particularly in applications such as text-to-speech and\nmachine translation. This paper introduces a new approach to training ATD\nmodels. First, we finetuned two transformers, encoder-only and encoder-decoder,\nthat were initialized from a pretrained character-based BERT. Then, we applied\nthe Noisy-Student approach to boost the performance of the best model. We\nevaluated our models alongside 11 commercial and open-source models using two\nmanually labeled benchmark datasets: WikiNews and our CATT dataset. Our\nfindings show that our top model surpasses all evaluated models by relative\nDiacritic Error Rates (DERs) of 30.83\\% and 35.21\\% on WikiNews and CATT,\nrespectively, achieving state-of-the-art in ATD. In addition, we show that our\nmodel outperforms GPT-4-turbo on CATT dataset by a relative DER of 9.36\\%. 
We\nopen-source our CATT models and benchmark dataset for the research\ncommunity\\footnote{https://github.com/abjadai/catt}.\n","authors":["Faris Alasmary","Orjuwan Zaafarani","Ahmad Ghannam"],"pdf_url":"https://arxiv.org/pdf/2407.03236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03234v1","updated":"2024-07-03T16:03:42Z","published":"2024-07-03T16:03:42Z","title":"Self-Evaluation as a Defense Against Adversarial Attacks on LLMs","summary":" When LLMs are deployed in sensitive, human-facing settings, it is crucial\nthat they do not output unsafe, biased, or privacy-violating outputs. For this\nreason, models are both trained and instructed to refuse to answer unsafe\nprompts such as \"Tell me how to build a bomb.\" We find that, despite these\nsafeguards, it is possible to break model defenses simply by appending a space\nto the end of a model's input. In a study of eight open-source models, we\ndemonstrate that this acts as a strong enough attack to cause the majority of\nmodels to generate harmful outputs with very high success rates. We examine the\ncauses of this behavior, finding that the contexts in which single spaces occur\nin tokenized training data encourage models to generate lists when prompted,\noverriding training signals to refuse to answer unsafe requests. Our findings\nunderscore the fragile state of current model alignment and promote the\nimportance of developing more robust alignment methods. Code and data will be\nmade available at https://github.com/Linlt-leon/Adversarial-Alignments.\n","authors":["Hannah Brown","Leon Lin","Kenji Kawaguchi","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2407.03234v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.03232v1","updated":"2024-07-03T16:03:10Z","published":"2024-07-03T16:03:10Z","title":"Single Character Perturbations Break LLM Alignment","summary":" When LLMs are deployed in sensitive, human-facing settings, it is crucial\nthat they do not output unsafe, biased, or privacy-violating outputs. For this\nreason, models are both trained and instructed to refuse to answer unsafe\nprompts such as \"Tell me how to build a bomb.\" We find that, despite these\nsafeguards, it is possible to break model defenses simply by appending a space\nto the end of a model's input. In a study of eight open-source models, we\ndemonstrate that this acts as a strong enough attack to cause the majority of\nmodels to generate harmful outputs with very high success rates. We examine the\ncauses of this behavior, finding that the contexts in which single spaces occur\nin tokenized training data encourage models to generate lists when prompted,\noverriding training signals to refuse to answer unsafe requests. Our findings\nunderscore the fragile state of current model alignment and promote the\nimportance of developing more robust alignment methods. Code and data will be\navailable at https://github.com/hannah-aught/space_attack.\n","authors":["Leon Lin","Hannah Brown","Kenji Kawaguchi","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2407.03232v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.03227v1","updated":"2024-07-03T15:55:14Z","published":"2024-07-03T15:55:14Z","title":"Improving Retrieval-augmented Text-to-SQL with AST-based Ranking and\n Schema Pruning","summary":" We focus on Text-to-SQL semantic parsing from the perspective of Large\nLanguage Models. 
Motivated by challenges related to the size of commercial\ndatabase schemata and the deployability of business intelligence solutions, we\npropose an approach that dynamically retrieves input database information and\nuses abstract syntax trees to select few-shot examples for in-context learning.\n Furthermore, we investigate the extent to which an in-parallel semantic\nparser can be leveraged for generating $\\textit{approximated}$ versions of the\nexpected SQL queries, to support our retrieval. We take this approach to the\nextreme--we adapt a model consisting of less than $500$M parameters, to act as\nan extremely efficient approximator, enhancing it with the ability to process\nschemata in a parallelised manner. We apply our approach to monolingual and\ncross-lingual benchmarks for semantic parsing, showing improvements over\nstate-of-the-art baselines. Comprehensive experiments highlight the\ncontribution of modules involved in this retrieval-augmented generation\nsetting, revealing interesting directions for future work.\n","authors":["Zhili Shen","Pavlos Vougiouklis","Chenxin Diao","Kaustubh Vyas","Yuanyi Ji","Jeff Z. Pan"],"pdf_url":"https://arxiv.org/pdf/2407.03227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11924v3","updated":"2024-07-03T15:50:48Z","published":"2024-02-19T08:12:30Z","title":"Evaluating LLMs' Inherent Multi-hop Reasoning Ability","summary":" While Large Language Models (LLMs) excel in question-answering (QA) tasks,\ntheir multi-step reasoning abilities on multiple evidence integration on\nMulti-hop QA tasks remain underexplored. LLMs sometimes generate answers that\nrely on internal memory rather than reasoning given context, which brings\nconcerns about the evaluation quality of real reasoning abilities. The\ncounterfactual QA task can separate internal memory from reasoning abilities,\nbut focusing solely on final-QA performance without evaluating the multi-step\nreasoning process is insufficient for reporting LLMs' real reasoning abilities.\nCurrent Multi-hop QA (MHQA) benchmarks are factual and annotated on open-source\ncorpora such as Wikipedia, although useful for multi-step reasoning evaluation,\nshowing limitations due to potential data contamination in LLMs pre-training\nstage. To address this issue, we introduce the Inherent Reasoning Evaluation\n(IRE) method, a novel evaluation way that jointly evaluates the LLMs'\nchain-of-reasoning performance based on the first knowledge-edited\ncounterfactual multi-hop QA data which involves editing the original Wikipedia\npassages, reducing data contamination risks. The IRE comprehensively assesses\nreasoning chains through sub-QA and final-QA evaluations. 
Our comparisons\nreveal significant performance gaps for several LLMs between Wikipedia-based\nbenchmarks and IRE, suggesting data contamination issues in existing benchmarks.\nWe believe that the IRE benchmark will enhance and facilitate trustworthy LLM\nevaluations.\n","authors":["Jian Wu","Linyi Yang","Zhen Wang","Manabu Okumura","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.11924v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09737v2","updated":"2024-07-03T15:41:20Z","published":"2023-06-16T10:05:47Z","title":"Using Natural Language Processing and Networks to Automate Structured\n Literature Reviews: An Application to Farmers Climate Change Adaptation","summary":" The fast-growing number of research articles makes it problematic for\nscholars to keep track of the new findings related to their areas of expertise.\nFurthermore, linking knowledge across disciplines in rapidly developing fields\nbecomes challenging for complex topics like climate change that demand\ninterdisciplinary solutions. At the same time, the rise of Black Box types of\ntext summarization makes it difficult to understand how text relationships are\nbuilt, let alone relate to existing theories conceptualizing cause-effect\nrelationships and permitting hypothesizing. This work aims to sensibly use\nNatural Language Processing by extracting variable relations and synthesizing\ntheir findings using networks while relating to key concepts dominant in\nrelevant disciplines. As an example, we apply our methodology to the analysis\nof farmers' adaptation to climate change. For this, we perform a Natural\nLanguage Processing analysis of publications returned by Scopus in August 2022.\nResults show that the use of Natural Language Processing together with networks\nin a descriptive manner offers a fast and interpretable way to synthesize\nliterature review findings as long as researchers back up results with theory.\n","authors":["Sofia Gil-Clavel","Tatiana Filatova"],"pdf_url":"https://arxiv.org/pdf/2306.09737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03211v1","updated":"2024-07-03T15:39:40Z","published":"2024-07-03T15:39:40Z","title":"How Does Quantization Affect Multilingual LLMs?","summary":" Quantization techniques are widely used to improve inference speed and\ndeployment of large language models. While a wide body of work examines the\nimpact of quantized LLMs on English tasks, none have examined the effect of\nquantization across languages. We conduct a thorough analysis of quantized\nmultilingual LLMs, focusing on their performance across languages and at\nvarying scales. We use automatic benchmarks, LLM-as-a-Judge methods, and human\nevaluation, finding that (1) harmful effects of quantization are apparent in\nhuman evaluation, and automatic metrics severely underestimate the detriment: a\n1.7% average drop in Japanese across automatic tasks corresponds to a 16.0%\ndrop reported by human evaluators on realistic prompts; (2) languages are\ndisparately affected by quantization, with non-Latin script languages impacted\nworst; and (3) challenging tasks such as mathematical reasoning degrade\nfastest. 
As the ability to serve low-compute models is critical for wide global\nadoption of NLP technologies, our results urge consideration of multilingual\nperformance as a key evaluation criterion for efficient models.\n","authors":["Kelly Marchisio","Saurabh Dash","Hongyu Chen","Dennis Aumiller","Ahmet Üstün","Sara Hooker","Sebastian Ruder"],"pdf_url":"https://arxiv.org/pdf/2407.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08593v2","updated":"2024-07-03T15:23:59Z","published":"2024-03-13T14:59:07Z","title":"Call Me When Necessary: LLMs can Efficiently and Faithfully Reason over\n Structured Environments","summary":" Large Language Models (LLMs) have shown potential in reasoning over\nstructured environments, e.g., knowledge graph and table. Such tasks typically\nrequire multi-hop reasoning, i.e., match natural language utterance with\ninstances in the environment. Previous methods leverage LLMs to incrementally\nbuild a reasoning path, where the LLMs either invoke tools or pick up schemas\nby step-by-step interacting with the environment. We propose\nReasoning-Path-Editing (Readi), a novel framework where LLMs can efficiently\nand faithfully reason over structured environments. In Readi, LLMs initially\ngenerate a reasoning path given a query, and edit the path only when necessary.\nWe instantiate the path on structured environments and provide feedback to edit\nthe path if anything goes wrong. Experimental results on three KGQA and two\nTableQA datasets show the effectiveness of Readi, significantly surpassing\nprevious LLM-based methods (by 9.1% Hit@1 on WebQSP, 12.4% on MQA-3H and 9.5%\non WTQ), comparable with state-of-the-art fine-tuned methods (67% on CWQ and\n74.7% on WebQSP) and substantially boosting the vanilla LLMs (by 14.9% on CWQ).\nOur code will be available on https://aka.ms/readi.\n","authors":["Sitao Cheng","Ziyuan Zhuang","Yong Xu","Fangkai Yang","Chaoyun Zhang","Xiaoting Qin","Xiang Huang","Ling Chen","Qingwei Lin","Dongmei Zhang","Saravan Rajmohan","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08593v2.pdf","comment":"Accepted by ACL 2024 Findings. 21 pages, 7 figures, 17 tables"},{"id":"http://arxiv.org/abs/2407.00668v2","updated":"2024-07-03T15:18:40Z","published":"2024-06-30T11:27:50Z","title":"HRDE: Retrieval-Augmented Large Language Models for Chinese Health Rumor\n Detection and Explainability","summary":" As people increasingly prioritize their health, the speed and breadth of\nhealth information dissemination on the internet have also grown. At the same\ntime, the presence of false health information (health rumors) intermingled\nwith genuine content poses a significant potential threat to public health.\nHowever, current research on Chinese health rumors still lacks a large-scale,\npublic, and open-source dataset of health rumor information, as well as\neffective and reliable rumor detection methods. This paper addresses this gap\nby constructing a dataset containing 1.12 million health-related rumors\n(HealthRCN) through web scraping of common health-related questions and a\nseries of data processing steps. HealthRCN is the largest known dataset of\nChinese health information rumors to date. Based on this dataset, we propose\nretrieval-augmented large language models for Chinese health rumor detection\nand explainability (HRDE). 
This model leverages retrieved relevant information\nto accurately determine whether the input health information is a rumor and\nprovides explanatory responses, effectively aiding users in verifying the\nauthenticity of health information. In evaluation experiments, we compared\nmultiple models and found that HRDE outperformed them all, including\nGPT-4-1106-Preview, in rumor detection accuracy and answer quality. HRDE\nachieved an average accuracy of 91.04% and an F1 score of 91.58%.\n","authors":["Yanfang Chen","Ding Chen","Shichao Song","Simin Niu","Hanyu Wang","Zeyun Tang","Feiyu Xiong","Zhiyu Li"],"pdf_url":"https://arxiv.org/pdf/2407.00668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03192v1","updated":"2024-07-03T15:18:29Z","published":"2024-07-03T15:18:29Z","title":"CiteAssist: A System for Automated Preprint Citation and BibTeX\n Generation","summary":" We present CiteAssist, a system to automate the generation of BibTeX entries\nfor preprints, streamlining the process of bibliographic annotation. Our system\nextracts metadata, such as author names, titles, publication dates, and\nkeywords, to create standardized annotations within the document. CiteAssist\nautomatically attaches the BibTeX citation to the end of a PDF and links it on\nthe first page of the document so other researchers gain immediate access to\nthe correct citation of the article. This method promotes platform flexibility\nby ensuring that annotations remain accessible regardless of the repository\nused to publish or access the preprint. The annotations remain available even\nif the preprint is viewed externally to CiteAssist. Additionally, the system\nadds relevant related papers based on extracted keywords to the preprint,\nproviding researchers with additional publications besides those in related\nwork for further reading. Researchers can enhance their preprints organization\nand reference management workflows through a free and publicly available web\ninterface.\n","authors":["Lars Benedikt Kaesberg","Terry Ruas","Jan Philip Wahle","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2407.03192v1.pdf","comment":"Published at SDProc @ ACL 2024"},{"id":"http://arxiv.org/abs/2305.15060v4","updated":"2024-07-03T15:09:52Z","published":"2023-05-24T11:49:52Z","title":"Who Wrote this Code? Watermarking for Code Generation","summary":" Since the remarkable generation performance of large language models raised\nethical and legal concerns, approaches to detect machine-generated text by\nembedding watermarks are being developed. However, we discover that the\nexisting works fail to function appropriately in code generation tasks due to\nthe task's nature of having low entropy. Extending a logit-modifying watermark\nmethod, we propose Selective WatErmarking via Entropy Thresholding (SWEET),\nwhich enhances detection ability and mitigates code quality degeneration by\nremoving low-entropy segments at generating and detecting watermarks. Our\nexperiments show that SWEET significantly improves code quality preservation\nwhile outperforming all baselines, including post-hoc detection methods, in\ndetecting machine-generated code text. 
Our code is available in\nhttps://github.com/hongcheki/sweet-watermark.\n","authors":["Taehyun Lee","Seokhee Hong","Jaewoo Ahn","Ilgee Hong","Hwaran Lee","Sangdoo Yun","Jamin Shin","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2305.15060v4.pdf","comment":"To be presented at ACL 2024"},{"id":"http://arxiv.org/abs/2405.00557v2","updated":"2024-07-03T15:04:25Z","published":"2024-05-01T15:06:05Z","title":"Mixture of insighTful Experts (MoTE): The Synergy of Thought Chains and\n Expert Mixtures in Self-Alignment","summary":" As the capabilities of large language models (LLMs) have expanded\ndramatically, aligning these models with human values presents a significant\nchallenge. Traditional alignment strategies rely heavily on human intervention,\nsuch as Supervised Fine-Tuning (SFT) and Reinforcement Learning from Human\nFeedback (RLHF), or on the self-alignment capacities of LLMs, which usually\nrequire a strong LLM's emergent ability to improve its original bad answer. To\naddress these challenges, we propose a novel self-alignment method that\nutilizes a Chain of Thought (CoT) approach, termed AlignCoT. This method\nencompasses stages of Question Analysis, Answer Guidance, and Safe Answer\nproduction. It is designed to enable LLMs to generate high-quality, safe\nresponses throughout various stages of their development. Furthermore, we\nintroduce the Mixture of insighTful Experts (MoTE) architecture, which applies\nmixture of experts to enhance each component of the AlignCoT process, markedly\nincreasing alignment efficiency. The MoTE approach not only outperforms\nexisting methods in aligning LLMs with human values but also highlights the\nbenefits of using self-generated data, revealing the dual benefits of improved\nalignment and training efficiency.\n","authors":["Zhili Liu","Yunhao Gou","Kai Chen","Lanqing Hong","Jiahui Gao","Fei Mi","Yu Zhang","Zhenguo Li","Xin Jiang","Qun Liu","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2405.00557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03181v1","updated":"2024-07-03T15:01:18Z","published":"2024-07-03T15:01:18Z","title":"Fine-Tuning with Divergent Chains of Thought Boosts Reasoning Through\n Self-Correction in Language Models","summary":" Requiring a Large Language Model to generate intermediary reasoning steps has\nbeen shown to be an effective way of boosting performance. In fact, it has been\nfound that instruction tuning on these intermediary reasoning steps improves\nmodel performance. In this work, we present a novel method of further improving\nperformance by requiring models to compare multiple reasoning chains before\ngenerating a solution in a single inference step. We call this method Divergent\nCoT (DCoT). We find that instruction tuning on DCoT datasets boosts the\nperformance of even smaller, and therefore more accessible, LLMs. Through a\nrigorous set of experiments spanning a wide range of tasks that require various\nreasoning types, we show that fine-tuning on DCoT consistently improves\nperformance over the CoT baseline across model families and scales (1.3B to\n70B). Through a combination of empirical and manual evaluation, we additionally\nshow that these performance gains stem from models generating multiple\ndivergent reasoning chains in a single inference step, indicative of the\nenabling of self-correction in language models. 
Our code and data are publicly\navailable at https://github.com/UKPLab/arxiv2024-divergent-cot.\n","authors":["Haritz Puerto","Tilek Chubakov","Xiaodan Zhu","Harish Tayyar Madabushi","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2407.03181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07212v2","updated":"2024-07-03T14:49:15Z","published":"2024-06-11T12:41:54Z","title":"Towards Human-AI Collaboration in Healthcare: Guided Deferral Systems\n with Large Language Models","summary":" Large language models (LLMs) present a valuable technology for various\napplications in healthcare, but their tendency to hallucinate introduces\nunacceptable uncertainty in critical decision-making situations. Human-AI\ncollaboration (HAIC) can mitigate this uncertainty by combining human and AI\nstrengths for better outcomes. This paper presents a novel guided deferral\nsystem that provides intelligent guidance when AI defers cases to human\ndecision-makers. We leverage LLMs' verbalisation capabilities and internal\nstates to create this system, demonstrating that fine-tuning small-scale LLMs\nwith data from large-scale LLMs greatly enhances performance while maintaining\ncomputational efficiency and data privacy. A pilot study showcases the\neffectiveness of our proposed deferral system.\n","authors":["Joshua Strong","Qianhui Men","Alison Noble"],"pdf_url":"https://arxiv.org/pdf/2406.07212v2.pdf","comment":"Accepted to ICML 2024 Workshop on Large Language Models and Cognition"},{"id":"http://arxiv.org/abs/2405.13022v2","updated":"2024-07-03T14:46:52Z","published":"2024-05-15T13:35:43Z","title":"LLMs can learn self-restraint through iterative self-reflection","summary":" In order to be deployed safely, Large Language Models (LLMs) must be capable\nof dynamically adapting their behavior based on their level of knowledge and\nuncertainty associated with specific topics. This adaptive behavior, which we\nrefer to as self-restraint, is non-trivial to teach since it depends on the\ninternal knowledge of an LLM. By default, LLMs are trained to maximize the next\ntoken likelihood, which does not teach the model to modulate its answer based\non its level of uncertainty. In order to learn self-restraint, we devise a\nutility function that can encourage the model to produce responses only when it\nis confident in them. This utility function can be used to score generation of\ndifferent length and abstention. To optimize this function, we introduce\nReSearch, a process of \"self-reflection\" consisting of iterative self-prompting\nand self-evaluation. We use the ReSearch algorithm to generate synthetic data\non which we finetune our models. Compared to their original versions, our\nresulting models generate fewer \\emph{hallucinations} overall at no additional\ninference cost, for both known and unknown topics, as the model learns to\nselectively restrain itself. 
In addition, our method elegantly incorporates the\nability to abstain by augmenting the samples generated by the model during the\nsearch procedure with an answer expressing abstention.\n","authors":["Alexandre Piché","Aristides Milios","Dzmitry Bahdanau","Chris Pal"],"pdf_url":"https://arxiv.org/pdf/2405.13022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03169v1","updated":"2024-07-03T14:42:49Z","published":"2024-07-03T14:42:49Z","title":"Investigating Decoder-only Large Language Models for Speech-to-text\n Translation","summary":" Large language models (LLMs), known for their exceptional reasoning\ncapabilities, generalizability, and fluency across diverse domains, present a\npromising avenue for enhancing speech-related tasks. In this paper, we focus on\nintegrating decoder-only LLMs to the task of speech-to-text translation (S2TT).\nWe propose a decoder-only architecture that enables the LLM to directly consume\nthe encoded speech representation and generate the text translation.\nAdditionally, we investigate the effects of different parameter-efficient\nfine-tuning techniques and task formulation. Our model achieves\nstate-of-the-art performance on CoVoST 2 and FLEURS among models trained\nwithout proprietary data. We also conduct analyses to validate the design\nchoices of our proposed model and bring insights to the integration of LLMs to\nS2TT.\n","authors":["Chao-Wei Huang","Hui Lu","Hongyu Gong","Hirofumi Inaguma","Ilia Kulikov","Ruslan Mavlyutov","Sravya Popuri"],"pdf_url":"https://arxiv.org/pdf/2407.03169v1.pdf","comment":"Accepted to Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.03160v1","updated":"2024-07-03T14:35:16Z","published":"2024-07-03T14:35:16Z","title":"SOS! Soft Prompt Attack Against Open-Source Large Language Models","summary":" Open-source large language models (LLMs) have become increasingly popular\namong both the general public and industry, as they can be customized,\nfine-tuned, and freely used. However, some open-source LLMs require approval\nbefore usage, which has led to third parties publishing their own easily\naccessible versions. Similarly, third parties have been publishing fine-tuned\nor quantized variants of these LLMs. These versions are particularly appealing\nto users because of their ease of access and reduced computational resource\ndemands. This trend has increased the risk of training time attacks,\ncompromising the integrity and security of LLMs. In this work, we present a new\ntraining time attack, SOS, which is designed to be low in computational demand\nand does not require clean data or modification of the model weights, thereby\nmaintaining the model's utility intact. The attack addresses security issues in\nvarious scenarios, including the backdoor attack, jailbreak attack, and prompt\nstealing attack. Our experimental findings demonstrate that the proposed attack\nis effective across all evaluated targets. 
Furthermore, we present the other\nside of our SOS technique, namely the copyright token -- a novel technique that\nenables users to mark their copyrighted content and prevent models from using\nit.\n","authors":["Ziqing Yang","Michael Backes","Yang Zhang","Ahmed Salem"],"pdf_url":"https://arxiv.org/pdf/2407.03160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03157v1","updated":"2024-07-03T14:34:03Z","published":"2024-07-03T14:34:03Z","title":"Let the Code LLM Edit Itself When You Edit the Code","summary":" In this work, we investigate a typical scenario in code generation where a\ndeveloper edits existing code in real time and requests a code assistant, e.g.,\na large language model, to re-predict the next token or next line on the fly.\nNaively, the LLM needs to re-encode the entire KV cache to provide an accurate\nprediction. However, this process is computationally expensive, especially when\nthe sequence length is long. Simply encoding the edited subsequence and\nintegrating it to the original KV cache meets the temporal confusion problem,\nleading to significantly worse performance. We address this efficiency and\naccuracy trade-off by introducing \\underline{\\textbf{Positional\n\\textbf{I}ntegrity \\textbf{E}ncoding} (PIE). Building upon the rotary\npositional encoding, PIE first removes the rotary matrices in the Key cache\nthat introduce temporal confusion and then reapplies the correct rotary\nmatrices. This process ensures that positional relationships between tokens are\ncorrect and requires only a single round of matrix multiplication. We validate\nthe effectiveness of PIE through extensive experiments on the RepoBench-C-8k\ndataset, utilizing DeepSeek-Coder models with 1.3B, 6.7B, and 33B parameters.\nOur evaluation includes three real-world coding tasks: code insertion, code\ndeletion, and multi-place code editing. Results demonstrate that PIE reduces\ncomputational overhead by over 85% compared to the standard full recomputation\napproach across all model sizes and tasks while well approximating the model\nperformance.\n","authors":["Zhenyu He","Jun Zhang","Shengjie Luo","Jingjing Xu","Zhi Zhang","Di He"],"pdf_url":"https://arxiv.org/pdf/2407.03157v1.pdf","comment":"Preprint. Work in Progress"},{"id":"http://arxiv.org/abs/2403.19887v2","updated":"2024-07-03T14:30:33Z","published":"2024-03-28T23:55:06Z","title":"Jamba: A Hybrid Transformer-Mamba Language Model","summary":" We present Jamba, a new base large language model based on a novel hybrid\nTransformer-Mamba mixture-of-experts (MoE) architecture. Specifically, Jamba\ninterleaves blocks of Transformer and Mamba layers, enjoying the benefits of\nboth model families. MoE is added in some of these layers to increase model\ncapacity while keeping active parameter usage manageable. This flexible\narchitecture allows resource- and objective-specific configurations. In the\nparticular configuration we have implemented, we end up with a powerful model\nthat fits in a single 80GB GPU. Built at large scale, Jamba provides high\nthroughput and small memory footprint compared to vanilla Transformers, and at\nthe same time state-of-the-art performance on standard language model\nbenchmarks and long-context evaluations. Remarkably, the model presents strong\nresults for up to 256K tokens context length. We study various architectural\ndecisions, such as how to combine Transformer and Mamba layers, and how to mix\nexperts, and show that some of them are crucial in large scale modeling. 
We\nalso describe several interesting properties of these architectures which the\ntraining and evaluation of Jamba have revealed, and plan to release checkpoints\nfrom various ablation runs, to encourage further exploration of this novel\narchitecture. We make the weights of our implementation of Jamba publicly\navailable under a permissive license.\n","authors":["Opher Lieber","Barak Lenz","Hofit Bata","Gal Cohen","Jhonathan Osin","Itay Dalmedigos","Erez Safahi","Shaked Meirom","Yonatan Belinkov","Shai Shalev-Shwartz","Omri Abend","Raz Alon","Tomer Asida","Amir Bergman","Roman Glozman","Michael Gokhman","Avashalom Manevich","Nir Ratner","Noam Rozen","Erez Shwartz","Mor Zusman","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2403.19887v2.pdf","comment":"Webpage: https://www.ai21.com/jamba"},{"id":"http://arxiv.org/abs/2407.03145v1","updated":"2024-07-03T14:23:36Z","published":"2024-07-03T14:23:36Z","title":"Enhancing Translation Accuracy of Large Language Models through\n Continual Pre-Training on Parallel Data","summary":" In this paper, we propose a two-phase training approach where pre-trained\nlarge language models are continually pre-trained on parallel data and then\nsupervised fine-tuned with a small amount of high-quality parallel data. To\ninvestigate the effectiveness of our proposed approach, we conducted continual\npre-training with a 3.8B-parameter model and parallel data across eight\ndifferent formats. We evaluate these methods on thirteen test sets for\nJapanese-to-English and English-to-Japanese translation. The results\ndemonstrate that when utilizing parallel data in continual pre-training, it is\nessential to alternate between source and target sentences. Additionally, we\ndemonstrated that the translation accuracy improves only for translation\ndirections where the order of source and target sentences aligns between\ncontinual pre-training data and inference. In addition, we demonstrate that the\nLLM-based translation model is more robust in translating spoken language and\nachieves higher accuracy with less training data compared to supervised\nencoder-decoder models. We also show that the highest accuracy is achieved when\nthe data for continual pre-training consists of interleaved source and target\nsentences and when tags are added to the source sentences.\n","authors":["Minato Kondo","Takehito Utsuro","Masaaki Nagata"],"pdf_url":"https://arxiv.org/pdf/2407.03145v1.pdf","comment":"IWSLT2024, 18 pages"},{"id":"http://arxiv.org/abs/2407.03132v1","updated":"2024-07-03T14:13:04Z","published":"2024-07-03T14:13:04Z","title":"Speaker- and Text-Independent Estimation of Articulatory Movements and\n Phoneme Alignments from Speech","summary":" This paper introduces a novel combination of two tasks, previously treated\nseparately: acoustic-to-articulatory speech inversion (AAI) and\nphoneme-to-articulatory (PTA) motion estimation. We refer to this joint task as\nacoustic phoneme-to-articulatory speech inversion (APTAI) and explore two\ndifferent approaches, both working speaker- and text-independently during\ninference. We use a multi-task learning setup, with the end-to-end goal of\ntaking raw speech as input and estimating the corresponding articulatory\nmovements, phoneme sequence, and phoneme alignment. While both proposed\napproaches share these same requirements, they differ in their way of achieving\nphoneme-related predictions: one is based on frame classification, the other on\na two-staged training procedure and forced alignment. 
We reach competitive\nperformance of 0.73 mean correlation for the AAI task and achieve up to\napproximately 87% frame overlap compared to a state-of-the-art text-dependent\nphoneme forced aligner.\n","authors":["Tobias Weise","Philipp Klumpp","Kubilay Can Demir","Paula Andrea Pérez-Toro","Maria Schuster","Elmar Noeth","Bjoern Heismann","Andreas Maier","Seung Hee Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03132v1.pdf","comment":"to be published in Interspeech 2024 proceedings"},{"id":"http://arxiv.org/abs/2407.03129v1","updated":"2024-07-03T14:12:04Z","published":"2024-07-03T14:12:04Z","title":"Social Bias Evaluation for Large Language Models Requires Prompt\n Variations","summary":" Warning: This paper contains examples of stereotypes and biases. Large\nLanguage Models (LLMs) exhibit considerable social biases, and various studies\nhave tried to evaluate and mitigate these biases accurately. Previous studies\nuse downstream tasks as prompts to examine the degree of social biases for\nevaluation and mitigation. While LLMs' output highly depends on prompts,\nprevious studies evaluating and mitigating bias have often relied on a limited\nvariety of prompts. In this paper, we investigate the sensitivity of LLMs when\nchanging prompt variations (task instruction and prompt, few-shot examples,\ndebias-prompt) by analyzing task performance and social bias of LLMs. Our\nexperimental results reveal that LLMs are highly sensitive to prompts to the\nextent that the ranking of LLMs fluctuates when comparing models for task\nperformance and social bias. Additionally, we show that LLMs have tradeoffs\nbetween performance and social bias caused by the prompts. Less bias from\nprompt setting may result in reduced performance. Moreover, the ambiguity of\ninstances is one of the reasons for this sensitivity to prompts in advanced\nLLMs, leading to various outputs. We recommend using diverse prompts, as in\nthis study, to compare the effects of prompts on social bias in LLMs.\n","authors":["Rem Hida","Masahiro Kaneko","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2407.03129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14322v2","updated":"2024-07-03T14:05:20Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP motivated by applications where it is\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04561v2","updated":"2024-07-03T14:01:52Z","published":"2023-09-08T19:27:01Z","title":"Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding","summary":" 3D visual grounding is the task of localizing the object in a 3D scene which\nis referred by a description in natural language. With a wide range of\napplications ranging from autonomous indoor robotics to AR/VR, the task has\nrecently risen in popularity. A common formulation to tackle 3D visual\ngrounding is grounding-by-detection, where localization is done via bounding\nboxes. However, for real-life applications that require physical interactions,\na bounding box insufficiently describes the geometry of an object. We therefore\ntackle the problem of dense 3D visual grounding, i.e. referral-based 3D\ninstance segmentation. We propose a dense 3D grounding network ConcreteNet,\nfeaturing four novel stand-alone modules that aim to improve grounding\nperformance for challenging repetitive instances, i.e. instances with\ndistractors of the same semantic class. First, we introduce a bottom-up\nattentive fusion module that aims to disambiguate inter-instance relational\ncues, next, we construct a contrastive training scheme to induce separation in\nthe latent space, we then resolve view-dependent utterances via a learned\nglobal camera token, and finally we employ multi-view ensembling to improve\nreferred mask quality. ConcreteNet ranks 1st on the challenging ScanRefer\nonline benchmark and has won the ICCV 3rd Workshop on Language for 3D Scenes\n\"3D Object Localization\" challenge.\n","authors":["Ozan Unal","Christos Sakaridis","Suman Saha","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2309.04561v2.pdf","comment":"Winner of the ICCV 2023 ScanRefer Challenge. Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2402.05369v2","updated":"2024-07-03T13:53:06Z","published":"2024-02-08T02:58:47Z","title":"Noise Contrastive Alignment of Language Models with Explicit Rewards","summary":" User intentions are typically formalized as evaluation rewards to be\nmaximized when fine-tuning language models (LMs). Existing alignment methods,\nsuch as Direct Preference Optimization (DPO), are mainly tailored for pairwise\npreference data where rewards are implicitly defined rather than explicitly\ngiven. In this paper, we introduce a general framework for LM alignment,\nleveraging Noise Contrastive Estimation (NCE) to bridge the gap in handling\nreward datasets explicitly annotated with scalar evaluations. Our framework\ncomprises two parallel algorithms, NCA and InfoNCA, both enabling the direct\nextraction of an LM policy from reward data as well as preference data.\nNotably, we show that the DPO loss is a special case of our proposed InfoNCA\nobjective under pairwise preference settings, thereby integrating and extending\ncurrent alignment theories. By comparing NCA and InfoNCA, we demonstrate that\nthe well-observed decreasing-likelihood trend of DPO/InfoNCA is caused by their\nfocus on adjusting relative likelihood across different responses. 
In contrast,\nNCA optimizes the absolute likelihood for each response, thereby effectively\npreventing the chosen likelihood from decreasing. We evaluate our methods in\nboth reward and preference settings with Mistral-8*7B and 7B models.\nExperiments suggest that InfoNCA/NCA surpasses various preference baselines\nwhen reward datasets are available. We also find NCA significantly outperforms\nDPO in complex reasoning tasks like math and coding.\n","authors":["Huayu Chen","Guande He","Lifan Yuan","Ganqu Cui","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.05369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13228v2","updated":"2024-07-03T13:46:33Z","published":"2024-02-20T18:42:34Z","title":"Smaug: Fixing Failure Modes of Preference Optimisation with DPO-Positive","summary":" Direct Preference Optimisation (DPO) is effective at significantly improving\nthe performance of large language models (LLMs) on downstream tasks such as\nreasoning, summarisation, and alignment. Using pairs of preferred and\ndispreferred data, DPO models the relative probability of picking one response\nover another. In this work, first we show theoretically that the standard DPO\nloss can lead to a reduction of the model's likelihood of the preferred\nexamples, as long as the relative probability between the preferred and\ndispreferred classes increases. We then show empirically that this phenomenon\noccurs when fine-tuning LLMs on common datasets, especially datasets in which\nthe edit distance between pairs of completions is low. Using these insights, we\ndesign DPO-Positive (DPOP), a new loss function and training procedure which\navoids this failure mode. Surprisingly, we find that DPOP outperforms DPO and\nother fine-tuning procedures across a wide variety of datasets and downstream\ntasks, including datasets with high edit distances between completions.\nFurthermore, we find that the DPOP-tuned model outperforms the DPO-tuned model\n(all else equal) on benchmarks independent of the fine-tuning data, such as\nMT-Bench. Finally, using DPOP, we create and open-source Smaug-34B and\nSmaug-72B, with the latter becoming the first open-source LLM to surpass an\naverage accuracy of 80% on the HuggingFace Open LLM Leaderboard.\n","authors":["Arka Pal","Deep Karkhanis","Samuel Dooley","Manley Roberts","Siddartha Naidu","Colin White"],"pdf_url":"https://arxiv.org/pdf/2402.13228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03104v1","updated":"2024-07-03T13:41:44Z","published":"2024-07-03T13:41:44Z","title":"KeyVideoLLM: Towards Large-scale Video Keyframe Selection","summary":" Recently, with the rise of web videos, managing and understanding large-scale\nvideo datasets has become increasingly important. Video Large Language Models\n(VideoLLMs) have emerged in recent years due to their strong video\nunderstanding capabilities. However, training and inference processes for\nVideoLLMs demand vast amounts of data, presenting significant challenges to\ndata management, particularly regarding efficiency, robustness, and\neffectiveness. In this work, we present KeyVideoLLM, a text-video frame\nsimilarity-based keyframe selection method designed to manage VideoLLM data\nefficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a\nremarkable data compression rate of up to 60.9 times, substantially lowering\ndisk space requirements, which proves its high efficiency. 
Additionally, it\nmaintains a 100% selection success rate across all video formats and scales,\nenhances processing speed by up to 200 times compared to existing keyframe\nselection methods, and does not require hyperparameter tuning. Beyond its\noutstanding efficiency and robustness, KeyVideoLLM further improves model\nperformance in video question-answering tasks during both training and\ninference stages. Notably, it consistently achieved the state-of-the-art (SoTA)\nexperimental results on diverse datasets.\n","authors":["Hao Liang","Jiapeng Li","Tianyi Bai","Chong Chen","Conghui He","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03103v1","updated":"2024-07-03T13:41:31Z","published":"2024-07-03T13:41:31Z","title":"Cactus: Towards Psychological Counseling Conversations using Cognitive\n Behavioral Theory","summary":" Recently, the demand for psychological counseling has significantly increased\nas more individuals express concerns about their mental health. This surge has\naccelerated efforts to improve the accessibility of counseling by using large\nlanguage models (LLMs) as counselors. To ensure client privacy, training\nopen-source LLMs faces a key challenge: the absence of realistic counseling\ndatasets. To address this, we introduce Cactus, a multi-turn dialogue dataset\nthat emulates real-life interactions using the goal-oriented and structured\napproach of Cognitive Behavioral Therapy (CBT). We create a diverse and\nrealistic dataset by designing clients with varied, specific personas, and\nhaving counselors systematically apply CBT techniques in their interactions. To\nassess the quality of our data, we benchmark against established psychological\ncriteria used to evaluate real counseling sessions, ensuring alignment with\nexpert evaluations. Experimental results demonstrate that Camel, a model\ntrained with Cactus, outperforms other models in counseling skills,\nhighlighting its effectiveness and potential as a counseling agent. We make our\ndata, model, and code publicly available.\n","authors":["Suyeon Lee","Sunghwan Kim","Minju Kim","Dongjin Kang","Dongil Yang","Harim Kim","Minseok Kang","Dayi Jung","Min Hee Kim","Seungbeen Lee","Kyoung-Mee Chung","Youngjae Yu","Dongha Lee","Jinyoung Yeo"],"pdf_url":"https://arxiv.org/pdf/2407.03103v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2406.16203v3","updated":"2024-07-03T13:18:50Z","published":"2024-06-23T19:49:10Z","title":"LLMs' Classification Performance is Overclaimed","summary":" In many classification tasks designed for AI or human to solve, gold labels\nare typically included within the label space by default, often posed as \"which\nof the following is correct?\" This standard setup has traditionally highlighted\nthe strong performance of advanced AI, particularly top-performing Large\nLanguage Models (LLMs), in routine classification tasks. However, when the gold\nlabel is intentionally excluded from the label space, it becomes evident that\nLLMs still attempt to select from the available label candidates, even when\nnone are correct. This raises a pivotal question: Do LLMs truly demonstrate\ntheir intelligence in understanding the essence of classification tasks?\n In this study, we evaluate both closed-source and open-source LLMs across\nrepresentative classification tasks, arguing that the perceived performance of\nLLMs is overstated due to their inability to exhibit the expected comprehension\nof the task. 
This paper makes a threefold contribution: i) To our knowledge,\nthis is the first work to identify the limitations of LLMs in classification\ntasks when gold labels are absent. We define this task as Classify-w/o-Gold and\npropose it as a new testbed for LLMs. ii) We introduce a benchmark, Know-No,\ncomprising two existing classification tasks and one new task, to evaluate\nClassify-w/o-Gold. iii) This work defines and advocates for a new evaluation\nmetric, OmniAccuracy, which assesses LLMs' performance in classification tasks\nboth when gold labels are present and absent.\n","authors":["Hanzi Xu","Renze Lou","Jiangshu Du","Vahid Mahzoon","Elmira Talebianaraki","Zhuoan Zhou","Elizabeth Garrison","Slobodan Vucetic","Wenpeng Yin"],"pdf_url":"https://arxiv.org/pdf/2406.16203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12213v3","updated":"2024-07-03T12:59:21Z","published":"2024-06-18T02:25:33Z","title":"LLM-Oracle Machines","summary":" Contemporary AI applications leverage large language models (LLMs) to harness\ntheir knowledge and reasoning abilities for natural language processing tasks.\nThis approach shares similarities with the concept of oracle Turing machines\n(OTMs). To capture the broader potential of these computations, including those\nnot yet realized, we propose an extension to OTMs: the LLM-oracle machine\n(LLM-OM), by employing a cluster of LLMs as the oracle. Each LLM acts as a\nblack box, capable of answering queries within its expertise, albeit with a\ndelay. We introduce four variants of the LLM-OM: basic, augmented,\nfault-avoidance, and $\\epsilon$-fault. The first two are commonly observed in\nexisting AI applications. The latter two are specifically designed to address\nthe challenges of LLM hallucinations, biases, and inconsistencies, aiming to\nensure reliable outcomes.\n","authors":["Jie Wang"],"pdf_url":"https://arxiv.org/pdf/2406.12213v3.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2406.18045v2","updated":"2024-07-03T12:56:40Z","published":"2024-06-26T03:43:09Z","title":"PharmaGPT: Domain-Specific Large Language Models for Bio-Pharmaceutical\n and Chemistry","summary":" Large language models (LLMs) have revolutionized Natural Language Processing\n(NLP) by minimizing the need for complex feature engineering. However, the\napplication of LLMs in specialized domains like biopharmaceuticals and\nchemistry remains largely unexplored. These fields are characterized by\nintricate terminologies, specialized knowledge, and a high demand for precision,\nareas where general-purpose LLMs often fall short. In this study, we introduce\nPharmGPT, a suite of multilingual LLMs with 13 billion and 70 billion\nparameters, specifically trained on a comprehensive corpus of hundreds of\nbillions of tokens tailored to the Bio-Pharmaceutical and Chemical sectors. Our\nevaluation shows that PharmGPT matches or surpasses existing general models on\nkey benchmarks, such as NAPLEX, demonstrating its exceptional capability in\ndomain-specific tasks. This advancement establishes a new benchmark for LLMs in\nthe Bio-Pharmaceutical and Chemical fields, addressing the existing gap in\nspecialized language modeling. 
Furthermore, this suggests a promising path for\nenhanced research and development in these specialized areas, paving the way\nfor more precise and effective applications of NLP in specialized domains.\n","authors":["Linqing Chen","Weilei Wang","Zilong Bai","Peng Xu","Yan Fang","Jie Fang","Wentao Wu","Lizhi Zhou","Ruiji Zhang","Yubin Xia","Chaobo Xu","Ran Hu","Licong Xu","Qijun Cai","Haoran Hua","Jing Sun","Jin Liu","Tian Qiu","Haowen Liu","Meng Hu","Xiuwen Li","Fei Gao","Yufu Wang","Lin Tie","Chaochao Wang","Jianping Lu","Cheng Sun","Yixin Wang","Shengjie Yang","Yuancheng Li","Lu Jin","Lisha Zhang","Fu Bian","Zhongkai Ye","Lidong Pei","Changyang Tu"],"pdf_url":"https://arxiv.org/pdf/2406.18045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03076v1","updated":"2024-07-03T12:50:49Z","published":"2024-07-03T12:50:49Z","title":"A Case Study on Context-Aware Neural Machine Translation with Multi-Task\n Learning","summary":" In document-level neural machine translation (DocNMT), multi-encoder\napproaches are common in encoding context and source sentences. Recent studies\n\\cite{li-etal-2020-multi-encoder} have shown that the context encoder generates\nnoise and makes the model robust to the choice of context. This paper further\ninvestigates this observation by explicitly modelling context encoding through\nmulti-task learning (MTL) to make the model sensitive to the choice of context.\nWe conduct experiments on cascade MTL architecture, which consists of one\nencoder and two decoders. Generation of the source from the context is\nconsidered an auxiliary task, and generation of the target from the source is\nthe main task. We experimented with German--English language pairs on News,\nTED, and Europarl corpora. Evaluation results show that the proposed MTL\napproach performs better than concatenation-based and multi-encoder DocNMT\nmodels in low-resource settings and is sensitive to the choice of context.\nHowever, we observe that the MTL models are failing to generate the source from\nthe context. These observations align with the previous studies, and this might\nsuggest that the available document-level parallel corpora are not\ncontext-aware, and a robust sentence-level model can outperform the\ncontext-aware models.\n","authors":["Ramakrishna Appicharla","Baban Gain","Santanu Pal","Asif Ekbal","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2407.03076v1.pdf","comment":"Accepted to EAMT 2024 (poster)"},{"id":"http://arxiv.org/abs/2407.01080v2","updated":"2024-07-03T12:49:34Z","published":"2024-07-01T08:35:04Z","title":"Face4RAG: Factual Consistency Evaluation for Retrieval Augmented\n Generation in Chinese","summary":" The prevailing issue of factual inconsistency errors in conventional\nRetrieval Augmented Generation (RAG) motivates the study of Factual Consistency\nEvaluation (FCE). Despite the various FCE methods proposed earlier, these\nmethods are evaluated on datasets generated by specific Large Language Models\n(LLMs). Without a comprehensive benchmark, it remains unexplored how these FCE\nmethods perform on other LLMs with different error distributions or even unseen\nerror types, as these methods may fail to detect the error types generated by\nother LLMs. To fill this gap, in this paper, we propose the first comprehensive\nFCE benchmark \\emph{Face4RAG} for RAG independent of the underlying LLM. 
Our\nbenchmark consists of a synthetic dataset built upon a carefully designed\ntypology for factuality inconsistency error and a real-world dataset\nconstructed from six commonly used LLMs, enabling evaluation of FCE methods on\nspecific error types or real-world error distributions. On the proposed\nbenchmark, we discover the failure of existing FCE methods to detect the\nlogical fallacy, which refers to a mismatch of logic structures between the\nanswer and the retrieved reference. To fix this issue, we further propose a new\nmethod called \\emph{L-Face4RAG} with two novel designs of logic-preserving\nanswer decomposition and fact-logic FCE. Extensive experiments show L-Face4RAG\nsubstantially outperforms previous methods for factual inconsistency detection\non a wide range of tasks, notably beyond the RAG task from which it is\noriginally motivated. Both the benchmark and our proposed method are publicly\navailable.\\footnote{\\url{https://huggingface.co/datasets/yq27/Face4RAG}\\label{link_face4rag}}\n","authors":["Yunqi Xu","Tianchi Cai","Jiyan Jiang","Xierui Song"],"pdf_url":"https://arxiv.org/pdf/2407.01080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21028v2","updated":"2024-07-03T12:49:23Z","published":"2024-05-31T17:16:38Z","title":"LACIE: Listener-Aware Finetuning for Confidence Calibration in Large\n Language Models","summary":" When answering questions, LLMs can convey not only an answer, but a level of\nconfidence about the answer being correct. This includes explicit confidence\nmarkers (e.g. giving a numeric score) as well as implicit markers, like an\nauthoritative tone or elaborating with additional knowledge. For LLMs to be\ntrustworthy knowledge sources, the confidence they convey should match their\nactual expertise; however, most current models tend towards overconfidence. To\ncalibrate both implicit and explicit confidence markers, we introduce a\npragmatic, listener-aware finetuning method (LACIE) that models the listener,\nconsidering not only whether an answer is right, but whether it will be\naccepted by a listener. We cast calibration as preference optimization,\ncreating data via a two-agent game, where a speaker model's outputs are judged\nby a simulated listener. We then finetune three LLMs (Mistral-7B, Llama3-8B,\nLlama3-70B) with LACIE, and show that the resulting models are better\ncalibrated w.r.t. a simulated listener. Crucially, these trends transfer to\nhuman listeners, helping them correctly predict model correctness: we conduct a\nhuman evaluation where annotators accept or reject an LLM's answers, finding\nthat training with LACIE results in 47% fewer incorrect answers being accepted\nwhile maintaining the same level of acceptance for correct answers.\nFurthermore, LACIE generalizes to another dataset, resulting in a large\nincrease in truthfulness on TruthfulQA when trained on TriviaQA. Our analysis\nindicates that LACIE leads to a better confidence separation between correct\nand incorrect examples. Qualitatively, we find that a LACIE-trained model\nhedges more and implicitly signals certainty when it is correct by using an\nauthoritative tone or including details. Finally, LACIE finetuning leads to an\nemergent increase in model abstention (e.g. saying \"I don't know\") for answers\nthat are likely wrong.\n","authors":["Elias Stengel-Eskin","Peter Hase","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2405.21028v2.pdf","comment":"18 pages. 
Code: https://github.com/esteng/pragmatic_calibration"},{"id":"http://arxiv.org/abs/2407.03061v1","updated":"2024-07-03T12:34:45Z","published":"2024-07-03T12:34:45Z","title":"ALTER: Augmentation for Large-Table-Based Reasoning","summary":" While extensive research has explored the use of large language models (LLMs)\nfor table-based reasoning, most approaches struggle with scalability when\napplied to large tables. To maintain the superior comprehension abilities of\nLLMs in these scenarios, we introduce ALTER(Augmentation for Large-Table-Based\nReasoning)-a framework designed to harness the latent augmentation potential in\nboth free-form natural language (NL) questions, via the query augmentor, and\nsemi-structured tabular data, through the table augmentor. By utilizing only a\nsmall subset of relevant data from the table and supplementing it with\npre-augmented schema, semantic, and literal information, ALTER achieves\noutstanding performance on table-based reasoning benchmarks. We also provide a\ndetailed analysis of large-table scenarios, comparing different methods and\nvarious partitioning principles. In these scenarios, our method outperforms all\nother approaches and exhibits robustness and efficiency against perturbations.\n","authors":["Han Zhang","Yuheng Ma","Hanfang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03051v1","updated":"2024-07-03T12:19:06Z","published":"2024-07-03T12:19:06Z","title":"Improving Conversational Abilities of Quantized Large Language Models\n via Direct Preference Alignment","summary":" The rapid advancement of large language models (LLMs) has facilitated their\ntransformation into conversational chatbots that can grasp contextual nuances\nand generate pertinent sentences, closely mirroring human values through\nadvanced techniques such as instruction tuning and reinforcement learning from\nhuman feedback (RLHF). However, the computational efficiency required for LLMs,\nachieved through techniques like post-training quantization (PTQ), presents\nchallenges such as token-flipping that can impair chatbot performance. In\nresponse, we propose a novel preference alignment approach, quantization-aware\ndirect preference optimization (QDPO), that aligns quantized LLMs with their\nfull-precision counterparts, improving conversational abilities. Evaluated on\ntwo instruction-tuned LLMs in various languages, QDPO demonstrated superior\nperformance in improving conversational abilities compared to established PTQ\nand knowledge-distillation fine-tuning techniques, marking a significant step\nforward in the development of efficient and effective conversational LLMs.\n","authors":["Janghwan Lee","Seongmin Park","Sukjin Hong","Minsoo Kim","Du-Seong Chang","Jungwook Choi"],"pdf_url":"https://arxiv.org/pdf/2407.03051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03045v1","updated":"2024-07-03T12:10:41Z","published":"2024-07-03T12:10:41Z","title":"JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts\n Discovery from Large-Scale Human-LLM Conversational Datasets","summary":" Large Language Models (LLMs) have gained significant attention but also\nraised concerns due to the risk of misuse. Jailbreak prompts, a popular type of\nadversarial attack towards LLMs, have appeared and constantly evolved to breach\nthe safety protocols of LLMs. To address this issue, LLMs are regularly updated\nwith safety patches based on reported jailbreak prompts. 
However, malicious\nusers often keep their successful jailbreak prompts private to exploit LLMs. To\nuncover these private jailbreak prompts, extensive analysis of large-scale\nconversational datasets is necessary to identify prompts that still manage to\nbypass the system's defenses. This task is highly challenging due to the\nimmense volume of conversation data, diverse characteristics of jailbreak\nprompts, and their presence in complex multi-turn conversations. To tackle\nthese challenges, we introduce JailbreakHunter, a visual analytics approach for\nidentifying jailbreak prompts in large-scale human-LLM conversational datasets.\nWe have designed a workflow with three analysis levels: group-level,\nconversation-level, and turn-level. Group-level analysis enables users to grasp\nthe distribution of conversations and identify suspicious conversations using\nmultiple criteria, such as similarity with reported jailbreak prompts in\nprevious research and attack success rates. Conversation-level analysis\nfacilitates the understanding of the progress of conversations and helps\ndiscover jailbreak prompts within their conversation contexts. Turn-level\nanalysis allows users to explore the semantic similarity and token overlap\nbetween a singleturn prompt and the reported jailbreak prompts, aiding in the\nidentification of new jailbreak strategies. The effectiveness and usability of\nthe system were verified through multiple case studies and expert interviews.\n","authors":["Zhihua Jin","Shiyi Liu","Haotian Li","Xun Zhao","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2407.03045v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.03040v1","updated":"2024-07-03T12:04:10Z","published":"2024-07-03T12:04:10Z","title":"Raw Text is All you Need: Knowledge-intensive Multi-turn Instruction\n Tuning for Large Language Model","summary":" Instruction tuning as an effective technique aligns the outputs of large\nlanguage models (LLMs) with human preference. But how to generate the seasonal\nmulti-turn dialogues from raw documents for instruction tuning still requires\nfurther exploration. In this paper, we present a novel framework named R2S that\nleverages the CoD-Chain of Dialogue logic to guide large language models (LLMs)\nin generating knowledge-intensive multi-turn dialogues for instruction tuning.\nBy integrating raw documents from both open-source datasets and domain-specific\nweb-crawled documents into a benchmark K-BENCH, we cover diverse areas such as\nWikipedia (English), Science (Chinese), and Artifacts (Chinese). Our approach\nfirst decides the logic flow of the current dialogue and then prompts LLMs to\nproduce key phrases for sourcing relevant response content. This methodology\nenables the creation of the G I NSTRUCT instruction dataset, retaining raw\ndocument knowledge within dialoguestyle interactions. Utilizing this dataset,\nwe fine-tune GLLM, a model designed to transform raw documents into structured\nmulti-turn dialogues, thereby injecting comprehensive domain knowledge into the\nSFT model for enhanced instruction tuning. 
This work signifies a stride towards\nrefining the adaptability and effectiveness of LLMs in processing and\ngenerating more accurate, contextually nuanced responses across various fields.\n","authors":["Xia Hou","Qifeng Li","Jian Yang","Tongliang Li","Linzheng Chai","Xianjie Wu","Hangyuan Ji","Zhoujun Li","Jixuan Nie","Jingbo Dun","Wenfeng Song"],"pdf_url":"https://arxiv.org/pdf/2407.03040v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.03038v1","updated":"2024-07-03T12:02:24Z","published":"2024-07-03T12:02:24Z","title":"On the Client Preference of LLM Fine-tuning in Federated Learning","summary":" Reinforcement learning with human feedback (RLHF) fine-tunes a pretrained\nlarge language model (LLM) using preference datasets, enabling the LLM to\ngenerate outputs that align with human preferences. Given the sensitive nature\nof these preference datasets held by various clients, there is a need to\nimplement RLHF within a federated learning (FL) framework, where clients are\nreluctant to share their data due to privacy concerns. To address this, we\nintroduce a feasible framework in which clients collaboratively train a binary\nselector with their preference datasets using our proposed FedBis. With a\nwell-trained selector, we can further enhance the LLM that generates\nhuman-preferred completions. Meanwhile, we propose a novel algorithm,\nFedBiscuit, that trains multiple selectors by organizing clients into balanced\nand disjoint clusters based on their preferences. Compared to the FedBis,\nFedBiscuit demonstrates superior performance in simulating human preferences\nfor pairwise completions. Our extensive experiments on federated human\npreference datasets -- marking the first benchmark to address heterogeneous\ndata partitioning among clients -- demonstrate that FedBiscuit outperforms\nFedBis and even surpasses traditional centralized training.\n","authors":["Feijie Wu","Xiaoze Liu","Haoyu Wang","Xingchen Wang","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2407.03038v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.00081v3","updated":"2024-07-03T12:00:37Z","published":"2023-07-31T18:53:47Z","title":"Towards Semantically Enriched Embeddings for Knowledge Graph Completion","summary":" Embedding based Knowledge Graph (KG) Completion has gained much attention\nover the past few years. Most of the current algorithms consider a KG as a\nmultidirectional labeled graph and lack the ability to capture the semantics\nunderlying the schematic information. In a separate development, a vast amount\nof information has been captured within the Large Language Models (LLMs) which\nhas revolutionized the field of Artificial Intelligence. KGs could benefit from\nthese LLMs and vice versa. This vision paper discusses the existing algorithms\nfor KG completion based on the variations for generating KG embeddings. It\nstarts with discussing various KG completion algorithms such as transductive\nand inductive link prediction and entity type prediction algorithms. It then\nmoves on to the algorithms utilizing type information within the KGs, LLMs, and\nfinally to algorithms capturing the semantics represented in different\ndescription logic axioms. 
We conclude the paper with a critical reflection on\nthe current state of work in the community and give recommendations for future\ndirections.\n","authors":["Mehwish Alam","Frank van Harmelen","Maribel Acosta"],"pdf_url":"https://arxiv.org/pdf/2308.00081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03032v1","updated":"2024-07-03T11:54:11Z","published":"2024-07-03T11:54:11Z","title":"Strategies for Arabic Readability Modeling","summary":" Automatic readability assessment is relevant to building NLP applications for\neducation, content analysis, and accessibility. However, Arabic readability\nassessment is a challenging task due to Arabic's morphological richness and\nlimited readability resources. In this paper, we present a set of experimental\nresults on Arabic readability assessment using a diverse range of approaches,\nfrom rule-based methods to Arabic pretrained language models. We report our\nresults on a newly created corpus at different textual granularity levels\n(words and sentence fragments). Our results show that combining different\ntechniques yields the best results, achieving an overall macro F1 score of 86.7\nat the word level and 87.9 at the fragment level on a blind test set. We make\nour code, data, and pretrained models publicly available.\n","authors":["Juan Piñeros Liberato","Bashar Alhafni","Muhamed Al Khalil","Nizar Habash"],"pdf_url":"https://arxiv.org/pdf/2407.03032v1.pdf","comment":"Accepted to ArabicNLP 2024, ACL"},{"id":"http://arxiv.org/abs/2407.03020v1","updated":"2024-07-03T11:30:03Z","published":"2024-07-03T11:30:03Z","title":"Exploiting Dialect Identification in Automatic Dialectal Text\n Normalization","summary":" Dialectal Arabic is the primary spoken language used by native Arabic\nspeakers in daily communication. The rise of social media platforms has notably\nexpanded its use as a written language. However, Arabic dialects do not have\nstandard orthographies. This, combined with the inherent noise in\nuser-generated content on social media, presents a major challenge to NLP\napplications dealing with Dialectal Arabic. In this paper, we explore and\nreport on the task of CODAfication, which aims to normalize Dialectal Arabic\ninto the Conventional Orthography for Dialectal Arabic (CODA). We work with a\nunique parallel corpus of multiple Arabic dialects focusing on five major city\ndialects. We benchmark newly developed pretrained sequence-to-sequence models\non the task of CODAfication. We further show that using dialect identification\ninformation improves the performance across all dialects. We make our code,\ndata, and pretrained models publicly available.\n","authors":["Bashar Alhafni","Sarah Al-Towaity","Ziyad Fawzy","Fatema Nassar","Fadhl Eryani","Houda Bouamor","Nizar Habash"],"pdf_url":"https://arxiv.org/pdf/2407.03020v1.pdf","comment":"Accepted to ArabicNLP 2024, ACL"},{"id":"http://arxiv.org/abs/2402.01781v2","updated":"2024-07-03T11:20:43Z","published":"2024-02-01T19:12:25Z","title":"When Benchmarks are Targets: Revealing the Sensitivity of Large Language\n Model Leaderboards","summary":" Large Language Model (LLM) leaderboards based on benchmark rankings are\nregularly used to guide practitioners in model selection. Often, the published\nleaderboard rankings are taken at face value - we show this is a (potentially\ncostly) mistake. Under existing leaderboards, the relative performance of LLMs\nis highly sensitive to (often minute) details. 
We show that for popular\nmultiple-choice question benchmarks (e.g., MMLU), minor perturbations to the\nbenchmark, such as changing the order of choices or the method of answer\nselection, result in changes in rankings up to 8 positions. We explain this\nphenomenon by conducting systematic experiments over three broad categories of\nbenchmark perturbations and identifying the sources of this behavior. Our\nanalysis results in several best-practice recommendations, including the\nadvantage of a hybrid scoring method for answer selection. Our study highlights\nthe dangers of relying on simple benchmark evaluations and charts the path for\nmore robust evaluation schemes on the existing benchmarks. The code for this\npaper is available at\nhttps://github.com/National-Center-for-AI-Saudi-Arabia/lm-evaluation-harness.\n","authors":["Norah Alzahrani","Hisham Abdullah Alyahya","Yazeed Alnumay","Sultan Alrashed","Shaykhah Alsubaie","Yusef Almushaykeh","Faisal Mirza","Nouf Alotaibi","Nora Altwairesh","Areeb Alowisheq","M Saiful Bari","Haidar Khan"],"pdf_url":"https://arxiv.org/pdf/2402.01781v2.pdf","comment":"updated with ACL 2024 camera ready version"},{"id":"http://arxiv.org/abs/2407.03007v1","updated":"2024-07-03T11:06:05Z","published":"2024-07-03T11:06:05Z","title":"What Affects the Stability of Tool Learning? An Empirical Study on the\n Robustness of Tool Learning Frameworks","summary":" Tool learning methods have enhanced the ability of large language models\n(LLMs) to interact with real-world applications. Many existing works fine-tune\nLLMs or design prompts to enable LLMs to select appropriate tools and correctly\ninvoke them to meet user requirements. However, it is observed in previous\nworks that the performance of tool learning varies from tasks, datasets,\ntraining settings, and algorithms. Without understanding the impact of these\nfactors, it can lead to inconsistent results, inefficient model deployment, and\nsuboptimal tool utilization, ultimately hindering the practical integration and\nscalability of LLMs in real-world scenarios. Therefore, in this paper, we\nexplore the impact of both internal and external factors on the performance of\ntool learning frameworks. Through extensive experiments on two benchmark\ndatasets, we find several insightful conclusions for future work, including the\nobservation that LLMs can benefit significantly from increased trial and\nexploration. We believe our empirical study provides a new perspective for\nfuture tool learning research.\n","authors":["Chengrui Huang","Zhengliang Shi","Yuntao Wen","Xiuying Chen","Peng Han","Shen Gao","Shuo Shang"],"pdf_url":"https://arxiv.org/pdf/2407.03007v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.03005v1","updated":"2024-07-03T11:04:31Z","published":"2024-07-03T11:04:31Z","title":"Human-like Linguistic Biases in Neural Speech Models: Phonetic\n Categorization and Phonotactic Constraints in Wav2Vec2.0","summary":" What do deep neural speech models know about phonology? Existing work has\nexamined the encoding of individual linguistic units such as phonemes in these\nmodels. Here we investigate interactions between units. Inspired by classic\nexperiments on human speech perception, we study how Wav2Vec2 resolves\nphonotactic constraints. We synthesize sounds on an acoustic continuum between\n/l/ and /r/ and embed them in controlled contexts where only /l/, only /r/, or\nneither occur in English. 
Like humans, Wav2Vec2 models show a bias towards the\nphonotactically admissable category in processing such ambiguous sounds. Using\nsimple measures to analyze model internals on the level of individual stimuli,\nwe find that this bias emerges in early layers of the model's Transformer\nmodule. This effect is amplified by ASR finetuning but also present in fully\nself-supervised models. Our approach demonstrates how controlled stimulus\ndesigns can help localize specific linguistic knowledge in neural speech\nmodels.\n","authors":["Marianne de Heer Kloots","Willem Zuidema"],"pdf_url":"https://arxiv.org/pdf/2407.03005v1.pdf","comment":"Accepted to Interspeech 2024. For code and materials, see\n https://github.com/mdhk/phonotactic-sensitivity"},{"id":"http://arxiv.org/abs/2407.03004v1","updated":"2024-07-03T11:02:12Z","published":"2024-07-03T11:02:12Z","title":"SemioLLM: Assessing Large Language Models for Semiological Analysis in\n Epilepsy Research","summary":" Large Language Models have shown promising results in their ability to encode\ngeneral medical knowledge in standard medical question-answering datasets.\nHowever, their potential application in clinical practice requires evaluation\nin domain-specific tasks, where benchmarks are largely missing. In this study\nsemioLLM, we test the ability of state-of-the-art LLMs (GPT-3.5, GPT-4, Mixtral\n8x7B, and Qwen-72chat) to leverage their internal knowledge and reasoning for\nepilepsy diagnosis. Specifically, we obtain likelihood estimates linking\nunstructured text descriptions of seizures to seizure-generating brain regions,\nusing an annotated clinical database containing 1269 entries. We evaluate the\nLLM's performance, confidence, reasoning, and citation abilities in comparison\nto clinical evaluation. Models achieve above-chance classification performance\nwith prompt engineering significantly improving their outcome, with some models\nachieving close-to-clinical performance and reasoning. However, our analyses\nalso reveal significant pitfalls with several models being overly confident\nwhile showing poor performance, as well as exhibiting citation errors and\nhallucinations. In summary, our work provides the first extensive benchmark\ncomparing current SOTA LLMs in the medical domain of epilepsy and highlights\ntheir ability to leverage unstructured texts from patients' medical history to\naid diagnostic processes in health care.\n","authors":["Meghal Dani","Muthu Jeyanthi Prakash","Zeynep Akata","Stefanie Liebe"],"pdf_url":"https://arxiv.org/pdf/2407.03004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03000v1","updated":"2024-07-03T10:59:06Z","published":"2024-07-03T10:59:06Z","title":"VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values","summary":" This paper introduces VIVA, a benchmark for VIsion-grounded decision-making\ndriven by human VAlues. While most large vision-language models (VLMs) focus on\nphysical-level skills, our work is the first to examine their multimodal\ncapabilities in leveraging human values to make decisions under a\nvision-depicted situation. VIVA contains 1,062 images depicting diverse\nreal-world situations and the manually annotated decisions grounded in them.\nGiven an image there, the model should select the most appropriate action to\naddress the situation and provide the relevant human values and reason\nunderlying the decision. Extensive experiments based on VIVA show the\nlimitation of VLMs in using human values to make multimodal decisions. 
Further\nanalyses indicate the potential benefits of exploiting action consequences and\npredicted human values.\n","authors":["Zhe Hu","Yixiao Ren","Jing Li","Yu Yin"],"pdf_url":"https://arxiv.org/pdf/2407.03000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02996v1","updated":"2024-07-03T10:53:54Z","published":"2024-07-03T10:53:54Z","title":"Are Large Language Models Consistent over Value-laden Questions?","summary":" Large language models (LLMs) appear to bias their survey answers toward\ncertain values. Nonetheless, some argue that LLMs are too inconsistent to\nsimulate particular values. Are they? To answer, we first define value\nconsistency as the similarity of answers across (1) paraphrases of one\nquestion, (2) related questions under one topic, (3) multiple-choice and\nopen-ended use-cases of one question, and (4) multilingual translations of a\nquestion to English, Chinese, German, and Japanese. We apply these measures to\na few large ($>=34b$), open LLMs including llama-3, as well as gpt-4o, using\neight thousand questions spanning more than 300 topics. Unlike prior work, we\nfind that models are relatively consistent across paraphrases, use-cases,\ntranslations, and within a topic. Still, some inconsistencies remain. Models\nare more consistent on uncontroversial topics (e.g., in the U.S.,\n\"Thanksgiving\") than on controversial ones (\"euthanasia\"). Base models are both\nmore consistent compared to fine-tuned models and are uniform in their\nconsistency across topics, while fine-tuned models are more inconsistent about\nsome topics (\"euthanasia\") than others (\"women's rights\") like our human\nsubjects (n=165).\n","authors":["Jared Moore","Tanvi Deshpande","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02996v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.02987v1","updated":"2024-07-03T10:38:40Z","published":"2024-07-03T10:38:40Z","title":"LoRA-Guard: Parameter-Efficient Guardrail Adaptation for Content\n Moderation of Large Language Models","summary":" Guardrails have emerged as an alternative to safety alignment for content\nmoderation of large language models (LLMs). Existing model-based guardrails\nhave not been designed for resource-constrained computational portable devices,\nsuch as mobile phones, more and more of which are running LLM-based\napplications locally. We introduce LoRA-Guard, a parameter-efficient guardrail\nadaptation method that relies on knowledge sharing between LLMs and guardrail\nmodels. LoRA-Guard extracts language features from the LLMs and adapts them for\nthe content moderation task using low-rank adapters, while a dual-path design\nprevents any performance degradation on the generative task. We show that\nLoRA-Guard outperforms existing approaches with 100-1000x lower parameter\noverhead while maintaining accuracy, enabling on-device content moderation.\n","authors":["Hayder Elesedy","Pedro M. Esperança","Silviu Vlad Oprea","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2407.02987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02978v1","updated":"2024-07-03T10:22:23Z","published":"2024-07-03T10:22:23Z","title":"Mast Kalandar at SemEval-2024 Task 8: On the Trail of Textual Origins:\n RoBERTa-BiLSTM Approach to Detect AI-Generated Text","summary":" Large Language Models (LLMs) have showcased impressive abilities in\ngenerating fluent responses to diverse user queries. However, concerns\nregarding the potential misuse of such texts in journalism, educational, and\nacademic contexts have surfaced. 
SemEval 2024 introduces the task of\nMultigenerator, Multidomain, and Multilingual Black-Box Machine-Generated Text\nDetection, aiming to develop automated systems for identifying\nmachine-generated text and detecting potential misuse. In this paper, we i)\npropose a RoBERTa-BiLSTM based classifier designed to classify text into two\ncategories: AI-generated or human ii) conduct a comparative study of our model\nwith baseline approaches to evaluate its effectiveness. This paper contributes\nto the advancement of automatic text detection systems in addressing the\nchallenges posed by machine-generated text misuse. Our architecture ranked 46th\non the official leaderboard with an accuracy of 80.83 among 125.\n","authors":["Jainit Sushil Bafna","Hardik Mittal","Suyash Sethia","Manish Shrivastava","Radhika Mamidi"],"pdf_url":"https://arxiv.org/pdf/2407.02978v1.pdf","comment":"SemEval-2024"},{"id":"http://arxiv.org/abs/2407.02977v1","updated":"2024-07-03T10:21:27Z","published":"2024-07-03T10:21:27Z","title":"Large Language Models as Evaluators for Scientific Synthesis","summary":" Our study explores how well the state-of-the-art Large Language Models\n(LLMs), like GPT-4 and Mistral, can assess the quality of scientific summaries\nor, more fittingly, scientific syntheses, comparing their evaluations to those\nof human annotators. We used a dataset of 100 research questions and their\nsyntheses made by GPT-4 from abstracts of five related papers, checked against\nhuman quality ratings. The study evaluates both the closed-source GPT-4 and the\nopen-source Mistral model's ability to rate these summaries and provide reasons\nfor their judgments. Preliminary results show that LLMs can offer logical\nexplanations that somewhat match the quality ratings, yet a deeper statistical\nanalysis shows a weak correlation between LLM and human ratings, suggesting the\npotential and current limitations of LLMs in scientific synthesis evaluation.\n","authors":["Julia Evans","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2407.02977v1.pdf","comment":"4 pages, forthcoming as part of the KONVENS 2024 proceedings\n https://konvens-2024.univie.ac.at/"},{"id":"http://arxiv.org/abs/2406.06331v2","updated":"2024-07-03T10:13:56Z","published":"2024-06-10T14:47:04Z","title":"MedExQA: Medical Question Answering Benchmark with Multiple Explanations","summary":" This paper introduces MedExQA, a novel benchmark in medical\nquestion-answering, to evaluate large language models' (LLMs) understanding of\nmedical knowledge through explanations. By constructing datasets across five\ndistinct medical specialties that are underrepresented in current datasets and\nfurther incorporating multiple explanations for each question-answer pair, we\naddress a major gap in current medical QA benchmarks which is the absence of\ncomprehensive assessments of LLMs' ability to generate nuanced medical\nexplanations. Our work highlights the importance of explainability in medical\nLLMs, proposes an effective methodology for evaluating models beyond\nclassification accuracy, and sheds light on one specific domain, speech\nlanguage pathology, where current LLMs including GPT4 lack good understanding.\nOur results show generation evaluation with multiple explanations aligns better\nwith human assessment, highlighting an opportunity for a more robust automated\ncomprehension assessment for LLMs. 
To diversify open-source medical LLMs\n(currently mostly based on Llama2), this work also proposes a new medical\nmodel, MedPhi-2, based on Phi-2 (2.7B). The model outperformed medical LLMs\nbased on Llama2-70B in generating explanations, showing its effectiveness in\nthe resource-constrained medical domain. We will share our benchmark datasets\nand the trained model.\n","authors":["Yunsoo Kim","Jinge Wu","Yusuf Abdulle","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2406.06331v2.pdf","comment":"Accepted to ACL2024 BioNLP Workshop"},{"id":"http://arxiv.org/abs/2407.02964v1","updated":"2024-07-03T10:01:01Z","published":"2024-07-03T10:01:01Z","title":"FSM: A Finite State Machine Based Zero-Shot Prompting Paradigm for\n Multi-Hop Question Answering","summary":" Large Language Models (LLMs) with chain-of-thought (COT) prompting have\ndemonstrated impressive abilities on simple nature language inference tasks.\nHowever, they tend to perform poorly on Multi-hop Question Answering (MHQA)\ntasks due to several challenges, including hallucination, error propagation and\nlimited context length. We propose a prompting method, Finite State Machine\n(FSM) to enhance the reasoning capabilities of LLM for complex tasks in\naddition to improved effectiveness and trustworthiness. Different from COT\nmethods, FSM addresses MHQA by iteratively decomposing a question into\nmulti-turn sub-questions, and self-correcting in time, improving the accuracy\nof answers in each step. Specifically, FSM addresses one sub-question at a time\nand decides on the next step based on its current result and state, in an\nautomaton-like format. Experiments on benchmarks show the effectiveness of our\nmethod. Although our method performs on par with the baseline on relatively\nsimpler datasets, it excels on challenging datasets like Musique. Moreover,\nthis approach mitigates the hallucination phenomenon, wherein the correct final\nanswer can be recovered despite errors in intermediate reasoning. Furthermore,\nour method improves LLMs' ability to follow specified output format\nrequirements, significantly reducing the difficulty of answer interpretation\nand the need for reformatting.\n","authors":["Xiaochen Wang","Junqing He","Zhe yang","Yiru Wang","Xiangdi Meng","Kunhao Pan","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2407.02964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01257v2","updated":"2024-07-03T09:54:08Z","published":"2024-07-01T13:07:01Z","title":"uDistil-Whisper: Label-Free Data Filtering for Knowledge Distillation\n via Large-Scale Pseudo Labelling","summary":" Recent work on distilling Whisper's knowledge into small models using\npseudo-labels shows promising performance while reducing the size by up to\n50\\%. This results in small, efficient, and dedicated models. However, a\ncritical step of distillation from pseudo-labels involves filtering\nhigh-quality predictions and using only those during training. This step\nrequires ground truth to compare and filter bad examples making the whole\nprocess supervised. In addition to that, the distillation process requires a\nlarge amount of data thereby limiting the ability to distil models in\nlow-resource settings. To address this challenge, we propose an unsupervised or\nlabel-free framework for distillation, thus eliminating the requirement for\nlabeled data altogether. Through experimentation, we show that our\nbest-distilled models outperform the teacher model by 5-7 points in terms of\nWER. 
Additionally, our models are on par with or better than similar supervised\ndata filtering setup. When we scale the data, our models significantly\noutperform all zero-shot and supervised models. We demonstrate that it is\npossible to distill large Whisper models into relatively small models without\nusing any labeled data. Our distilled models are 25-50\\% more compute and\nmemory efficient while maintaining performance equal to or better than the\nteacher model.\n","authors":["Abdul Waheed","Karima Kadaoui","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2407.01257v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.02960v1","updated":"2024-07-03T09:54:08Z","published":"2024-07-03T09:54:08Z","title":"ObfuscaTune: Obfuscated Offsite Fine-tuning and Inference of Proprietary\n LLMs on Private Datasets","summary":" This work addresses the timely yet underexplored problem of performing\ninference and finetuning of a proprietary LLM owned by a model provider entity\non the confidential/private data of another data owner entity, in a way that\nensures the confidentiality of both the model and the data. Hereby, the\nfinetuning is conducted offsite, i.e., on the computation infrastructure of a\nthird-party cloud provider. We tackle this problem by proposing ObfuscaTune, a\nnovel, efficient and fully utility-preserving approach that combines a simple\nyet effective obfuscation technique with an efficient usage of confidential\ncomputing (only 5% of the model parameters are placed on TEE). We empirically\ndemonstrate the effectiveness of ObfuscaTune by validating it on GPT-2 models\nwith different sizes on four NLP benchmark datasets. Finally, we compare to a\nna\\\"ive version of our approach to highlight the necessity of using random\nmatrices with low condition numbers in our approach to reduce errors induced by\nthe obfuscation.\n","authors":["Ahmed Frikha","Nassim Walha","Ricardo Mendes","Krishna Kanth Nakka","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02960v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.07016v2","updated":"2024-07-03T09:53:27Z","published":"2024-06-11T07:16:34Z","title":"Delving into ChatGPT usage in academic writing through excess vocabulary","summary":" Recent large language models (LLMs) can generate and revise text with\nhuman-level performance, and have been widely commercialized in systems like\nChatGPT. These models come with clear limitations: they can produce inaccurate\ninformation, reinforce existing biases, and be easily misused. Yet, many\nscientists have been using them to assist their scholarly writing. How\nwide-spread is LLM usage in the academic literature currently? To answer this\nquestion, we use an unbiased, large-scale approach, free from any assumptions\non academic LLM usage. We study vocabulary changes in 14 million PubMed\nabstracts from 2010-2024, and show how the appearance of LLMs led to an abrupt\nincrease in the frequency of certain style words. Our analysis based on excess\nwords usage suggests that at least 10% of 2024 abstracts were processed with\nLLMs. This lower bound differed across disciplines, countries, and journals,\nand was as high as 30% for some PubMed sub-corpora. 
We show that the appearance\nof LLM-based writing assistants has had an unprecedented impact in the\nscientific literature, surpassing the effect of major world events such as the\nCovid pandemic.\n","authors":["Dmitry Kobak","Rita González-Márquez","Emőke-Ágnes Horvát","Jan Lause"],"pdf_url":"https://arxiv.org/pdf/2406.07016v2.pdf","comment":"v2: Updating dataset, figures and numbers to include all PubMed\n abstracts until end of June 2024"},{"id":"http://arxiv.org/abs/2407.02956v1","updated":"2024-07-03T09:49:03Z","published":"2024-07-03T09:49:03Z","title":"IncogniText: Privacy-enhancing Conditional Text Anonymization via\n LLM-based Private Attribute Randomization","summary":" In this work, we address the problem of text anonymization where the goal is\nto prevent adversaries from correctly inferring private attributes of the\nauthor, while keeping the text utility, i.e., meaning and semantics. We propose\nIncogniText, a technique that anonymizes the text to mislead a potential\nadversary into predicting a wrong private attribute value. Our empirical\nevaluation shows a reduction of private attribute leakage by more than 90%.\nFinally, we demonstrate the maturity of IncogniText for real-world applications\nby distilling its anonymization capability into a set of LoRA parameters\nassociated with an on-device model.\n","authors":["Ahmed Frikha","Nassim Walha","Krishna Kanth Nakka","Ricardo Mendes","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02956v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.02943v1","updated":"2024-07-03T09:20:04Z","published":"2024-07-03T09:20:04Z","title":"PII-Compass: Guiding LLM training data extraction prompts towards the\n target PII via grounding","summary":" The latest and most impactful advances in large models stem from their\nincreased size. Unfortunately, this translates into an improved memorization\ncapacity, raising data privacy concerns. Specifically, it has been shown that\nmodels can output personal identifiable information (PII) contained in their\ntraining data. However, reported PIII extraction performance varies widely, and\nthere is no consensus on the optimal methodology to evaluate this risk,\nresulting in underestimating realistic adversaries. In this work, we\nempirically demonstrate that it is possible to improve the extractability of\nPII by over ten-fold by grounding the prefix of the manually constructed\nextraction prompt with in-domain data. Our approach, PII-Compass, achieves\nphone number extraction rates of 0.92%, 3.9%, and 6.86% with 1, 128, and 2308\nqueries, respectively, i.e., the phone number of 1 person in 15 is extractable.\n","authors":["Krishna Kanth Nakka","Ahmed Frikha","Ricardo Mendes","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02943v1.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2406.11925v2","updated":"2024-07-03T09:16:27Z","published":"2024-06-17T08:34:57Z","title":"DocCGen: Document-based Controlled Code Generation","summary":" Recent developments show that Large Language Models (LLMs) produce\nstate-of-the-art performance on natural language (NL) to code generation for\nresource-rich general-purpose languages like C++, Java, and Python. However,\ntheir practical usage for structured domain-specific languages (DSLs) such as\nYAML, JSON is limited due to domain-specific schema, grammar, and\ncustomizations generally unseen by LLMs during pre-training. 
Efforts have been\nmade to mitigate this challenge via in-context learning through relevant\nexamples or by fine-tuning. However, it suffers from problems, such as limited\nDSL samples and prompt sensitivity but enterprises maintain good documentation\nof the DSLs. Therefore, we propose DocCGen, a framework that can leverage such\nrich knowledge by breaking the NL-to-Code generation task for structured code\nlanguages into a two-step process. First, it detects the correct libraries\nusing the library documentation that best matches the NL query. Then, it\nutilizes schema rules extracted from the documentation of these libraries to\nconstrain the decoding. We evaluate our framework for two complex structured\nlanguages, Ansible YAML and Bash command, consisting of two settings:\nOut-of-domain (OOD) and In-domain (ID). Our extensive experiments show that\nDocCGen consistently improves different-sized language models across all six\nevaluation metrics, reducing syntactic and semantic errors in structured code.\nWe plan to open-source the datasets and code to motivate research in\nconstrained code generation.\n","authors":["Sameer Pimparkhede","Mehant Kammakomati","Srikanth Tamilselvam","Prince Kumar","Ashok Pon Kumar","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2406.11925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02937v1","updated":"2024-07-03T09:12:53Z","published":"2024-07-03T09:12:53Z","title":"Probing the Feasibility of Multilingual Speaker Anonymization","summary":" In speaker anonymization, speech recordings are modified in a way that the\nidentity of the speaker remains hidden. While this technology could help to\nprotect the privacy of individuals around the globe, current research restricts\nthis by focusing almost exclusively on English data. In this study, we extend a\nstate-of-the-art anonymization system to nine languages by transforming\nlanguage-dependent components to their multilingual counterparts. Experiments\ntesting the robustness of the anonymized speech against privacy attacks and\nspeech deterioration show an overall success of this system for all languages.\nThe results suggest that speaker embeddings trained on English data can be\napplied across languages, and that the anonymization performance for a language\nis mainly affected by the quality of the speech synthesis component used for\nit.\n","authors":["Sarina Meyer","Florian Lux","Ngoc Thang Vu"],"pdf_url":"https://arxiv.org/pdf/2407.02937v1.pdf","comment":"accepted at Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.02936v1","updated":"2024-07-03T09:12:38Z","published":"2024-07-03T09:12:38Z","title":"GraCoRe: Benchmarking Graph Comprehension and Complex Reasoning in Large\n Language Models","summary":" Evaluating the graph comprehension and reasoning abilities of Large Language\nModels (LLMs) is challenging and often incomplete. Existing benchmarks focus\nprimarily on pure graph understanding, lacking a comprehensive evaluation\nacross all graph types and detailed capability definitions. This paper presents\nGraCoRe, a benchmark for systematically assessing LLMs' graph comprehension and\nreasoning. GraCoRe uses a three-tier hierarchical taxonomy to categorize and\ntest models on pure graph and heterogeneous graphs, subdividing capabilities\ninto 10 distinct areas tested through 19 tasks. Our benchmark includes 11\ndatasets with 5,140 graphs of varying complexity. 
We evaluated three\nclosed-source and seven open-source LLMs, conducting thorough analyses from\nboth ability and task perspectives. Key findings reveal that semantic\nenrichment enhances reasoning performance, node ordering impacts task success,\nand the ability to process longer texts does not necessarily improve graph\ncomprehension or reasoning. GraCoRe is open-sourced at\nhttps://github.com/ZIKEYUAN/GraCoRe\n","authors":["Zike Yuan","Ming Liu","Hui Wang","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2407.02936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02917v1","updated":"2024-07-03T08:49:18Z","published":"2024-07-03T08:49:18Z","title":"Towards Negotiative Dialogue for the Talkamatic Dialogue Manager","summary":" The paper describes a number of dialogue phenomena associated with\nnegotiative dialogue, as implemented in a development version of the Talkamatic\nDialogue Manager (TDM). This implementation is an initial step towards full\ncoverage of general features of negotiative dialogue in TDM.\n","authors":["Staffan Larsson","Alexander Berman","David Hjelm"],"pdf_url":"https://arxiv.org/pdf/2407.02917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12390v4","updated":"2024-07-03T08:44:45Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v4.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/,\n ECCV 2024"},{"id":"http://arxiv.org/abs/2403.07794v3","updated":"2024-07-03T08:18:44Z","published":"2024-03-12T16:33:30Z","title":"Fine-tuning Large Language Models with Sequential Instructions","summary":" Despite the success of existing instruction-tuned models, we find that they\nusually struggle to respond to queries with multiple instructions. This impairs\ntheir performance in complex problems whose solution consists of multiple\nintermediate tasks. Thus, we contend that part of the fine-tuning data mixture\nshould be sequential--containing a chain of interrelated tasks. 
We first\napproach sequential instruction tuning from a task-driven perspective, manually\ncreating interpretable intermediate tasks for multilingual and visual question\nanswering: namely \"translate then predict\" and \"caption then answer\". Next, we\nautomate this process by turning instructions in existing datasets (e.g.,\nAlpaca and FlanCoT) into diverse and complex sequential instructions, making\nour method general-purpose. Models that underwent our sequential instruction\ntuning show improved results in coding, maths, and open-ended generation.\nMoreover, we put forward a new benchmark named SeqEval to evaluate a model's\nability to follow all the instructions in a sequence, which further\ncorroborates the benefits of our fine-tuning method. We hope that our\nendeavours will open new research avenues on instruction tuning for complex\ntasks.\n","authors":["Hanxu Hu","Simon Yu","Pinzhen Chen","Edoardo M. Ponti"],"pdf_url":"https://arxiv.org/pdf/2403.07794v3.pdf","comment":"21pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.02894v1","updated":"2024-07-03T08:15:39Z","published":"2024-07-03T08:15:39Z","title":"Translatotron-V(ison): An End-to-End Model for In-Image Machine\n Translation","summary":" In-image machine translation (IIMT) aims to translate an image containing\ntexts in source language into an image containing translations in target\nlanguage. In this regard, conventional cascaded methods suffer from issues such\nas error propagation, massive parameters, and difficulties in deployment and\nretaining visual characteristics of the input image. Thus, constructing\nend-to-end models has become an option, which, however, faces two main\nchallenges: 1) the huge modeling burden, as it is required to simultaneously\nlearn alignment across languages and preserve the visual characteristics of the\ninput image; 2) the difficulties of directly predicting excessively lengthy\npixel sequences. In this paper, we propose \\textit{Translatotron-V(ision)}, an\nend-to-end IIMT model consisting of four modules. In addition to an image\nencoder, and an image decoder, our model contains a target text decoder and an\nimage tokenizer. Among them, the target text decoder is used to alleviate the\nlanguage alignment burden, and the image tokenizer converts long sequences of\npixels into shorter sequences of visual tokens, preventing the model from\nfocusing on low-level visual features. Besides, we present a two-stage training\nframework for our model to assist the model in learning alignment across\nmodalities and languages. Finally, we propose a location-aware evaluation\nmetric called Structure-BLEU to assess the translation quality of the generated\nimages. Experimental results demonstrate that our model achieves competitive\nperformance compared to cascaded models with only 70.9\\% of parameters, and\nsignificantly outperforms the pixel-level end-to-end IIMT model.\n","authors":["Zhibin Lan","Liqiang Niu","Fandong Meng","Jie Zhou","Min Zhang","Jinsong Su"],"pdf_url":"https://arxiv.org/pdf/2407.02894v1.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2407.02891v1","updated":"2024-07-03T08:08:01Z","published":"2024-07-03T08:08:01Z","title":"GPTQT: Quantize Large Language Models Twice to Push the Efficiency","summary":" Due to their large size, generative Large Language Models (LLMs) require\nsignificant computing and storage resources. 
This paper introduces a new\npost-training quantization method, GPTQT, to reduce memory usage and enhance\nprocessing speed by expressing the weight of LLM in 3bit/2bit. Practice has\nshown that minimizing the quantization error of weights is ineffective, leading\nto overfitting. Therefore, GPTQT employs a progressive two-step approach:\ninitially quantizing weights using Linear quantization to a relatively high\nbit, followed by converting obtained int weight to lower bit binary coding. A\nre-explore strategy is proposed to optimize initial scaling factor. During\ninference, these steps are merged into pure binary coding, enabling efficient\ncomputation. Testing across various models and datasets confirms GPTQT's\neffectiveness. Compared to the strong 3-bit quantization baseline, GPTQT\nfurther reduces perplexity by 4.01 on opt-66B and increases speed by 1.24 times\non opt-30b. The results on Llama2 show that GPTQT is currently the best binary\ncoding quantization method for such kind of LLMs.\n","authors":["Yipin Guo","Yilin Lang","Qinyuan Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02891v1.pdf","comment":"Accepted by 11th IEEE International Conference on Cybernetics and\n Intelligent Systems"},{"id":"http://arxiv.org/abs/2407.02885v1","updated":"2024-07-03T07:59:52Z","published":"2024-07-03T07:59:52Z","title":"CogErgLLM: Exploring Large Language Model Systems Design Perspective\n Using Cognitive Ergonomics","summary":" Integrating cognitive ergonomics with LLMs is essential for enhancing safety,\nreliability, and user satisfaction in human-AI interactions. Current LLM design\noften lacks this integration, leading to systems that may not fully align with\nhuman cognitive capabilities and limitations. Insufficient focus on\nincorporating cognitive science methods exacerbates biases in LLM outputs,\nwhile inconsistent application of user-centered design principles results in\nsub-optimal user experiences. To address these challenges, our position paper\nexplores the critical integration of cognitive ergonomics principles into LLM\ndesign, aiming to provide a comprehensive framework and practical guidelines\nfor ethical LLM development. Through our contributions, we seek to advance\nunderstanding and practice in integrating cognitive ergonomics into LLM\nsystems, fostering safer, more reliable, and ethically sound human-AI\ninteractions.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2407.02885v1.pdf","comment":"8 Page, 3 Figures. Accepted to Large Language Models and Cognition @\n ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM)"},{"id":"http://arxiv.org/abs/2407.02883v1","updated":"2024-07-03T07:58:20Z","published":"2024-07-03T07:58:20Z","title":"CoIR: A Comprehensive Benchmark for Code Information Retrieval Models","summary":" Despite the substantial success of Information Retrieval (IR) in various NLP\ntasks, most IR systems predominantly handle queries and corpora in natural\nlanguage, neglecting the domain of code retrieval. Code retrieval is critically\nimportant yet remains under-explored, with existing methods and benchmarks\ninadequately representing the diversity of code in various domains and tasks.\nAddressing this gap, we present \\textbf{\\name} (\\textbf{Co}de\n\\textbf{I}nformation \\textbf{R}etrieval Benchmark), a robust and comprehensive\nbenchmark specifically designed to assess code retrieval capabilities. 
\\name\ncomprises \\textbf{ten} meticulously curated code datasets, spanning\n\\textbf{eight} distinctive retrieval tasks across \\textbf{seven} diverse\ndomains. We first discuss the construction of \\name and its diverse dataset\ncomposition. Further, we evaluate nine widely used retrieval models using\n\\name, uncovering significant difficulties in performing code retrieval tasks\neven with state-of-the-art systems. To facilitate easy adoption and integration\nwithin existing research workflows, \\name has been developed as a user-friendly\nPython framework, readily installable via pip. It shares same data schema as\nother popular benchmarks like MTEB and BEIR, enabling seamless cross-benchmark\nevaluations. Through \\name, we aim to invigorate research in the code retrieval\ndomain, providing a versatile benchmarking tool that encourages further\ndevelopment and exploration of code retrieval systems\\footnote{\\url{\nhttps://github.com/CoIR-team/coir}}.\n","authors":["Xiangyang Li","Kuicai Dong","Yi Quan Lee","Wei Xia","Yichun Yin","Hao Zhang","Yong Liu","Yasheng Wang","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2407.02883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02867v1","updated":"2024-07-03T07:31:33Z","published":"2024-07-03T07:31:33Z","title":"Contrast then Memorize: Semantic Neighbor Retrieval-Enhanced Inductive\n Multimodal Knowledge Graph Completion","summary":" A large number of studies have emerged for Multimodal Knowledge Graph\nCompletion (MKGC) to predict the missing links in MKGs. However, fewer studies\nhave been proposed to study the inductive MKGC (IMKGC) involving emerging\nentities unseen during training. Existing inductive approaches focus on\nlearning textual entity representations, which neglect rich semantic\ninformation in visual modality. Moreover, they focus on aggregating structural\nneighbors from existing KGs, which of emerging entities are usually limited.\nHowever, the semantic neighbors are decoupled from the topology linkage and\nusually imply the true target entity. In this paper, we propose the IMKGC task\nand a semantic neighbor retrieval-enhanced IMKGC framework CMR, where the\ncontrast brings the helpful semantic neighbors close, and then the memorize\nsupports semantic neighbor retrieval to enhance inference. Specifically, we\nfirst propose a unified cross-modal contrastive learning to simultaneously\ncapture the textual-visual and textual-textual correlations of query-entity\npairs in a unified representation space. The contrastive learning increases the\nsimilarity of positive query-entity pairs, therefore making the representations\nof helpful semantic neighbors close. Then, we explicitly memorize the knowledge\nrepresentations to support the semantic neighbor retrieval. At test time, we\nretrieve the nearest semantic neighbors and interpolate them to the\nquery-entity similarity distribution to augment the final prediction. 
Extensive\nexperiments validate the effectiveness of CMR on three inductive MKGC datasets.\nCodes are available at https://github.com/OreOZhao/CMR.\n","authors":["Yu Zhao","Ying Zhang","Baohang Zhou","Xinying Qian","Kehui Song","Xiangrui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.02867v1.pdf","comment":"Accepted by SIGIR 2024"},{"id":"http://arxiv.org/abs/2407.02855v1","updated":"2024-07-03T07:14:05Z","published":"2024-07-03T07:14:05Z","title":"Safe Unlearning: A Surprisingly Effective and Generalizable Solution to\n Defend Against Jailbreak Attacks","summary":" LLMs are known to be vulnerable to jailbreak attacks, even after safety\nalignment. An important observation is that, while different types of jailbreak\nattacks can generate significantly different queries, they mostly result in\nsimilar responses that are rooted in the same harmful knowledge (e.g., detailed\nsteps to make a bomb). Therefore, we conjecture that directly unlearn the\nharmful knowledge in the LLM can be a more effective way to defend against\njailbreak attacks than the mainstream supervised fine-tuning (SFT) based\napproaches. Our extensive experiments confirmed our insight and suggested\nsurprising generalizability of our unlearning-based approach: using only 20 raw\nharmful questions \\emph{without} any jailbreak prompt during training, our\nsolution reduced the Attack Success Rate (ASR) in Vicuna-7B on\n\\emph{out-of-distribution} (OOD) harmful questions wrapped with various complex\njailbreak prompts from 82.6\\% to 7.7\\%. This significantly outperforms\nLlama2-7B-Chat, which is fine-tuned on about 0.1M safety alignment samples but\nstill has an ASR of 21.9\\% even under the help of an additional safety system\nprompt. Further analysis reveals that the generalization ability of our\nsolution stems from the intrinsic relatedness among harmful responses across\nharmful questions (e.g., response patterns, shared steps and actions, and\nsimilarity among their learned representations in the LLM). Our code is\navailable at \\url{https://github.com/thu-coai/SafeUnlearning}.\n","authors":["Zhexin Zhang","Junxiao Yang","Pei Ke","Shiyao Cui","Chujie Zheng","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2407.02855v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2407.02854v1","updated":"2024-07-03T07:12:36Z","published":"2024-07-03T07:12:36Z","title":"Universal Gloss-level Representation for Gloss-free Sign Language\n Translation and Production","summary":" Sign language, essential for the deaf and hard-of-hearing, presents unique\nchallenges in translation and production due to its multimodal nature and the\ninherent ambiguity in mapping sign language motion to spoken language words.\nPrevious methods often rely on gloss annotations, requiring time-intensive\nlabor and specialized expertise in sign language. Gloss-free methods have\nemerged to address these limitations, but they often depend on external sign\nlanguage data or dictionaries, failing to completely eliminate the need for\ngloss annotations. There is a clear demand for a comprehensive approach that\ncan supplant gloss annotations and be utilized for both Sign Language\nTranslation (SLT) and Sign Language Production (SLP). We introduce Universal\nGloss-level Representation (UniGloR), a unified and self-supervised solution\nfor both SLT and SLP, trained on multiple datasets including PHOENIX14T,\nHow2Sign, and NIASL2021. Our results demonstrate UniGloR's effectiveness in the\ntranslation and production tasks. 
We further report an encouraging result for\nthe Sign Language Recognition (SLR) on previously unseen data. Our study\nsuggests that self-supervised learning can be made in a unified manner, paving\nthe way for innovative and practical applications in future research.\n","authors":["Eui Jun Hwang","Sukmin Cho","Huije Lee","Youngwoo Yoon","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.02854v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.00072v2","updated":"2024-07-03T06:54:16Z","published":"2024-06-21T08:52:11Z","title":"Pistis-RAG: A Scalable Cascading Framework Towards Content-Centric\n Retrieval-Augmented Generation","summary":" In Greek mythology, Pistis symbolized good faith, trust, and reliability.\nDrawing inspiration from these principles, Pistis-RAG is a scalable multi-stage\nframework designed to address the challenges of large-scale retrieval-augmented\ngeneration (RAG) systems. This framework consists of distinct stages: matching,\npre-ranking, ranking, reasoning, and aggregating. Each stage contributes to\nnarrowing the search space, prioritizing semantically relevant documents,\naligning with the large language model's (LLM) preferences, supporting complex\nchain-of-thought (CoT) methods, and combining information from multiple\nsources.\n Our ranking stage introduces a significant innovation by recognizing that\nsemantic relevance alone may not lead to improved generation quality, due to\nthe sensitivity of the few-shot prompt order, as noted in previous research.\nThis critical aspect is often overlooked in current RAG frameworks.\n We argue that the alignment issue between LLMs and external knowledge ranking\nmethods is tied to the model-centric paradigm dominant in RAG systems. We\npropose a content-centric approach, emphasizing seamless integration between\nLLMs and external information sources to optimize content transformation for\nspecific tasks.\n Our novel ranking stage is designed specifically for RAG systems,\nincorporating principles of information retrieval while considering the unique\nbusiness scenarios reflected in LLM preferences and user feedback. We simulated\nfeedback signals on the MMLU benchmark, resulting in a 9.3% performance\nimprovement. Our model and code will be open-sourced on GitHub. Additionally,\nexperiments on real-world, large-scale data validate the scalability of our\nframework.\n","authors":["Yu Bai","Yukai Miao","Li Chen","Dan Li","Yanyu Ren","Hongtao Xie","Ce Yang","Xuhui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.00072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16029v5","updated":"2024-07-03T06:39:59Z","published":"2024-02-25T08:41:32Z","title":"GraphWiz: An Instruction-Following Language Model for Graph Problems","summary":" Large language models (LLMs) have achieved impressive success across several\nfields, but their proficiency in understanding and resolving complex graph\nproblems is less explored. To bridge this gap, we introduce GraphInstruct, a\nnovel and comprehensive instruction-tuning dataset designed to equip language\nmodels with the ability to tackle a broad spectrum of graph problems using\nexplicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an\nopen-source language model capable of resolving various graph problem types\nwhile generating clear reasoning processes. To enhance the model's capability\nand reliability, we incorporate the Direct Preference Optimization (DPO)\nframework into the graph problem-solving context. 
The enhanced model,\nGraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with\ndifferent complexity levels, surpassing GPT-4 which has an average accuracy of\n43.8%. Moreover, our research delves into the delicate balance between training\ndata volume and model performance, highlighting the potential for overfitting\nwith increased data. We also explore the transferability of the model's\nreasoning ability across different graph tasks, indicating the model's\nadaptability and practical application potential. Our investigation offers a\nnew blueprint and valuable insights for developing LLMs specialized in graph\nreasoning and problem-solving.\n","authors":["Nuo Chen","Yuhan Li","Jianheng Tang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2402.16029v5.pdf","comment":"27pages, 15 tables"},{"id":"http://arxiv.org/abs/2407.02842v1","updated":"2024-07-03T06:39:18Z","published":"2024-07-03T06:39:18Z","title":"MindBench: A Comprehensive Benchmark for Mind Map Structure Recognition\n and Analysis","summary":" Multimodal Large Language Models (MLLM) have made significant progress in the\nfield of document analysis. Despite this, existing benchmarks typically focus\nonly on extracting text and simple layout information, neglecting the complex\ninteractions between elements in structured documents such as mind maps and\nflowcharts. To address this issue, we introduce the new benchmark named\nMindBench, which not only includes meticulously constructed bilingual authentic\nor synthetic images, detailed annotations, evaluation metrics and baseline\nmodels, but also specifically designs five types of structured understanding\nand parsing tasks. These tasks include full parsing, partial parsing,\nposition-related parsing, structured Visual Question Answering (VQA), and\nposition-related VQA, covering key areas such as text recognition, spatial\nawareness, relationship discernment, and structured parsing. Extensive\nexperimental results demonstrate the substantial potential and significant room\nfor improvement in current models' ability to handle structured document\ninformation. We anticipate that the launch of MindBench will significantly\nadvance research and application development in structured document analysis\ntechnology. MindBench is available at:\nhttps://miasanlei.github.io/MindBench.github.io/.\n","authors":["Lei Chen","Feng Yan","Yujie Zhong","Shaoxiang Chen","Zequn Jie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2407.02842v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2407.02837v1","updated":"2024-07-03T06:32:03Z","published":"2024-07-03T06:32:03Z","title":"Comparing Feature-based and Context-aware Approaches to PII\n Generalization Level Prediction","summary":" Protecting Personal Identifiable Information (PII) in text data is crucial\nfor privacy, but current PII generalization methods face challenges such as\nuneven data distributions and limited context awareness. To address these\nissues, we propose two approaches: a feature-based method using machine\nlearning to improve performance on structured inputs, and a novel context-aware\nframework that considers the broader context and semantic relationships between\nthe original text and generalized candidates. The context-aware approach\nemploys Multilingual-BERT for text representation, functional transformations,\nand mean squared error scoring to evaluate candidates. 
Experiments on the\nWikiReplace dataset demonstrate the effectiveness of both methods, with the\ncontext-aware approach outperforming the feature-based one across different\nscales. This work contributes to advancing PII generalization techniques by\nhighlighting the importance of feature selection, ensemble learning, and\nincorporating contextual information for better privacy protection in text\nanonymization.\n","authors":["Kailin Zhang","Xinying Qiu"],"pdf_url":"https://arxiv.org/pdf/2407.02837v1.pdf","comment":"Accepted to IALP 2024"},{"id":"http://arxiv.org/abs/2407.02834v1","updated":"2024-07-03T06:21:07Z","published":"2024-07-03T06:21:07Z","title":"Aspect-Based Sentiment Analysis Techniques: A Comparative Study","summary":" Since the dawn of the digitalisation era, customer feedback and online\nreviews are unequivocally major sources of insights for businesses.\nConsequently, conducting comparative analyses of such sources has become the de\nfacto modus operandi of any business that wishes to give itself a competitive\nedge over its peers and improve customer loyalty. Sentiment analysis is one\nsuch method instrumental in gauging public interest, exposing market trends,\nand analysing competitors. While traditional sentiment analysis focuses on\noverall sentiment, as the needs advance with time, it has become important to\nexplore public opinions and sentiments on various specific subjects, products\nand services mentioned in the reviews on a finer-granular level. To this end,\nAspect-based Sentiment Analysis (ABSA), supported by advances in Artificial\nIntelligence (AI) techniques which have contributed to a paradigm shift from\nsimple word-level analysis to tone and context-aware analyses, focuses on\nidentifying specific aspects within the text and determining the sentiment\nassociated with each aspect. In this study, we compare several deep-NN methods\nfor ABSA on two benchmark datasets (Restaurant14 and Laptop-14) and found that\nFAST LSA obtains the best overall results of 87.6% and 82.6% accuracy but does\nnot pass LSA+DeBERTa which reports 90.33% and 86.21% accuracy respectively.\n","authors":["Dineth Jayakody","Koshila Isuranda","A V A Malkith","Nisansa de Silva","Sachintha Rajith Ponnamperuma","G G N Sandamali","K L K Sudheera"],"pdf_url":"https://arxiv.org/pdf/2407.02834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02833v1","updated":"2024-07-03T06:20:31Z","published":"2024-07-03T06:20:31Z","title":"LANE: Logic Alignment of Non-tuning Large Language Models and Online\n Recommendation Systems for Explainable Reason Generation","summary":" The explainability of recommendation systems is crucial for enhancing user\ntrust and satisfaction. Leveraging large language models (LLMs) offers new\nopportunities for comprehensive recommendation logic generation. However, in\nexisting related studies, fine-tuning LLM models for recommendation tasks\nincurs high computational costs and alignment issues with existing systems,\nlimiting the application potential of proven proprietary/closed-source LLM\nmodels, such as GPT-4. In this work, our proposed effective strategy LANE\naligns LLMs with online recommendation systems without additional LLMs tuning,\nreducing costs and improving explainability. This innovative approach addresses\nkey challenges in integrating language models with recommendation systems while\nfully utilizing the capabilities of powerful proprietary models. 
Specifically,\nour strategy operates through several key components: semantic embedding, user\nmulti-preference extraction using zero-shot prompting, semantic alignment, and\nexplainable recommendation generation using Chain of Thought (CoT) prompting.\nBy embedding item titles instead of IDs and utilizing multi-head attention\nmechanisms, our approach aligns the semantic features of user preferences with\nthose of candidate items, ensuring coherent and user-aligned recommendations.\nSufficient experimental results including performance comparison, questionnaire\nvoting, and visualization cases prove that our method can not only ensure\nrecommendation performance, but also provide easy-to-understand and reasonable\nrecommendation logic.\n","authors":["Hongke Zhao","Songming Zheng","Likang Wu","Bowen Yu","Jing Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12736v2","updated":"2024-07-03T06:13:31Z","published":"2023-12-20T03:18:50Z","title":"Learning and Forgetting Unsafe Examples in Large Language Models","summary":" As the number of large language models (LLMs) released to the public grows,\nthere is a pressing need to understand the safety implications associated with\nthese models learning from third-party custom finetuning data. We explore the\nbehavior of LLMs finetuned on noisy custom data containing unsafe content,\nrepresented by datasets that contain biases, toxicity, and harmfulness, finding\nthat while aligned LLMs can readily learn this unsafe content, they also tend\nto forget it more significantly than other examples when subsequently finetuned\non safer content. Drawing inspiration from the discrepancies in forgetting, we\nintroduce the \"ForgetFilter\" algorithm, which filters unsafe data based on how\nstrong the model's forgetting signal is for that data. We demonstrate that the\nForgetFilter algorithm ensures safety in customized finetuning without\ncompromising downstream task performance, unlike sequential safety finetuning.\nForgetFilter outperforms alternative strategies like replay and moral\nself-correction in curbing LLMs' ability to assimilate unsafe content during\ncustom finetuning, e.g. 75% lower than not applying any safety measures and 62%\nlower than using self-correction in toxicity score.\n","authors":["Jiachen Zhao","Zhun Deng","David Madras","James Zou","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.12736v2.pdf","comment":"accepted by ICML 24"},{"id":"http://arxiv.org/abs/2402.13516v4","updated":"2024-07-03T05:56:49Z","published":"2024-02-21T03:58:49Z","title":"ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity\n within Large Language Models","summary":" Activation sparsity refers to the existence of considerable\nweakly-contributed elements among activation outputs. As a prevalent property\nof the models using the ReLU activation function, activation sparsity has been\nproven a promising paradigm to boost model inference efficiency. Nevertheless,\nmost large language models (LLMs) adopt activation functions without intrinsic\nactivation sparsity (e.g., GELU and Swish). Some recent efforts have explored\nintroducing ReLU or its variants as the substitutive activation function to\nhelp LLMs achieve activation sparsity and inference acceleration, but few can\nsimultaneously obtain high sparsity and comparable model performance. 
This\npaper introduces a simple and effective sparsification method named \"ProSparse\"\nto push LLMs for higher activation sparsity while maintaining comparable\nperformance. Specifically, after substituting the activation function of LLMs\nwith ReLU, ProSparse adopts progressive sparsity regularization with a factor\nsmoothly increasing along the multi-stage sine curves. This can enhance\nactivation sparsity and mitigate performance degradation by avoiding radical\nshifts in activation distributions. With ProSparse, we obtain high sparsity of\n89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size\nMiniCPM-1B, respectively, achieving comparable performance to their original\nSwish-activated versions. These present the most sparsely activated models\namong open-source LLaMA versions and competitive end-size models, considerably\nsurpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). Our inference\nacceleration experiments further demonstrate the significant practical\nacceleration potential of LLMs with higher activation sparsity, obtaining up to\n4.52$\\times$ inference speedup.\n","authors":["Chenyang Song","Xu Han","Zhengyan Zhang","Shengding Hu","Xiyu Shi","Kuai Li","Chen Chen","Zhiyuan Liu","Guangli Li","Tao Yang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.13516v4.pdf","comment":"19 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.02820v1","updated":"2024-07-03T05:42:20Z","published":"2024-07-03T05:42:20Z","title":"Investigating the Contextualised Word Embedding Dimensions Responsible\n for Contextual and Temporal Semantic Changes","summary":" Words change their meaning over time as well as in different contexts. The\nsense-aware contextualised word embeddings (SCWEs) such as the ones produced by\nXL-LEXEME by fine-tuning masked language models (MLMs) on Word-in-Context (WiC)\ndata attempt to encode such semantic changes of words within the contextualised\nword embedding (CWE) spaces. Despite the superior performance of SCWEs in\ncontextual/temporal semantic change detection (SCD) benchmarks, it remains\nunclear as to how the meaning changes are encoded in the embedding space. To\nstudy this, we compare pre-trained CWEs and their fine-tuned versions on\ncontextual and temporal semantic change benchmarks under Principal Component\nAnalysis (PCA) and Independent Component Analysis (ICA) transformations. Our\nexperimental results reveal several novel insights such as (a) although there\nexist a smaller number of axes that are responsible for semantic changes of\nwords in the pre-trained CWE space, this information gets distributed across\nall dimensions when fine-tuned, and (b) in contrast to prior work studying the\ngeometry of CWEs, we find that PCA better represents semantic changes than\nICA. Source code is available at https://github.com/LivNLP/svp-dims .\n","authors":["Taichi Aida","Danushka Bollegala"],"pdf_url":"https://arxiv.org/pdf/2407.02820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02819v1","updated":"2024-07-03T05:40:41Z","published":"2024-07-03T05:40:41Z","title":"Efficient Training of Language Models with Compact and Consistent Next\n Token Distributions","summary":" Maximizing the likelihood of the next token is an established, statistically\nsound objective for pre-training language models. In this paper we show that we\ncan train better models faster by pre-aggregating the corpus with a collapsed\n$n$-gram distribution.
Previous studies have proposed corpus-level $n$-gram\nstatistics as a regularizer; however, the construction and querying of such\n$n$-grams, if done naively, prove to be costly and significantly impede\ntraining speed, thereby limiting their application in modern large language\nmodel pre-training.\n We introduce an alternative compact representation of the next token\ndistribution that, in expectation, aligns with the complete $n$-gram\ndistribution while markedly reducing variance across mini-batches compared to\nthe standard next-token loss. Empirically, we demonstrate that both the\n$n$-gram regularized model and our approximation yield substantial improvements\nin model quality and convergence rate compared to existing methods.\nFurthermore, our approximation facilitates scalability of gains to larger\ndatasets and models compared to the straightforward $n$-gram regularization\nmethod.\n","authors":["Ashutosh Sathe","Sunita Sarawagi"],"pdf_url":"https://arxiv.org/pdf/2407.02819v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2405.15585v2","updated":"2024-07-03T05:26:24Z","published":"2024-05-24T14:13:54Z","title":"Synergizing In-context Learning with Hints for End-to-end Task-oriented\n Dialog Systems","summary":" End-to-end Task-Oriented Dialog (TOD) systems typically require extensive\ntraining datasets to perform well. In contrast, large language model (LLM)\nbased TOD systems can excel even with limited data due to their ability to\nlearn tasks through in-context exemplars. However, these models lack alignment\nwith the style of responses in training data and often generate comprehensive\nresponses, making it difficult for users to grasp the information quickly. In\nresponse, we propose SyncTOD that synergizes LLMs with task-specific hints to\nimprove alignment in low-data settings. SyncTOD employs small auxiliary models\nto provide hints and select exemplars for in-context prompts. With ChatGPT,\nSyncTOD achieves superior performance compared to LLM-based baselines and SoTA\nmodels in low-data settings, while retaining competitive performance in\nfull-data settings.\n","authors":["Vishal Vivek Saley","Rocktim Jyoti Das","Dinesh Raghu"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2405.15585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15997v3","updated":"2024-07-03T05:21:02Z","published":"2023-12-26T11:01:36Z","title":"Aligning Large Language Models with Human Preferences through\n Representation Engineering","summary":" Aligning large language models (LLMs) with human preferences is crucial for\nenhancing their utility in terms of helpfulness, truthfulness, safety,\nharmlessness, and interestingness. Existing methods for achieving this\nalignment often involves employing reinforcement learning from human feedback\n(RLHF) to fine-tune LLMs based on human labels assessing the relative quality\nof model responses. Nevertheless, RLHF is susceptible to instability during\nfine-tuning and presents challenges in implementation.Drawing inspiration from\nthe emerging field of representation engineering (RepE), this study aims to\nidentify relevant representations for high-level human preferences embedded in\npatterns of activity within an LLM, and achieve precise control of model\nbehavior by transforming its representations. 
This novel approach, denoted as\nRepresentation Alignment from Human Feedback (RAHF), proves to be effective,\ncomputationally efficient, and easy to implement.Extensive experiments\ndemonstrate the efficacy of RAHF in not only capturing but also manipulating\nrepresentations to align with a broad spectrum of human preferences or values,\nrather than being confined to a singular concept or function (e.g. honesty or\nbias). RAHF's versatility in accommodating diverse human preferences shows its\npotential for advancing LLM performance.\n","authors":["Wenhao Liu","Xiaohua Wang","Muling Wu","Tianlong Li","Changze Lv","Zixuan Ling","Jianhao Zhu","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2312.15997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02814v1","updated":"2024-07-03T05:19:45Z","published":"2024-07-03T05:19:45Z","title":"Images Speak Louder than Words: Understanding and Mitigating Bias in\n Vision-Language Model from a Causal Mediation Perspective","summary":" Vision-language models (VLMs) pre-trained on extensive datasets can\ninadvertently learn biases by correlating gender information with specific\nobjects or scenarios. Current methods, which focus on modifying inputs and\nmonitoring changes in the model's output probability scores, often struggle to\ncomprehensively understand bias from the perspective of model components. We\npropose a framework that incorporates causal mediation analysis to measure and\nmap the pathways of bias generation and propagation within VLMs. This approach\nallows us to identify the direct effects of interventions on model bias and the\nindirect effects of interventions on bias mediated through different model\ncomponents. Our results show that image features are the primary contributors\nto bias, with significantly higher impacts than text features, specifically\naccounting for 32.57% and 12.63% of the bias in the MSCOCO and PASCAL-SENTENCE\ndatasets, respectively. Notably, the image encoder's contribution surpasses\nthat of the text encoder and the deep fusion encoder. Further experimentation\nconfirms that contributions from both language and vision modalities are\naligned and non-conflicting. Consequently, focusing on blurring gender\nrepresentations within the image encoder, which contributes most to the model\nbias, reduces bias efficiently by 22.03% and 9.04% in the MSCOCO and\nPASCAL-SENTENCE datasets, respectively, with minimal performance loss or\nincreased computational demands.\n","authors":["Zhaotian Weng","Zijun Gao","Jerone Andrews","Jieyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.02814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07437v2","updated":"2024-07-03T04:59:32Z","published":"2024-05-13T02:33:25Z","title":"Evaluation of Retrieval-Augmented Generation: A Survey","summary":" Retrieval-Augmented Generation (RAG) has recently gained traction in natural\nlanguage processing. Numerous studies and real-world applications are\nleveraging its ability to enhance generative models through external\ninformation retrieval. Evaluating these RAG systems, however, poses unique\nchallenges due to their hybrid structure and reliance on dynamic knowledge\nsources. To better understand these challenges, we conduct A Unified Evaluation\nProcess of RAG (Auepora) and aim to provide a comprehensive overview of the\nevaluation and benchmarks of RAG systems. 
Specifically, we examine and compare\nseveral quantifiable metrics of the Retrieval and Generation components, such\nas relevance, accuracy, and faithfulness, within the current RAG benchmarks,\nencompassing the possible output and ground truth pairs. We then analyze the\nvarious datasets and metrics, discuss the limitations of current benchmarks,\nand suggest potential directions to advance the field of RAG benchmarks.\n","authors":["Hao Yu","Aoran Gan","Kai Zhang","Shiwei Tong","Qi Liu","Zhaofeng Liu"],"pdf_url":"https://arxiv.org/pdf/2405.07437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03898v2","updated":"2024-07-03T04:57:41Z","published":"2024-02-06T11:10:35Z","title":"DistiLLM: Towards Streamlined Distillation for Large Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\na smaller student model, reducing its inference cost and memory footprint while\npreserving model capabilities. However, current KD methods for auto-regressive\nsequence models (e.g., large language models) suffer from missing a\nstandardized objective function. Moreover, the recent use of student-generated\noutputs to address training-inference mismatches has significantly escalated\ncomputational costs. To tackle these issues, we introduce DistiLLM, a more\neffective and efficient KD framework for auto-regressive language models.\nDistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence\nloss, where we unveil and leverage its theoretical properties, and (2) an\nadaptive off-policy approach designed to enhance the efficiency in utilizing\nstudent-generated outputs. Extensive experiments, including\ninstruction-following tasks, demonstrate the effectiveness of DistiLLM in\nbuilding high-performing student models while achieving up to 4.3$\\times$\nspeedup compared to recent KD methods.\n","authors":["Jongwoo Ko","Sungnyun Kim","Tianyi Chen","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2402.03898v2.pdf","comment":"ICML 2024; Code is available at https://github.com/jongwooko/distillm"},{"id":"http://arxiv.org/abs/2402.10671v3","updated":"2024-07-03T04:45:29Z","published":"2024-02-16T13:24:05Z","title":"Decomposition for Enhancing Attention: Improving LLM-based Text-to-SQL\n through Workflow Paradigm","summary":" In-context learning of large-language models (LLMs) has achieved remarkable\nsuccess in the field of natural language processing, while extensive case\nstudies reveal that the single-step chain-of-thought prompting approach faces\nchallenges such as attention diffusion and inadequate performance in complex\ntasks like text-to-SQL. To improve the contextual learning capabilities of LLMs\nin text-to-SQL, a workflow paradigm method is proposed, aiming to enhance the\nattention and problem-solving scope of LLMs through decomposition.\nSpecifically, the information determination module for eliminating redundant\ninformation and the brand-new prompt structure based on problem classification\ngreatly enhance the model's attention. Additionally, the inclusion of\nself-correction and active learning modules greatly expands the problem-solving\nscope of LLMs, hence improving the upper limit of LLM-based approaches.\nExtensive experiments conducted on three datasets demonstrate that our approach\noutperforms other methods by a significant margin. 
About 2-3 percentage point\nimprovements compared to the existing baseline on the Spider Dev,\nSpider-Realistic, and Bird Dev datasets and new SOTA results on the Spider Test\ndataset are achieved. Our code is available on GitHub:\n\\url{https://github.com/FlyingFeather/DEA-SQL}.\n","authors":["Yuanzhen Xie","Xinzhou Jin","Tao Xie","MingXiong Lin","Liang Chen","Chenyun Yu","Lei Cheng","ChengXiang Zhuo","Bo Hu","Zang Li"],"pdf_url":"https://arxiv.org/pdf/2402.10671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02762v2","updated":"2024-07-03T04:34:03Z","published":"2023-07-06T04:05:44Z","title":"PRD: Peer Rank and Discussion Improve Large Language Model based\n Evaluations","summary":" Nowadays, the quality of responses generated by different modern large\nlanguage models (LLMs) is hard to evaluate and compare automatically. Recent\nstudies suggest and predominantly use LLMs for reference-free evaluation of\nopen-ended question answering. More specifically, they use the recognized\n\"strongest\" LLM as the evaluator, which conducts pairwise comparisons of\ncandidate models' answers and provides a ranking score. However, this intuitive\nmethod has multiple problems, such as bringing in self-enhancement (favoring\nits own answers) and positional bias. We draw insights and lessons from the\neducational domain (Cho & MacArthur, 2011; Walsh, 2014) to improve LLM-based\nevaluations. Specifically, we propose (1) the peer rank (PR) algorithm that\ntakes into account each peer LLM's pairwise preferences of all answer pairs,\nand outputs a final ranking of models; and (2) peer discussion (PD), where we\nprompt two LLMs to discuss and try to reach a mutual agreement on the\npreferences of two answers. We conduct experiments on two benchmark datasets.\nWe find that our approaches achieve higher accuracy and align better with human\njudgments. Interestingly, PR can induce a relatively accurate self-ranking of\nmodels under the anonymous setting, where each model's name is unrevealed. Our\nwork provides space to explore evaluating models that are hard to compare for\nhumans.\n","authors":["Ruosen Li","Teerth Patel","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2307.02762v2.pdf","comment":"Accepted by TMLR"},{"id":"http://arxiv.org/abs/2403.16437v2","updated":"2024-07-03T04:10:43Z","published":"2024-03-25T05:37:16Z","title":"Reasoning Runtime Behavior of a Program with LLM: How Far Are We?","summary":" Large language models for code (i.e., code LLMs) have shown strong code\nunderstanding and generation capabilities. To evaluate the capabilities of code\nLLMs in various aspects, many benchmarks have been proposed (e.g., HumanEval\nand ClassEval). Code reasoning is one of the most essential abilities of code\nLLMs, but existing benchmarks for code reasoning are not sufficient. Typically,\nthey focus on predicting the input and output of a program, ignoring the\nevaluation of the intermediate behavior during program execution, as well as\nthe logical consistency (e.g., the model should not give the correct output if\nthe prediction of execution path is wrong) when performing the reasoning. To\naddress these problems, in this paper, we propose a framework, namely REval,\nfor evaluating code reasoning abilities and consistency of code LLMs with\nprogram execution. We utilize existing code benchmarks and adapt them to new\nbenchmarks within our framework. 
A large-scale empirical study is conducted and\nmost LLMs show unsatisfactory performance on both Runtime Behavior Reasoning\n(i.e., an average accuracy of 44.4%) and Incremental Consistency Evaluation\n(i.e., an average IC score of 10.3). Evaluation results of current code LLMs\nreflect the urgent need for the community to strengthen the code reasoning\ncapability of code LLMs. Our code, data, and \\newname leaderboard are available\nat https://r-eval.github.io.\n","authors":["Junkai Chen","Zhiyuan Pan","Xing Hu","Zhenhao Li","Ge Li","Xin Xia"],"pdf_url":"https://arxiv.org/pdf/2403.16437v2.pdf","comment":"Accepted by ICSE 2025 and this is our preprint version. Our REval\n leaderboard is available at https://r-eval.github.io"},{"id":"http://arxiv.org/abs/2406.16851v2","updated":"2024-07-03T03:55:59Z","published":"2024-06-24T17:58:03Z","title":"Losing Visual Needles in Image Haystacks: Vision Language Models are\n Easily Distracted in Short and Long Contexts","summary":" We present LoCoVQA, a dynamic benchmark generator for evaluating long-context\nextractive reasoning in vision language models (VLMs). LoCoVQA augments test\nexamples for mathematical reasoning, VQA, and character recognition tasks with\nincreasingly long visual contexts composed of both in-distribution and\nout-of-distribution distractor images.\n Across these tasks, a diverse set of VLMs rapidly lose performance as the\nvisual context length grows, often exhibiting a striking logarithmic decay\ntrend. This test assesses how well VLMs can ignore irrelevant information when\nanswering queries -- a task that is quite easy for language models (LMs) in the\ntext domain -- demonstrating that current state-of-the-art VLMs lack this\nessential capability for many long-context applications.\n","authors":["Aditya Sharma","Michael Saxon","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2406.16851v2.pdf","comment":"Under review. Minor errata correction in revision"},{"id":"http://arxiv.org/abs/2401.14267v2","updated":"2024-07-03T03:41:07Z","published":"2024-01-25T16:01:49Z","title":"Transformers and Cortical Waves: Encoders for Pulling In Context Across\n Time","summary":" The capabilities of transformer networks such as ChatGPT and other Large\nLanguage Models (LLMs) have captured the world's attention. The crucial\ncomputational mechanism underlying their performance relies on transforming a\ncomplete input sequence - for example, all the words in a sentence - into a\nlong \"encoding vector\" that allows transformers to learn long-range temporal\ndependencies in naturalistic sequences. Specifically, \"self-attention\" applied\nto this encoding vector enhances temporal context in transformers by computing\nassociations between pairs of words in the input sequence. We suggest that\nwaves of neural activity traveling across single cortical areas or multiple\nregions at the whole-brain scale could implement a similar encoding principle.\nBy encapsulating recent input history into a single spatial pattern at each\nmoment in time, cortical waves may enable temporal context to be extracted from\nsequences of sensory inputs, the same computational principle used in\ntransformers.\n","authors":["Lyle Muller","Patricia S. Churchland","Terrence J. 
Sejnowski"],"pdf_url":"https://arxiv.org/pdf/2401.14267v2.pdf","comment":"25 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.01046v2","updated":"2024-07-03T03:37:53Z","published":"2024-07-01T07:56:14Z","title":"FRoG: Evaluating Fuzzy Reasoning of Generalized Quantifiers in Large\n Language Models","summary":" Fuzzy reasoning is vital due to the frequent use of imprecise information in\ndaily contexts. However, the ability of current large language models (LLMs) to\nhandle such reasoning remains largely uncharted. In this paper, we introduce a\nnew benchmark, FRoG, for fuzzy reasoning, featuring real-world mathematical\nword problems that incorporate generalized quantifiers. Our experimental\nfindings reveal that fuzzy reasoning continues to pose significant challenges\nfor LLMs. Moreover, we find that existing methods designed to enhance reasoning\ndo not consistently improve performance in tasks involving fuzzy logic.\nAdditionally, our results show an inverse scaling effect in the performance of\nLLMs on FRoG. Interestingly, we also demonstrate that strong mathematical\nreasoning skills are not necessarily indicative of success on our benchmark.\n","authors":["Yiyuan Li","Shichao Sun","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.01046v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.02783v1","updated":"2024-07-03T03:21:02Z","published":"2024-07-03T03:21:02Z","title":"52B to 1T: Lessons Learned via Tele-FLM Series","summary":" Large Language Models (LLMs) represent a significant stride toward Artificial\nGeneral Intelligence. As scaling laws underscore the potential of increasing\nmodel sizes, the academic community has intensified its investigations into\nLLMs with capacities exceeding 50 billion parameters. This technical report\nbuilds on our prior work with Tele-FLM (also known as FLM-2), a publicly\navailable 52-billion-parameter model. We delve into two primary areas: we first\ndiscuss our observation of Supervised Fine-tuning (SFT) on Tele-FLM-52B, which\nsupports the \"less is more\" approach for SFT data construction; second, we\ndemonstrate our experiments and analyses on the best practices for\nprogressively growing a model from 52 billion to 102 billion, and subsequently\nto 1 trillion parameters. We will open-source a 1T model checkpoint, namely\nTele-FLM-1T, to advance further training and research.\n","authors":["Xiang Li","Yiqun Yao","Xin Jiang","Xuezhi Fang","Chao Wang","Xinzhang Liu","Zihan Wang","Yu Zhao","Xin Wang","Yuyao Huang","Shuangyong Song","Yongxiang Li","Zheng Zhang","Bo Zhao","Aixin Sun","Yequan Wang","Zhongjiang He","Zhongyuan Wang","Xuelong Li","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2407.02783v1.pdf","comment":"For the Tele-FLM-52B tech report, see also 2404.16645"},{"id":"http://arxiv.org/abs/2407.02776v1","updated":"2024-07-03T03:06:37Z","published":"2024-07-03T03:06:37Z","title":"A Framework for Quantum Finite-State Languages with Density Mapping","summary":" A quantum finite-state automaton (QFA) is a theoretical model designed to\nsimulate the evolution of a quantum system with finite memory in response to\nsequential input strings. We define the language of a QFA as the set of strings\nthat lead the QFA to an accepting state when processed from its initial state.\nQFAs exemplify how quantum computing can achieve greater efficiency compared to\nclassical computing. 
While being one of the simplest quantum models, QFAs are\nstill notably challenging to construct from scratch due to the preliminary\nknowledge of quantum mechanics required for superimposing unitary constraints\non the automata. Furthermore, even when QFAs are correctly assembled, the\nlimitations of a current quantum computer may cause fluctuations in the\nsimulation results depending on how an assembled QFA is translated into a\nquantum circuit.\n We present a framework that provides a simple and intuitive way to build QFAs\nand maximize the simulation accuracy. Our framework relies on two methods:\nFirst, it offers a predefined construction for foundational types of QFAs that\nrecognize special languages MOD and EQU. They play the role of basic building\nblocks for more complex QFAs. In other words, one can obtain more complex QFAs\nfrom these foundational automata using standard language operations. Second, we\nimprove the simulation accuracy by converting these QFAs into quantum circuits\nsuch that the resulting circuits perform well on noisy quantum computers.\n Our framework is available at https://github.com/sybaik1/qfa-toolkit.\n","authors":["SeungYeop Baik","Sicheol Sung","Yo-Sub Han"],"pdf_url":"https://arxiv.org/pdf/2407.02776v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.02775v1","updated":"2024-07-03T03:03:30Z","published":"2024-07-03T03:03:30Z","title":"MLKD-BERT: Multi-level Knowledge Distillation for Pre-trained Language\n Models","summary":" Knowledge distillation is an effective technique for pre-trained language\nmodel compression. Although existing knowledge distillation methods perform\nwell for the most typical model BERT, they could be further improved in two\naspects: the relation-level knowledge could be further explored to improve\nmodel performance; and the setting of student attention head number could be\nmore flexible to decrease inference time. Therefore, we are motivated to\npropose a novel knowledge distillation method MLKD-BERT to distill multi-level\nknowledge in a teacher-student framework. Extensive experiments on GLUE benchmark\nand extractive question answering tasks demonstrate that our method outperforms\nstate-of-the-art knowledge distillation methods on BERT. In addition, MLKD-BERT\ncan flexibly set student attention head number, allowing for substantial\ninference time decrease with little performance drop.\n","authors":["Ying Zhang","Ziheng Yang","Shufan Ji"],"pdf_url":"https://arxiv.org/pdf/2407.02775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02772v1","updated":"2024-07-03T03:01:43Z","published":"2024-07-03T03:01:43Z","title":"Automatic gradient descent with generalized Newton's method","summary":" We propose the generalized Newton's method (GeN) -- a Hessian-informed\napproach that applies to any optimizer such as SGD and Adam, and covers the\nNewton-Raphson method as a sub-case. Our method automatically and dynamically\nselects the learning rate that accelerates the convergence, without the\nintensive tuning of the learning rate scheduler. In practice, our method is\neasily implementable, since it only requires additional forward passes with\nalmost zero computational overhead (in terms of training time and memory cost),\nif the overhead is amortized over many iterations. We present extensive\nexperiments on language and vision tasks (e.g. GPT and ResNet) to showcase that\nGeN optimizers match the state-of-the-art performance, which was achieved with\ncarefully tuned learning rate schedulers.
Code to be released at\n\\url{https://github.com/ShiyunXu/AutoGeN}.\n","authors":["Zhiqi Bu","Shiyun Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15623v3","updated":"2024-07-03T02:59:32Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocess lacks transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nrequire annotations or additional training data. The injection of the extracted\nknowledge can be achieved by the addition of simple neural modules. We employ\nthe Convex Polytopic Model (CPM) as a feature extraction tool for DST tasks and\nillustrate that the acquired features correlate with syntactic and semantic\npatterns in the dialogues. This correlation facilitates a comprehensive\nunderstanding of the linguistic features influencing the DST model's\ndecision-making process. We benchmark this framework on various DST tasks and\nobserve a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v3.pdf","comment":"Accepted for publication at IEEE Access"},{"id":"http://arxiv.org/abs/2401.09967v3","updated":"2024-07-03T02:55:37Z","published":"2024-01-18T13:31:24Z","title":"Sketch-Guided Constrained Decoding for Boosting Blackbox Large Language\n Models without Logit Access","summary":" Constrained decoding, a technique for enforcing constraints on language model\noutputs, offers a way to control text generation without retraining or\narchitectural modifications. Its application is, however, typically restricted\nto models that give users access to next-token distributions (usually via\nsoftmax logits), which poses a limitation with blackbox large language models\n(LLMs). This paper introduces sketch-guided constrained decoding (SGCD), a\nnovel approach to constrained decoding for blackbox LLMs, which operates\nwithout access to the logits of the blackbox LLM. SGCD utilizes a locally\nhosted auxiliary model to refine the output of an unconstrained blackbox LLM,\neffectively treating this initial output as a \"sketch\" for further elaboration.\nThis approach is complementary to traditional logit-based techniques and\nenables the application of constrained decoding in settings where full model\ntransparency is unavailable. We demonstrate the efficacy of SGCD through\nexperiments in closed information extraction and constituency parsing, showing\nhow it enhances the utility and flexibility of blackbox LLMs for complex NLP\ntasks.\n","authors":["Saibo Geng","Berkay Döner","Chris Wendler","Martin Josifoski","Robert West"],"pdf_url":"https://arxiv.org/pdf/2401.09967v3.pdf","comment":"Accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2407.01964v2","updated":"2024-07-03T02:25:23Z","published":"2024-07-02T05:43:15Z","title":"Enabling Discriminative Reasoning in LLMs for Legal Judgment Prediction","summary":" Legal judgment prediction is essential for enhancing judicial efficiency. 
In\nthis work, we identify that existing large language models (LLMs) underperform\nin this domain due to challenges in understanding case complexities and\ndistinguishing between similar charges. To adapt LLMs for effective legal\njudgment prediction, we introduce the Ask-Discriminate-Predict (ADAPT)\nreasoning framework inspired by human judicial reasoning. ADAPT involves\ndecomposing case facts, discriminating among potential charges, and predicting\nthe final judgment. We further enhance LLMs through fine-tuning with multi-task\nsynthetic trajectories to improve legal judgment prediction accuracy and\nefficiency under our ADAPT framework. Extensive experiments conducted on two\nwidely-used datasets demonstrate the superior performance of our framework in\nlegal judgment prediction, particularly when dealing with complex and confusing\ncharges.\n","authors":["Chenlong Deng","Kelong Mao","Yuyao Zhang","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2407.01964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02751v1","updated":"2024-07-03T01:56:00Z","published":"2024-07-03T01:56:00Z","title":"Emotion and Intent Joint Understanding in Multimodal Conversation: A\n Benchmarking Dataset","summary":" Emotion and Intent Joint Understanding in Multimodal Conversation (MC-EIU)\naims to decode the semantic information manifested in a multimodal\nconversational history, while inferring the emotions and intents simultaneously\nfor the current utterance. MC-EIU is enabling technology for many\nhuman-computer interfaces. However, there is a lack of available datasets in\nterms of annotation, modality, language diversity, and accessibility. In this\nwork, we propose an MC-EIU dataset, which features 7 emotion categories, 9\nintent categories, 3 modalities, i.e., textual, acoustic, and visual content,\nand two languages, i.e., English and Mandarin. Furthermore, it is completely\nopen-source for free access. To our knowledge, MC-EIU is the first\ncomprehensive and rich emotion and intent joint understanding dataset for\nmultimodal conversation. Together with the release of the dataset, we also\ndevelop an Emotion and Intent Interaction (EI$^2$) network as a reference\nsystem by modeling the deep correlation between emotion and intent in the\nmultimodal conversation. With comparative experiments and ablation studies, we\ndemonstrate the effectiveness of the proposed EI$^2$ method on the MC-EIU\ndataset. The dataset and codes will be made available at:\nhttps://github.com/MC-EIU/MC-EIU.\n","authors":["Rui Liu","Haolin Zuo","Zheng Lian","Xiaofen Xing","Björn W. Schuller","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2407.02751v1.pdf","comment":"26 pages, 8 figures, 12 tables, NeurIPS 2024 Dataset and Benchmark\n Track"},{"id":"http://arxiv.org/abs/2407.02750v1","updated":"2024-07-03T01:51:50Z","published":"2024-07-03T01:51:50Z","title":"Learning to Reduce: Towards Improving Performance of Large Language\n Models on Structured Data","summary":" Large Language Models (LLMs) have been achieving competent performance on a\nwide range of downstream tasks, yet existing work shows that inference on\nstructured data is challenging for LLMs. This is because LLMs need to either\nunderstand long structured data or select the most relevant evidence before\ninference, and both approaches are not trivial. This paper proposes a\nframework, Learning to Reduce, that fine-tunes a language model with On-Policy\nLearning to generate a reduced version of an input structured data. 
When\ncompared to state-of-the-art LLMs like GPT-4, Learning to Reduce not only\nachieves outstanding performance in reducing the input, but shows\ngeneralizability on different datasets. We further show that the model\nfine-tuned with our framework helps LLMs better perform on table QA tasks\nespecially when the context is longer.\n","authors":["Younghun Lee","Sungchul Kim","Ryan A. Rossi","Tong Yu","Xiang Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02750v1.pdf","comment":"ICML 2024 Workshop on Long-Context Foundation Models, Vienna, Austria\n 2024. arXiv admin note: substantial text overlap with arXiv:2402.14195"},{"id":"http://arxiv.org/abs/2305.17401v4","updated":"2024-07-03T01:46:32Z","published":"2023-05-27T07:59:49Z","title":"A Framework For Refining Text Classification and Object Recognition from\n Academic Articles","summary":" With the widespread use of the internet, it has become increasingly crucial\nto extract specific information from vast amounts of academic articles\nefficiently. Data mining techniques are generally employed to solve this issue.\nHowever, data mining for academic articles is challenging since it requires\nautomatically extracting specific patterns in complex and unstructured layout\ndocuments. Current data mining methods for academic articles employ\nrule-based(RB) or machine learning(ML) approaches. However, using rule-based\nmethods incurs a high coding cost for complex typesetting articles. On the\nother hand, simply using machine learning methods requires annotation work for\ncomplex content types within the paper, which can be costly. Furthermore, only\nusing machine learning can lead to cases where patterns easily recognized by\nrule-based methods are mistakenly extracted. To overcome these issues, from the\nperspective of analyzing the standard layout and typesetting used in the\nspecified publication, we emphasize implementing specific methods for specific\ncharacteristics in academic articles. We have developed a novel Text Block\nRefinement Framework (TBRF), a machine learning and rule-based scheme hybrid.\nWe used the well-known ACL proceeding articles as experimental data for the\nvalidation experiment. The experiment shows that our approach achieved over 95%\nclassification accuracy and 90% detection accuracy for tables and figures.\n","authors":["Jinghong Li","Koichi Ota","Wen Gu","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2305.17401v4.pdf","comment":"This paper has been accepted at 'The International Symposium on\n Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)'"},{"id":"http://arxiv.org/abs/2402.04854v4","updated":"2024-07-03T01:34:52Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":" Research surveys have always posed a challenge for beginner researchers who\nlack of research training. These researchers struggle to understand the\ndirections within their research topic, and the discovery of new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs(KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. 
Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may fail to grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v4.pdf","comment":"This paper has been accepted by 'The 18TH International Conference on\n INnovations in Intelligent SysTems and Applications (INISTA 2024)'"},{"id":"http://arxiv.org/abs/2311.05661v3","updated":"2024-07-03T01:29:20Z","published":"2023-11-09T08:00:32Z","title":"Prompt Engineering a Prompt Engineer","summary":" Prompt engineering is a challenging yet crucial task for optimizing the\nperformance of large language models on customized tasks. It requires complex\nreasoning to examine the model's errors, hypothesize what is missing or\nmisleading in the current prompt, and communicate the task with clarity. While\nrecent works indicate that large language models can be meta-prompted to\nperform automatic prompt engineering, we argue that their potential is limited\ndue to insufficient guidance for complex reasoning in the meta-prompt. We fill\nthis gap by infusing into the meta-prompt three key components: detailed\ndescriptions, context specification, and a step-by-step reasoning template. The\nresulting method, named PE2, exhibits remarkable versatility across diverse\nlanguage tasks. It finds prompts that outperform \"let's think step by step\" by\n6.3% on MultiArith and 3.1% on GSM8K, and outperforms competitive baselines on\ncounterfactual tasks by 6.9%. Further, we show that PE2 can make targeted and\nhighly specific prompt edits, rectify erroneous prompts, and induce multi-step\nplans for complex tasks.\n","authors":["Qinyuan Ye","Maxamed Axmed","Reid Pryzant","Fereshte Khani"],"pdf_url":"https://arxiv.org/pdf/2311.05661v3.pdf","comment":"Accepted to ACL 2024 Findings. Camera-ready version"},{"id":"http://arxiv.org/abs/2407.02742v1","updated":"2024-07-03T01:28:51Z","published":"2024-07-03T01:28:51Z","title":"A Comparative Study of DSL Code Generation: Fine-Tuning vs. Optimized\n Retrieval Augmentation","summary":" Natural Language to Code Generation has made significant progress in recent\nyears with the advent of Large Language Models (LLMs). While generation for\ngeneral-purpose languages like C, C++, and Python has improved significantly,\nLLMs struggle with custom function names in Domain Specific Languages or DSLs.\nThis leads to higher hallucination rates and syntax errors, especially for DSLs\nhaving a high number of custom function names. Additionally, constant updates\nto function names add to the challenge as LLMs need to stay up-to-date. In this\npaper, we present optimizations for using Retrieval Augmented Generation (or\nRAG) with LLMs for DSL generation along with an ablation study comparing these\nstrategies. We generated a train as well as a test dataset with a DSL to\nrepresent automation tasks across roughly 700 APIs in the public domain. We used\nthe training dataset to fine-tune a Codex model for this DSL.
Our results\nshowed that the fine-tuned model scored the best on the code similarity metric.\nWith our RAG optimizations, we achieved parity on the similarity metric. The\ncompilation rate, however, showed that both the models still got the syntax\nwrong many times, with the RAG-based method being 2 pts better. Conversely,\nthe hallucination rate for the RAG model lagged by 1 pt for API names and by 2 pts for\nAPI parameter keys. We conclude that an optimized RAG model can match the\nquality of fine-tuned models and offer advantages for new, unseen APIs.\n","authors":["Nastaran Bassamzadeh","Chhaya Methani"],"pdf_url":"https://arxiv.org/pdf/2407.02742v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2407.02736v1","updated":"2024-07-03T01:19:38Z","published":"2024-07-03T01:19:38Z","title":"MentalAgora: A Gateway to Advanced Personalized Care in Mental Health\n through Multi-Agent Debating and Attribute Control","summary":" As mental health issues globally escalate, there is a tremendous need for\nadvanced digital support systems. We introduce MentalAgora, a novel framework\nemploying large language models enhanced by interaction between multiple agents\nfor tailored mental health support. This framework operates through three\nstages: strategic debating, tailored counselor creation, and response\ngeneration, enabling the dynamic customization of responses based on individual\nuser preferences and therapeutic needs. We conduct experiments utilizing a\nhigh-quality evaluation dataset TherapyTalk crafted with mental health\nprofessionals, showing that MentalAgora generates expert-aligned and user\npreference-enhanced responses. Our evaluations, including experiments and user\nstudies, demonstrate that MentalAgora aligns with professional standards and\neffectively meets user preferences, setting a new benchmark for digital mental\nhealth interventions.\n","authors":["Yeonji Lee","Sangjun Park","Kyunghyun Cho","JinYeong Bak"],"pdf_url":"https://arxiv.org/pdf/2407.02736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12420v2","updated":"2024-07-03T00:49:28Z","published":"2022-05-25T00:19:59Z","title":"Learning Action Conditions from Instructional Manuals for Instruction\n Understanding","summary":" The ability to infer pre- and postconditions of an action is vital for\ncomprehending complex instructions, and is essential for applications such as\nautonomous instruction-guided agents and assistive AI that supports humans to\nperform physical tasks. In this work, we propose a task dubbed action condition\ninference, and collect a high-quality, human-annotated dataset of\npreconditions and postconditions of actions in instructional manuals. We\npropose a weakly supervised approach to automatically construct large-scale\ntraining instances from online instructional manuals, and curate a densely\nhuman-annotated and validated dataset to study how well the current NLP models\ncan infer action-condition dependencies in the instruction texts. We design two\ntypes of models that differ by whether contextualized and global information is\nleveraged, as well as various combinations of heuristics to construct the weak\nsupervisions.
Our experimental results show a >20% F1-score improvement with\nconsidering the entire instruction contexts and a >6% F1-score benefit with the\nproposed heuristics.\n","authors":["Te-Lin Wu","Caiqi Zhang","Qingyuan Hu","Alex Spangher","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2205.12420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02723v1","updated":"2024-07-03T00:32:28Z","published":"2024-07-03T00:32:28Z","title":"e-Health CSIRO at \"Discharge Me!\" 2024: Generating Discharge Summary\n Sections with Fine-tuned Language Models","summary":" Clinical documentation is an important aspect of clinicians' daily work and\noften demands a significant amount of time. The BioNLP 2024 Shared Task on\nStreamlining Discharge Documentation (Discharge Me!) aims to alleviate this\ndocumentation burden by automatically generating discharge summary sections,\nincluding brief hospital course and discharge instruction, which are often\ntime-consuming to synthesize and write manually. We approach the generation\ntask by fine-tuning multiple open-sourced language models (LMs), including both\ndecoder-only and encoder-decoder LMs, with various configurations on input\ncontext. We also examine different setups for decoding algorithms, model\nensembling or merging, and model specialization. Our results show that\nconditioning on the content of discharge summary prior to the target sections\nis effective for the generation task. Furthermore, we find that smaller\nencoder-decoder LMs can work as well or even slightly better than larger\ndecoder based LMs fine-tuned through LoRA. The model checkpoints from our team\n(aehrc) are openly available.\n","authors":["Jinghui Liu","Aaron Nicolson","Jason Dowling","Bevan Koopman","Anthony Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.02723v1.pdf","comment":"BioNLP @ ACL 2024"},{"id":"http://arxiv.org/abs/2407.02719v1","updated":"2024-07-03T00:00:21Z","published":"2024-07-03T00:00:21Z","title":"Boosting Biomedical Concept Extraction by Rule-Based Data Augmentation","summary":" Document-level biomedical concept extraction is the task of identifying\nbiomedical concepts mentioned in a given document. Recent advancements have\nadapted pre-trained language models for this task. However, the scarcity of\ndomain-specific data and the deviation of concepts from their canonical names\noften hinder these models' effectiveness. To tackle this issue, we employ\nMetaMapLite, an existing rule-based concept mapping system, to generate\nadditional pseudo-annotated data from PubMed and PMC. The annotated data are\nused to augment the limited training data. Through extensive experiments, this\nstudy demonstrates the utility of a manually crafted concept mapping tool for\ntraining a better concept extraction model.\n","authors":["Qiwei Shao","Fengran Mo","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2407.02719v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.03320v1","updated":"2024-07-03T17:59:21Z","published":"2024-07-03T17:59:21Z","title":"InternLM-XComposer-2.5: A Versatile Large Vision Language Model\n Supporting Long-Contextual Input and Output","summary":" We present InternLM-XComposer-2.5 (IXC-2.5), a versatile large-vision\nlanguage model that supports long-contextual input and output. IXC-2.5 excels\nin various text-image comprehension and composition applications, achieving\nGPT-4V level capabilities with merely 7B LLM backend. 
Trained with 24K\ninterleaved image-text contexts, it can seamlessly extend to 96K long contexts\nvia RoPE extrapolation. This long-context capability allows IXC-2.5 to excel in\ntasks requiring extensive input and output contexts. Compared to its previous\n2.0 version, InternLM-XComposer-2.5 features three major upgrades in\nvision-language comprehension: (1) Ultra-High Resolution Understanding, (2)\nFine-Grained Video Understanding, and (3) Multi-Turn Multi-Image Dialogue. In\naddition to comprehension, IXC-2.5 extends to two compelling applications using\nextra LoRA parameters for text-image composition: (1) Crafting Webpages and (2)\nComposing High-Quality Text-Image Articles. IXC-2.5 has been evaluated on 28\nbenchmarks, outperforming existing open-source state-of-the-art models on 16\nbenchmarks. It also surpasses or competes closely with GPT-4V and Gemini Pro on\n16 key tasks. The InternLM-XComposer-2.5 is publicly available at\nhttps://github.com/InternLM/InternLM-XComposer.\n","authors":["Pan Zhang","Xiaoyi Dong","Yuhang Zang","Yuhang Cao","Rui Qian","Lin Chen","Qipeng Guo","Haodong Duan","Bin Wang","Linke Ouyang","Songyang Zhang","Wenwei Zhang","Yining Li","Yang Gao","Peng Sun","Xinyue Zhang","Wei Li","Jingwen Li","Wenhai Wang","Hang Yan","Conghui He","Xingcheng Zhang","Kai Chen","Jifeng Dai","Yu Qiao","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03320v1.pdf","comment":"Technical Report. https://github.com/InternLM/InternLM-XComposer"},{"id":"http://arxiv.org/abs/2312.00589v2","updated":"2024-07-03T17:58:12Z","published":"2023-11-30T17:57:34Z","title":"Merlin:Empowering Multimodal LLMs with Foresight Minds","summary":" Humans possess the remarkable ability to foresee the future to a certain\nextent based on present observations, a skill we term as foresight minds.\nHowever, this capability remains largely under explored within existing\nMultimodal Large Language Models (MLLMs), hindering their capacity to learn the\nfundamental principles of how things operate and the intentions behind the\nobserved subjects. To address this issue, we introduce the integration of\nfuture modeling into the existing learning frameworks of MLLMs. By utilizing\nthe subject trajectory, a highly structured representation of a consecutive\nframe sequence, as a learning objective, we aim to bridge the gap between the\npast and the future. We propose two innovative methods to empower MLLMs with\nforesight minds, Foresight Pre-Training (FPT) and Foresight Instruction-Tuning\n(FIT), which are inspired by the modern learning paradigm of LLMs.\nSpecifically, FPT jointly training various tasks centered on trajectories,\nenabling MLLMs to learn how to attend and predict entire trajectories from a\ngiven initial observation. Then, FIT requires MLLMs to first predict\ntrajectories of related objects and then reason about potential future events\nbased on them. Aided by FPT and FIT, we build a novel and unified MLLM named\nMerlin that supports multi-images input and analysis about potential actions of\nmultiple objects for the future reasoning. Experimental results show Merlin\npowerful foresight minds with impressive performance on both future reasoning\nand visual comprehension tasks.\n","authors":["En Yu","Liang Zhao","Yana Wei","Jinrong Yang","Dongming Wu","Lingyu Kong","Haoran Wei","Tiancai Wang","Zheng Ge","Xiangyu Zhang","Wenbing Tao"],"pdf_url":"https://arxiv.org/pdf/2312.00589v2.pdf","comment":"Accepted by ECCV2024. 
Project page: https://ahnsun.github.io/merlin"},{"id":"http://arxiv.org/abs/2407.03314v1","updated":"2024-07-03T17:55:27Z","published":"2024-07-03T17:55:27Z","title":"BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate\n Hallucinations","summary":" This paper presents Bag-of-Concept Graph (BACON) to gift models with limited\nlinguistic abilities to taste the privilege of Vision Language Models (VLMs)\nand boost downstream tasks such as detection, visual question answering (VQA),\nand image generation. Since the visual scenes in physical worlds are structured\nwith complex relations between objects, BACON breaks down annotations into\nbasic minimum elements and presents them in a graph structure. Element-wise\nstyle enables easy understanding, and structural composition liberates\ndifficult locating. Careful prompt design births the BACON captions with the\nhelp of public-available VLMs and segmentation methods. In this way, we gather\na dataset with 100K annotated images, which endow VLMs with remarkable\ncapabilities, such as accurately generating BACON, transforming prompts into\nBACON format, envisioning scenarios in the style of BACONr, and dynamically\nmodifying elements within BACON through interactive dialogue and more. Wide\nrepresentative experiments, including detection, VQA, and image generation\ntasks, tell BACON as a lifeline to achieve previous out-of-reach tasks or excel\nin their current cutting-edge solutions.\n","authors":["Zhantao Yang","Ruili Feng","Keyu Yan","Huangji Wang","Zhicai Wang","Shangwen Zhu","Han Zhang","Jie Xiao","Pingyu Wu","Kai Zhu","Jixuan Chen","Chen-Wei Xie","Chaojie Mao","Yue Yang","Hongyang Zhang","Yu Liu","Fan Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.03314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03307v1","updated":"2024-07-03T17:49:31Z","published":"2024-07-03T17:49:31Z","title":"HoloHisto: End-to-end Gigapixel WSI Segmentation with 4K Resolution\n Sequential Tokenization","summary":" In digital pathology, the traditional method for deep learning-based image\nsegmentation typically involves a two-stage process: initially segmenting\nhigh-resolution whole slide images (WSI) into smaller patches (e.g., 256x256,\n512x512, 1024x1024) and subsequently reconstructing them to their original\nscale. This method often struggles to capture the complex details and vast\nscope of WSIs. In this paper, we propose the holistic histopathology\n(HoloHisto) segmentation method to achieve end-to-end segmentation on gigapixel\nWSIs, whose maximum resolution is above 80,000$\\times$70,000 pixels. HoloHisto\nfundamentally shifts the paradigm of WSI segmentation to an end-to-end learning\nfashion with 1) a large (4K) resolution base patch for elevated visual\ninformation inclusion and efficient processing, and 2) a novel sequential\ntokenization mechanism to properly model the contextual relationships and\nefficiently model the rich information from the 4K input. To our best\nknowledge, HoloHisto presents the first holistic approach for gigapixel\nresolution WSI segmentation, supporting direct I/O of complete WSI and their\ncorresponding gigapixel masks. Under the HoloHisto platform, we unveil a random\n4K sampler that transcends ultra-high resolution, delivering 31 and 10 times\nmore pixels than standard 2D and 3D patches, respectively, for advancing\ncomputational capabilities. 
To facilitate efficient 4K resolution dense\nprediction, we leverage sequential tokenization, utilizing a pre-trained image\ntokenizer to group image features into a discrete token grid. To assess the\nperformance, our team curated a new kidney pathology image segmentation (KPIs)\ndataset with WSI-level glomeruli segmentation from whole mouse kidneys. From\nthe results, HoloHisto-4K delivers remarkable performance gains over previous\nstate-of-the-art models.\n","authors":["Yucheng Tang","Yufan He","Vishwesh Nath","Pengfeig Guo","Ruining Deng","Tianyuan Yao","Quan Liu","Can Cui","Mengmeng Yin","Ziyue Xu","Holger Roth","Daguang Xu","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2407.03307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03305v1","updated":"2024-07-03T17:47:59Z","published":"2024-07-03T17:47:59Z","title":"Smart City Surveillance Unveiling Indian Person Attributes in Real Time","summary":" This project focuses on creating a smart surveillance system for Indian\ncities that can identify and analyze people's attributes in real time. Using\nadvanced technologies like artificial intelligence and machine learning, the\nsystem can recognize attributes such as upper body color, what the person is\nwearing, accessories they are wearing, headgear, etc., and analyze behavior\nthrough cameras installed around the city.\n","authors":["Shubham Kale","Shashank Sharma","Abhilash Khuntia"],"pdf_url":"https://arxiv.org/pdf/2407.03305v1.pdf","comment":"6 pages , 8 figure"},{"id":"http://arxiv.org/abs/2407.03300v1","updated":"2024-07-03T17:42:46Z","published":"2024-07-03T17:42:46Z","title":"DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents","summary":" Diffusion models (DMs) have revolutionized generative learning. They utilize\na diffusion process to encode data into a simple Gaussian distribution.\nHowever, encoding a complex, potentially multimodal data distribution into a\nsingle continuous Gaussian distribution arguably represents an unnecessarily\nchallenging learning problem. We propose Discrete-Continuous Latent Variable\nDiffusion Models (DisCo-Diff) to simplify this task by introducing\ncomplementary discrete latent variables. We augment DMs with learnable discrete\nlatents, inferred with an encoder, and train DM and encoder end-to-end.\nDisCo-Diff does not rely on pre-trained networks, making the framework\nuniversally applicable. The discrete latents significantly simplify learning\nthe DM's complex noise-to-data mapping by reducing the curvature of the DM's\ngenerative ODE. An additional autoregressive transformer models the\ndistribution of the discrete latents, a simple step because DisCo-Diff requires\nonly few discrete variables with small codebooks. We validate DisCo-Diff on toy\ndata, several image synthesis tasks as well as molecular docking, and find that\nintroducing discrete latents consistently improves model performance. 
For\nexample, DisCo-Diff achieves state-of-the-art FID scores on class-conditioned\nImageNet-64/128 datasets with ODE sampler.\n","authors":["Yilun Xu","Gabriele Corso","Tommi Jaakkola","Arash Vahdat","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2407.03300v1.pdf","comment":"project page: https://research.nvidia.com/labs/lpr/disco-diff"},{"id":"http://arxiv.org/abs/2303.15975v3","updated":"2024-07-03T17:36:19Z","published":"2023-03-28T13:47:16Z","title":"Large-scale Pre-trained Models are Surprisingly Strong in Incremental\n Novel Class Discovery","summary":" Discovering novel concepts in unlabelled datasets and in a continuous manner\nis an important desideratum of lifelong learners. In the literature such\nproblems have been partially addressed under very restricted settings, where\nnovel classes are learned by jointly accessing a related labelled set (e.g.,\nNCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD).\nIn this work we challenge the status quo in class-iNCD and propose a learning\nparadigm where class discovery occurs continuously and truly unsupervisedly,\nwithout needing any related labelled set. In detail, we propose to exploit the\nricher priors from strong self-supervised pre-trained models (PTM). To this\nend, we propose simple baselines, composed of a frozen PTM backbone and a\nlearnable linear classifier, that are not only simple to implement but also\nresilient under longer learning scenarios. We conduct extensive empirical\nevaluation on a multitude of benchmarks and show the effectiveness of our\nproposed baselines when compared with sophisticated state-of-the-art methods.\nThe code is open source.\n","authors":["Mingxuan Liu","Subhankar Roy","Zhun Zhong","Nicu Sebe","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.15975v3.pdf","comment":"Accepted as a conference paper to ICPR 2024"},{"id":"http://arxiv.org/abs/2407.03297v1","updated":"2024-07-03T17:34:55Z","published":"2024-07-03T17:34:55Z","title":"Improved Noise Schedule for Diffusion Training","summary":" Diffusion models have emerged as the de facto choice for generating visual\nsignals. However, training a single model to predict noise across various\nlevels poses significant challenges, necessitating numerous iterations and\nincurring significant computational costs. Various approaches, such as loss\nweighting strategy design and architectural refinements, have been introduced\nto expedite convergence. In this study, we propose a novel approach to design\nthe noise schedule for enhancing the training of diffusion models. Our key\ninsight is that the importance sampling of the logarithm of the Signal-to-Noise\nratio (logSNR), theoretically equivalent to a modified noise schedule, is\nparticularly beneficial for training efficiency when increasing the sample\nfrequency around $\\log \\text{SNR}=0$. 
We empirically demonstrate the\nsuperiority of our noise schedule over the standard cosine schedule.\nFurthermore, we highlight the advantages of our noise schedule design on the\nImageNet benchmark, showing that the designed schedule consistently benefits\ndifferent prediction targets.\n","authors":["Tiankai Hang","Shuyang Gu"],"pdf_url":"https://arxiv.org/pdf/2407.03297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15232v2","updated":"2024-07-03T17:30:53Z","published":"2024-05-24T05:46:04Z","title":"DEEM: Diffusion Models Serve as the Eyes of Large Language Models for\n Image Perception","summary":" The development of large language models (LLMs) has significantly advanced\nthe emergence of large multimodal models (LMMs). While LMMs have achieved\ntremendous success by promoting the synergy between multimodal comprehension\nand creation, they often face challenges when confronted with\nout-of-distribution data. This is primarily due to their reliance on image\nencoders trained to encode images into task-relevant features, which may lead\nthem to disregard irrelevant details. Delving into the modeling capabilities of\ndiffusion models for images naturally prompts the question: Can diffusion\nmodels serve as the eyes of large language models for image perception? In this\npaper, we propose DEEM, a simple and effective approach that utilizes the\ngenerative feedback of diffusion models to align the semantic distributions of\nthe image encoder. This addresses the drawbacks of previous methods that solely\nrelied on image encoders like ViT, thereby enhancing the model's resilience\nagainst out-of-distribution samples and reducing visual hallucinations.\nImportantly, this is achieved without requiring additional training modules and\nwith fewer training parameters. We extensively evaluated DEEM on both our newly\nconstructed RobustVQA benchmark and another well-known benchmark, POPE, for\nobject hallucination. Compared to the state-of-the-art interleaved content\ngeneration models, DEEM exhibits enhanced robustness and a superior capacity to\nalleviate model hallucinations while utilizing fewer trainable parameters, less\npre-training data (10%), and a smaller base model size.\n","authors":["Run Luo","Yunshui Li","Longze Chen","Wanwei He","Ting-En Lin","Ziqiang Liu","Lei Zhang","Zikai Song","Xiaobo Xia","Tongliang Liu","Min Yang","Binyuan Hui"],"pdf_url":"https://arxiv.org/pdf/2405.15232v2.pdf","comment":"25 pages. arXiv admin note: text overlap with arXiv:2401.10208 by\n other authors"},{"id":"http://arxiv.org/abs/2407.03292v1","updated":"2024-07-03T17:26:07Z","published":"2024-07-03T17:26:07Z","title":"Biomechanics-informed Non-rigid Medical Image Registration and its\n Inverse Material Property Estimation with Linear and Nonlinear Elasticity","summary":" This paper investigates both biomechanical-constrained non-rigid medical\nimage registrations and accurate identifications of material properties for\nsoft tissues, using physics-informed neural networks (PINNs). The complex\nnonlinear elasticity theory is leveraged to formally establish the partial\ndifferential equations (PDEs) representing physics laws of biomechanical\nconstraints that need to be satisfied, with which registration and\nidentification tasks are treated as forward (i.e., data-driven solutions of\nPDEs) and inverse (i.e., parameter estimation) problems under PINNs\nrespectively. Two net configurations (i.e., Cfg1 and Cfg2) have also been\ncompared for both linear and nonlinear physics model. 
Two sets of experiments\nhave been conducted, using pairs of undeformed and deformed MR images from\nclinical cases of prostate cancer biopsy.\n Our contributions are summarised as follows. 1) We developed a learning-based\nbiomechanical-constrained non-rigid registration algorithm using PINNs, where\nlinear elasticity is generalised to the nonlinear version. 2) We demonstrated\nextensively that nonlinear elasticity shows no statistical significance against\nlinear models in computing point-wise displacement vectors but their respective\nbenefits may depend on specific patients, with finite-element (FE) computed\nground-truth. 3) We formulated and solved the inverse parameter estimation\nproblem, under the joint optimisation scheme of registration and parameter\nidentification using PINNs, whose solutions can be accurately found by locating\nsaddle points.\n","authors":["Zhe Min","Zachary M. C. Baum","Shaheer U. Saeed","Mark Emberton","Dean C. Barratt","Zeike A. Taylor","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2407.03292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03291v1","updated":"2024-07-03T17:24:36Z","published":"2024-07-03T17:24:36Z","title":"VCHAR:Variance-Driven Complex Human Activity Recognition framework with\n Generative Representation","summary":" Complex human activity recognition (CHAR) remains a pivotal challenge within\nubiquitous computing, especially in the context of smart environments. Existing\nstudies typically require meticulous labeling of both atomic and complex\nactivities, a task that is labor-intensive and prone to errors due to the\nscarcity and inaccuracies of available datasets. Most prior research has\nfocused on datasets that either precisely label atomic activities or, at\nminimum, their sequence approaches that are often impractical in real world\nsettings.In response, we introduce VCHAR (Variance-Driven Complex Human\nActivity Recognition), a novel framework that treats the outputs of atomic\nactivities as a distribution over specified intervals. Leveraging generative\nmethodologies, VCHAR elucidates the reasoning behind complex activity\nclassifications through video-based explanations, accessible to users without\nprior machine learning expertise. Our evaluation across three publicly\navailable datasets demonstrates that VCHAR enhances the accuracy of complex\nactivity recognition without necessitating precise temporal or sequential\nlabeling of atomic activities. Furthermore, user studies confirm that VCHAR's\nexplanations are more intelligible compared to existing methods, facilitating a\nbroader understanding of complex activity recognition among non-experts.\n","authors":["Yuan Sun","Navid Salami Pargoo","Taqiya Ehsan","Zhao Zhang Jorge Ortiz"],"pdf_url":"https://arxiv.org/pdf/2407.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04730v2","updated":"2024-07-03T17:04:14Z","published":"2024-01-09T18:59:49Z","title":"A Simple Baseline for Spoken Language to Sign Language Translation with\n 3D Avatars","summary":" The objective of this paper is to develop a functional system for translating\nspoken languages into sign languages, referred to as Spoken2Sign translation.\nThe Spoken2Sign task is orthogonal and complementary to traditional sign\nlanguage to spoken language (Sign2Spoken) translation. 
To enable Spoken2Sign\ntranslation, we present a simple baseline consisting of three steps: 1)\ncreating a gloss-video dictionary using existing Sign2Spoken benchmarks; 2)\nestimating a 3D sign for each sign video in the dictionary; 3) training a\nSpoken2Sign model, which is composed of a Text2Gloss translator, a sign\nconnector, and a rendering module, with the aid of the yielded gloss-3D sign\ndictionary. The translation results are then displayed through a sign avatar.\nAs far as we know, we are the first to present the Spoken2Sign task in an\noutput format of 3D signs. In addition to its capability of Spoken2Sign\ntranslation, we also demonstrate that two by-products of our approach-3D\nkeypoint augmentation and multi-view understanding-can assist in keypoint-based\nsign language understanding. Code and models are available at\nhttps://github.com/FangyunWei/SLRT.\n","authors":["Ronglai Zuo","Fangyun Wei","Zenggui Chen","Brian Mak","Jiaolong Yang","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2401.04730v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.03268v1","updated":"2024-07-03T16:57:38Z","published":"2024-07-03T16:57:38Z","title":"For a semiotic AI: Bridging computer vision and visual semiotics for\n computational observation of large scale facial image archives","summary":" Social networks are creating a digital world in which the cognitive,\nemotional, and pragmatic value of the imagery of human faces and bodies is\narguably changing. However, researchers in the digital humanities are often\nill-equipped to study these phenomena at scale. This work presents FRESCO (Face\nRepresentation in E-Societies through Computational Observation), a framework\ndesigned to explore the socio-cultural implications of images on social media\nplatforms at scale. FRESCO deconstructs images into numerical and categorical\nvariables using state-of-the-art computer vision techniques, aligning with the\nprinciples of visual semiotics. The framework analyzes images across three\nlevels: the plastic level, encompassing fundamental visual features like lines\nand colors; the figurative level, representing specific entities or concepts;\nand the enunciation level, which focuses particularly on constructing the point\nof view of the spectator and observer. These levels are analyzed to discern\ndeeper narrative layers within the imagery. Experimental validation confirms\nthe reliability and utility of FRESCO, and we assess its consistency and\nprecision across two public datasets. Subsequently, we introduce the FRESCO\nscore, a metric derived from the framework's output that serves as a reliable\nmeasure of similarity in image content.\n","authors":["Lia Morra","Antonio Santangelo","Pietro Basci","Luca Piano","Fabio Garcea","Fabrizio Lamberti","Massimo Leone"],"pdf_url":"https://arxiv.org/pdf/2407.03268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03263v1","updated":"2024-07-03T16:50:07Z","published":"2024-07-03T16:50:07Z","title":"A Unified Framework for 3D Scene Understanding","summary":" We propose UniSeg3D, a unified 3D segmentation framework that achieves\npanoptic, semantic, instance, interactive, referring, and open-vocabulary\nsemantic segmentation tasks within a single model. Most previous 3D\nsegmentation approaches are specialized for a specific task, thereby limiting\ntheir understanding of 3D scenes to a task-specific perspective. In contrast,\nthe proposed method unifies six tasks into unified representations processed by\nthe same Transformer. 
It facilitates inter-task knowledge sharing and,\ntherefore, promotes comprehensive 3D scene understanding. To take advantage of\nmulti-task unification, we enhance the performance by leveraging task\nconnections. Specifically, we design a knowledge distillation method and a\ncontrastive learning method to transfer task-specific knowledge across\ndifferent tasks. Benefiting from extensive inter-task knowledge sharing, our\nUniSeg3D becomes more powerful. Experiments on three benchmarks, including the\nScanNet20, ScanRefer, and ScanNet200, demonstrate that the UniSeg3D\nconsistently outperforms current SOTA methods, even those specialized for\nindividual tasks. We hope UniSeg3D can serve as a solid unified baseline and\ninspire future work. The code will be available at\nhttps://dk-liang.github.io/UniSeg3D/.\n","authors":["Wei Xu","Chunsheng Shi","Sifan Tu","Xin Zhou","Dingkang Liang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2407.03263v1.pdf","comment":"The code will be available at https://dk-liang.github.io/UniSeg3D/"},{"id":"http://arxiv.org/abs/2407.00983v2","updated":"2024-07-03T16:37:36Z","published":"2024-07-01T05:47:58Z","title":"FairMedFM: Fairness Benchmarking for Medical Imaging Foundation Models","summary":" The advent of foundation models (FMs) in healthcare offers unprecedented\nopportunities to enhance medical diagnostics through automated classification\nand segmentation tasks. However, these models also raise significant concerns\nabout their fairness, especially when applied to diverse and underrepresented\npopulations in healthcare applications. Currently, there is a lack of\ncomprehensive benchmarks, standardized pipelines, and easily adaptable\nlibraries to evaluate and understand the fairness performance of FMs in medical\nimaging, leading to considerable challenges in formulating and implementing\nsolutions that ensure equitable outcomes across diverse patient populations. To\nfill this gap, we introduce FairMedFM, a fairness benchmark for FM research in\nmedical imaging.FairMedFM integrates with 17 popular medical imaging datasets,\nencompassing different modalities, dimensionalities, and sensitive attributes.\nIt explores 20 widely used FMs, with various usages such as zero-shot learning,\nlinear probing, parameter-efficient fine-tuning, and prompting in various\ndownstream tasks -- classification and segmentation. Our exhaustive analysis\nevaluates the fairness performance over different evaluation metrics from\nmultiple perspectives, revealing the existence of bias, varied utility-fairness\ntrade-offs on different FMs, consistent disparities on the same datasets\nregardless FMs, and limited effectiveness of existing unfairness mitigation\nmethods. Checkout FairMedFM's project page and open-sourced codebase, which\nsupports extendible functionalities and applications as well as inclusive for\nstudies on FMs in medical imaging over the long term.\n","authors":["Ruinan Jin","Zikang Xu","Yuan Zhong","Qiongsong Yao","Qi Dou","S. Kevin Zhou","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2407.00983v2.pdf","comment":"29 pages, 17 figures"},{"id":"http://arxiv.org/abs/2407.03251v1","updated":"2024-07-03T16:33:31Z","published":"2024-07-03T16:33:31Z","title":"ACTRESS: Active Retraining for Semi-supervised Visual Grounding","summary":" Semi-Supervised Visual Grounding (SSVG) is a new challenge for its sparse\nlabeled data with the need for multimodel understanding. 
A previous study,\nRefTeacher, makes the first attempt to tackle this task by adopting the\nteacher-student framework to provide pseudo confidence supervision and\nattention-based supervision. However, this approach is incompatible with\ncurrent state-of-the-art visual grounding models, which follow the\nTransformer-based pipeline. These pipelines directly regress results without\nregion proposals or foreground binary classification, rendering them unsuitable\nfor fitting in RefTeacher due to the absence of confidence scores. Furthermore,\nthe geometric difference in teacher and student inputs, stemming from different\ndata augmentations, induces natural misalignment in attention-based\nconstraints. To establish a compatible SSVG framework, our paper proposes the\nACTive REtraining approach for Semi-Supervised Visual Grounding, abbreviated as\nACTRESS. Initially, the model is enhanced by incorporating an additional\nquantized detection head to expose its detection confidence. Building upon\nthis, ACTRESS consists of an active sampling strategy and a selective\nretraining strategy. The active sampling strategy iteratively selects\nhigh-quality pseudo labels by evaluating three crucial aspects: Faithfulness,\nRobustness, and Confidence, optimizing the utilization of unlabeled data. The\nselective retraining strategy retrains the model with periodic\nre-initialization of specific parameters, facilitating the model's escape from\nlocal minima. Extensive experiments demonstrates our superior performance on\nwidely-used benchmark datasets.\n","authors":["Weitai Kang","Mengxue Qu","Yunchao Wei","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2407.03251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03243v1","updated":"2024-07-03T16:14:09Z","published":"2024-07-03T16:14:09Z","title":"Visual Grounding with Attention-Driven Constraint Balancing","summary":" Unlike Object Detection, Visual Grounding task necessitates the detection of\nan object described by complex free-form language. To simultaneously model such\ncomplex semantic and visual representations, recent state-of-the-art studies\nadopt transformer-based models to fuse features from both modalities, further\nintroducing various modules that modulate visual features to align with the\nlanguage expressions and eliminate the irrelevant redundant information.\nHowever, their loss function, still adopting common Object Detection losses,\nsolely governs the bounding box regression output, failing to fully optimize\nfor the above objectives. To tackle this problem, in this paper, we first\nanalyze the attention mechanisms of transformer-based models. Building upon\nthis, we further propose a novel framework named Attention-Driven Constraint\nBalancing (AttBalance) to optimize the behavior of visual features within\nlanguage-relevant regions. Extensive experimental results show that our method\nbrings impressive improvements. Specifically, we achieve constant improvements\nover five different models evaluated on four different benchmarks. 
Moreover, we\nattain a new state-of-the-art performance by integrating our method into QRNet.\n","authors":["Weitai Kang","Luowei Zhou","Junyi Wu","Changchang Sun","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2407.03243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03240v1","updated":"2024-07-03T16:10:19Z","published":"2024-07-03T16:10:19Z","title":"Cyclic Refiner: Object-Aware Temporal Representation Learning for\n Multi-View 3D Detection and Tracking","summary":" We propose a unified object-aware temporal learning framework for multi-view\n3D detection and tracking tasks. Having observed that the efficacy of the\ntemporal fusion strategy in recent multi-view perception methods may be\nweakened by distractors and background clutters in historical frames, we\npropose a cyclic learning mechanism to improve the robustness of multi-view\nrepresentation learning. The essence is constructing a backward bridge to\npropagate information from model predictions (e.g., object locations and sizes)\nto image and BEV features, which forms a circle with regular inference. After\nbackward refinement, the responses of target-irrelevant regions in historical\nframes would be suppressed, decreasing the risk of polluting future frames and\nimproving the object awareness ability of temporal fusion. We further tailor an\nobject-aware association strategy for tracking based on the cyclic learning\nmodel. The cyclic learning model not only provides refined features, but also\ndelivers finer clues (e.g., scale level) for tracklet association. The proposed\ncycle learning method and association module together contribute a novel and\nunified multi-task framework. Experiments on nuScenes show that the proposed\nmodel achieves consistent performance gains over baselines of different designs\n(i.e., dense query-based BEVFormer, sparse query-based SparseBEV and LSS-based\nBEVDet4D) on both detection and tracking evaluation.\n","authors":["Mingzhe Guo","Zhipeng Zhang","Liping Jing","Yuan He","Ke Wang","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2407.03240v1.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2407.03239v1","updated":"2024-07-03T16:09:59Z","published":"2024-07-03T16:09:59Z","title":"Solving the inverse problem of microscopy deconvolution with a residual\n Beylkin-Coifman-Rokhlin neural network","summary":" Optic deconvolution in light microscopy (LM) refers to recovering the object\ndetails from images, revealing the ground truth of samples. Traditional\nexplicit methods in LM rely on the point spread function (PSF) during image\nacquisition. Yet, these approaches often fall short due to inaccurate PSF\nmodels and noise artifacts, hampering the overall restoration quality. In this\npaper, we approached the optic deconvolution as an inverse problem. Motivated\nby the nonstandard-form compression scheme introduced by Beylkin, Coifman, and\nRokhlin (BCR), we proposed an innovative physics-informed neural network\nMulti-Stage Residual-BCR Net (m-rBCR) to approximate the optic deconvolution.\nWe validated the m-rBCR model on four microscopy datasets - two simulated\nmicroscopy datasets from ImageNet and BioSR, real dSTORM microscopy images, and\nreal widefield microscopy images. In contrast to the explicit deconvolution\nmethods (e.g. 
Richardson-Lucy) and other state-of-the-art NN models (U-Net,\nDDPM, CARE, DnCNN, ESRGAN, RCAN, Noise2Noise, MPRNet, and MIMO-U-Net), the\nm-rBCR model demonstrates superior performance to other candidates by PSNR and\nSSIM in two real microscopy datasets and the simulated BioSR dataset. In the\nsimulated ImageNet dataset, m-rBCR ranks the second-best place (right after\nMIMO-U-Net). With the backbone from the optical physics, m-rBCR exploits the\ntrainable parameters with better performances (from ~30 times fewer than the\nbenchmark MIMO-U-Net to ~210 times than ESRGAN). This enables m-rBCR to achieve\na shorter runtime (from ~3 times faster than MIMO-U-Net to ~300 times faster\nthan DDPM). To summarize, by leveraging physics constraints our model reduced\npotentially redundant parameters significantly in expertise-oriented NN\ncandidates and achieved high efficiency with superior performance.\n","authors":["Rui Li","Mikhail Kudryashev","Artur Yakimovich"],"pdf_url":"https://arxiv.org/pdf/2407.03239v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.03217v1","updated":"2024-07-03T15:45:48Z","published":"2024-07-03T15:45:48Z","title":"MHNet: Multi-view High-order Network for Diagnosing Neurodevelopmental\n Disorders Using Resting-state fMRI","summary":" Background: Deep learning models have shown promise in diagnosing\nneurodevelopmental disorders (NDD) like ASD and ADHD. However, many models\neither use graph neural networks (GNN) to construct single-level brain\nfunctional networks (BFNs) or employ spatial convolution filtering for local\ninformation extraction from rs-fMRI data, often neglecting high-order features\ncrucial for NDD classification. Methods: We introduce a Multi-view High-order\nNetwork (MHNet) to capture hierarchical and high-order features from multi-view\nBFNs derived from rs-fMRI data for NDD prediction. MHNet has two branches: the\nEuclidean Space Features Extraction (ESFE) module and the Non-Euclidean Space\nFeatures Extraction (Non-ESFE) module, followed by a Feature Fusion-based\nClassification (FFC) module for NDD identification. ESFE includes a Functional\nConnectivity Generation (FCG) module and a High-order Convolutional Neural\nNetwork (HCNN) module to extract local and high-order features from BFNs in\nEuclidean space. Non-ESFE comprises a Generic Internet-like Brain Hierarchical\nNetwork Generation (G-IBHN-G) module and a High-order Graph Neural Network\n(HGNN) module to capture topological and high-order features in non-Euclidean\nspace. Results: Experiments on three public datasets show that MHNet\noutperforms state-of-the-art methods using both AAL1 and Brainnetome Atlas\ntemplates. Extensive ablation studies confirm the superiority of MHNet and the\neffectiveness of using multi-view fMRI information and high-order features. Our\nstudy also offers atlas options for constructing more sophisticated\nhierarchical networks and explains the association between key brain regions\nand NDD. 
Conclusion: MHNet leverages multi-view feature learning from both\nEuclidean and non-Euclidean spaces, incorporating high-order information from\nBFNs to enhance NDD classification performance.\n","authors":["Yueyang Li","Weiming Zeng","Wenhao Dong","Luhui Cai","Lei Wang","Hongyu Chen","Hongjie Yan","Lingbin Bian","Nizhuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03217v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2310.05873v5","updated":"2024-07-03T15:45:26Z","published":"2023-10-09T17:13:10Z","title":"Implicit Concept Removal of Diffusion Models","summary":" Text-to-image (T2I) diffusion models often inadvertently generate unwanted\nconcepts such as watermarks and unsafe images. These concepts, termed as the\n\"implicit concepts\", could be unintentionally learned during training and then\nbe generated uncontrollably during inference. Existing removal methods still\nstruggle to eliminate implicit concepts primarily due to their dependency on\nthe model's ability to recognize concepts it actually can not discern. To\naddress this, we utilize the intrinsic geometric characteristics of implicit\nconcepts and present the Geom-Erasing, a novel concept removal method based on\nthe geometric-driven control. Specifically, once an unwanted implicit concept\nis identified, we integrate the existence and geometric information of the\nconcept into the text prompts with the help of an accessible classifier or\ndetector model. Subsequently, the model is optimized to identify and\ndisentangle this information, which is then adopted as negative prompts during\ngeneration. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel\nimage-text dataset imbued with three typical implicit concepts (i.e., QR codes,\nwatermarks, and text), reflecting real-life situations where implicit concepts\nare easily injected. Geom-Erasing effectively mitigates the generation of\nimplicit concepts, achieving the state-of-the-art results on the Inappropriate\nImage Prompts (I2P) and our challenging Implicit Concept Dataset (ICD)\nbenchmarks.\n","authors":["Zhili Liu","Kai Chen","Yifan Zhang","Jianhua Han","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.05873v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03216v1","updated":"2024-07-03T15:43:54Z","published":"2024-07-03T15:43:54Z","title":"Learning Disentangled Representation in Object-Centric Models for Visual\n Dynamics Prediction via Transformers","summary":" Recent work has shown that object-centric representations can greatly help\nimprove the accuracy of learning dynamics while also bringing interpretability.\nIn this work, we take this idea one step further, ask the following question:\n\"can learning disentangled representation further improve the accuracy of\nvisual dynamics prediction in object-centric models?\" While there has been some\nattempt to learn such disentangled representations for the case of static\nimages \\citep{nsb}, to the best of our knowledge, ours is the first work which\ntries to do this in a general setting for video, without making any specific\nassumptions about the kind of attributes that an object might have. The key\nbuilding block of our architecture is the notion of a {\\em block}, where\nseveral blocks together constitute an object. Each block is represented as a\nlinear combination of a given number of learnable concept vectors, which is\niteratively refined during the learning process. 
The blocks in our model are\ndiscovered in an unsupervised manner, by attending over object masks, in a\nstyle similar to discovery of slots \\citep{slot_attention}, for learning a\ndense object-centric representation. We employ self-attention via transformers\nover the discovered blocks to predict the next state resulting in discovery of\nvisual dynamics. We perform a series of experiments on several benchmark 2-D,\nand 3-D datasets demonstrating that our architecture (1) can discover\nsemantically meaningful blocks (2) help improve accuracy of dynamics prediction\ncompared to SOTA object-centric models (3) perform significantly better in OOD\nsetting where the specific attribute combinations are not seen earlier during\ntraining. Our experiments highlight the importance discovery of disentangled\nrepresentation for visual dynamics prediction.\n","authors":["Sanket Gandhi"," Atul","Samanyu Mahajan","Vishal Sharma","Rushil Gupta","Arnab Kumar Mondal","Parag Singla"],"pdf_url":"https://arxiv.org/pdf/2407.03216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03205v1","updated":"2024-07-03T15:36:47Z","published":"2024-07-03T15:36:47Z","title":"Category-Aware Dynamic Label Assignment with High-Quality Oriented\n Proposal","summary":" Objects in aerial images are typically embedded in complex backgrounds and\nexhibit arbitrary orientations. When employing oriented bounding boxes (OBB) to\nrepresent arbitrary oriented objects, the periodicity of angles could lead to\ndiscontinuities in label regression values at the boundaries, inducing abrupt\nfluctuations in the loss function. To address this problem, an OBB\nrepresentation based on the complex plane is introduced in the oriented\ndetection framework, and a trigonometric loss function is proposed. Moreover,\nleveraging prior knowledge of complex background environments and significant\ndifferences in large objects in aerial images, a conformer RPN head is\nconstructed to predict angle information. The proposed loss function and\nconformer RPN head jointly generate high-quality oriented proposals. A\ncategory-aware dynamic label assignment based on predicted category feedback is\nproposed to address the limitations of solely relying on IoU for proposal label\nassignment. This method makes negative sample selection more representative,\nensuring consistency between classification and regression features.\nExperiments were conducted on four realistic oriented detection datasets, and\nthe results demonstrate superior performance in oriented object detection with\nminimal parameter tuning and time costs. Specifically, mean average precision\n(mAP) scores of 82.02%, 71.99%, 69.87%, and 98.77% were achieved on the\nDOTA-v1.0, DOTA-v1.5, DIOR-R, and HRSC2016 datasets, respectively.\n","authors":["Mingkui Feng","Hancheng Yu","Xiaoyu Dang","Ming Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.03205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03204v1","updated":"2024-07-03T15:36:27Z","published":"2024-07-03T15:36:27Z","title":"Expressive Gaussian Human Avatars from Monocular RGB Video","summary":" Nuanced expressiveness, particularly through fine-grained hand and facial\nexpressions, is pivotal for enhancing the realism and vitality of digital human\nrepresentations. In this work, we focus on investigating the expressiveness of\nhuman avatars when learned from monocular RGB video; a setting that introduces\nnew challenges in capturing and animating fine-grained details. 
To this end, we\nintroduce EVA, a drivable human model that meticulously sculpts fine details\nbased on 3D Gaussians and SMPL-X, an expressive parametric human model. Focused\non enhancing expressiveness, our work makes three key contributions. First, we\nhighlight the critical importance of aligning the SMPL-X model with RGB frames\nfor effective avatar learning. Recognizing the limitations of current SMPL-X\nprediction methods for in-the-wild videos, we introduce a plug-and-play module\nthat significantly ameliorates misalignment issues. Second, we propose a\ncontext-aware adaptive density control strategy, which adaptively adjusts\nthe gradient thresholds to accommodate the varied granularity across body\nparts. Last but not least, we develop a feedback mechanism that predicts\nper-pixel confidence to better guide the learning of 3D Gaussians. Extensive\nexperiments on two benchmarks demonstrate the superiority of our framework both\nquantitatively and qualitatively, especially on the fine-grained hand and\nfacial details. See the project website at \\url{https://evahuman.github.io}\n","authors":["Hezhen Hu","Zhiwen Fan","Tianhao Wu","Yihan Xi","Seoyoung Lee","Georgios Pavlakos","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03200v1","updated":"2024-07-03T15:30:45Z","published":"2024-07-03T15:30:45Z","title":"SegVG: Transferring Object Bounding Box to Segmentation for Visual\n Grounding","summary":" Different from Object Detection, Visual Grounding deals with detecting a\nbounding box for each text-image pair. This one box per text-image pair\nprovides sparse supervision signals. Although previous works achieve impressive\nresults, their passive utilization of annotation, i.e., the sole use of the box\nannotation as regression ground truth, results in suboptimal performance. In\nthis paper, we present SegVG, a novel method that transfers the box-level annotation\nas Segmentation signals to provide additional pixel-level supervision for\nVisual Grounding. Specifically, we propose the Multi-layer Multi-task\nEncoder-Decoder as the target grounding stage, where we learn a regression\nquery and multiple segmentation queries to ground the target by regression and\nsegmentation of the box in each decoding layer, respectively. This approach\nallows us to iteratively exploit the annotation as signals for both box-level\nregression and pixel-level segmentation. Moreover, as the backbones are\ntypically initialized by pretrained parameters learned from unimodal tasks and\nthe queries for both regression and segmentation are static learnable\nembeddings, a domain discrepancy remains among these three types of features,\nwhich impairs subsequent target grounding. 
To mitigate this discrepancy, we\nintroduce the Triple Alignment module, where the query, text, and vision tokens\nare triangularly updated to share the same space by triple attention mechanism.\nExtensive experiments on five widely used datasets validate our\nstate-of-the-art (SOTA) performance.\n","authors":["Weitai Kang","Gaowen Liu","Mubarak Shah","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2407.03200v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.03197v1","updated":"2024-07-03T15:29:10Z","published":"2024-07-03T15:29:10Z","title":"DyFADet: Dynamic Feature Aggregation for Temporal Action Detection","summary":" Recent proposed neural network-based Temporal Action Detection (TAD) models\nare inherently limited to extracting the discriminative representations and\nmodeling action instances with various lengths from complex scenes by\nshared-weights detection heads. Inspired by the successes in dynamic neural\nnetworks, in this paper, we build a novel dynamic feature aggregation (DFA)\nmodule that can simultaneously adapt kernel weights and receptive fields at\ndifferent timestamps. Based on DFA, the proposed dynamic encoder layer\naggregates the temporal features within the action time ranges and guarantees\nthe discriminability of the extracted representations. Moreover, using DFA\nhelps to develop a Dynamic TAD head (DyHead), which adaptively aggregates the\nmulti-scale features with adjusted parameters and learned receptive fields\nbetter to detect the action instances with diverse ranges from videos. With the\nproposed encoder layer and DyHead, a new dynamic TAD model, DyFADet, achieves\npromising performance on a series of challenging TAD benchmarks, including\nHACS-Segment, THUMOS14, ActivityNet-1.3, Epic-Kitchen 100, Ego4D-Moment\nQueriesV1.0, and FineAction. Code is released to\nhttps://github.com/yangle15/DyFADet-pytorch.\n","authors":["Le Yang","Ziwei Zheng","Yizeng Han","Hao Cheng","Shiji Song","Gao Huang","Fan Li"],"pdf_url":"https://arxiv.org/pdf/2407.03197v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2405.00630v3","updated":"2024-07-03T15:23:00Z","published":"2024-05-01T16:55:08Z","title":"Depth Priors in Removal Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have achieved impressive results in 3D\nreconstruction and novel view generation. A significant challenge within NeRF\ninvolves editing reconstructed 3D scenes, such as object removal, which demands\nconsistency across multiple views and the synthesis of high-quality\nperspectives. Previous studies have integrated depth priors, typically sourced\nfrom LiDAR or sparse depth estimates from COLMAP, to enhance NeRF's performance\nin object removal. However, these methods are either expensive or\ntime-consuming. This paper proposes a new pipeline that leverages SpinNeRF and\nmonocular depth estimation models like ZoeDepth to enhance NeRF's performance\nin complex object removal with improved efficiency. A thorough evaluation of\nCOLMAP's dense depth reconstruction on the KITTI dataset is conducted to\ndemonstrate that COLMAP can be viewed as a cost-effective and scalable\nalternative for acquiring depth ground truth compared to traditional methods\nlike LiDAR. This serves as the basis for evaluating the performance of\nmonocular depth estimation models to determine the best one for generating\ndepth priors for SpinNeRF. 
The new pipeline is tested in various scenarios\ninvolving 3D reconstruction and object removal, and the results indicate that\nour pipeline significantly reduces the time required for the acquisition of\ndepth priors for object removal and enhances the fidelity of the synthesized\nviews, suggesting substantial potential for building high-fidelity digital twin\nsystems with increased efficiency in the future.\n","authors":["Zhihao Guo","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.00630v3.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2310.06737v2","updated":"2024-07-03T15:15:46Z","published":"2023-10-10T16:07:23Z","title":"Multi-domain improves out-of-distribution and data-limited scenarios for\n medical image analysis","summary":" Current machine learning methods for medical image analysis primarily focus\non developing models tailored for their specific tasks, utilizing data within\ntheir target domain. These specialized models tend to be data-hungry and often\nexhibit limitations in generalizing to out-of-distribution samples. In this\nwork, we show that employing models that incorporate multiple domains instead\nof specialized ones significantly alleviates the limitations observed in\nspecialized models. We refer to this approach as multi-domain model and compare\nits performance to that of specialized models. For this, we introduce the\nincorporation of diverse medical image domains, including different imaging\nmodalities like X-ray, MRI, CT, and ultrasound images, as well as various\nviewpoints such as axial, coronal, and sagittal views. Our findings underscore\nthe superior generalization capabilities of multi-domain models, particularly\nin scenarios characterized by limited data availability and\nout-of-distribution, frequently encountered in healthcare applications. The\nintegration of diverse data allows multi-domain models to utilize information\nacross domains, enhancing the overall outcomes substantially. To illustrate,\nfor organ recognition, multi-domain model can enhance accuracy by up to 8%\ncompared to conventional specialized models.\n","authors":["Ece Ozkan","Xavier Boix"],"pdf_url":"https://arxiv.org/pdf/2310.06737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18279v2","updated":"2024-07-03T14:59:57Z","published":"2024-04-28T18:51:32Z","title":"Out-of-distribution Detection in Medical Image Analysis: A survey","summary":" Computer-aided diagnostics has benefited from the development of deep\nlearning-based computer vision techniques in these years. Traditional\nsupervised deep learning methods assume that the test sample is drawn from the\nidentical distribution as the training data. However, it is possible to\nencounter out-of-distribution samples in real-world clinical scenarios, which\nmay cause silent failure in deep learning-based medical image analysis tasks.\nRecently, research has explored various out-of-distribution (OOD) detection\nsituations and techniques to enable a trustworthy medical AI system. In this\nsurvey, we systematically review the recent advances in OOD detection in\nmedical image analysis. We first explore several factors that may cause a\ndistributional shift when using a deep-learning-based model in clinic\nscenarios, with three different types of distributional shift well defined on\ntop of these factors. Then a framework is suggested to categorize and feature\nexisting solutions, while the previous studies are reviewed based on the\nmethodology taxonomy. 
Our discussion also includes evaluation protocols and\nmetrics, as well as the challenge and a research direction lack of exploration.\n","authors":["Zesheng Hong","Yubiao Yue","Yubin Chen","Lele Cong","Huanjie Lin","Yuanmei Luo","Mini Han Wang","Weidong Wang","Jialong Xu","Xiaoqi Yang","Hechang Chen","Zhenzhang Li","Sihong Xie"],"pdf_url":"https://arxiv.org/pdf/2404.18279v2.pdf","comment":"23 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.03179v1","updated":"2024-07-03T14:59:46Z","published":"2024-07-03T14:59:46Z","title":"Motion meets Attention: Video Motion Prompts","summary":" Videos contain rich spatio-temporal information. Traditional methods for\nextracting motion, used in tasks such as action recognition, often rely on\nvisual contents rather than precise motion features. This phenomenon is\nreferred to as 'blind motion extraction' behavior, which proves inefficient in\ncapturing motions of interest due to a lack of motion-guided cues. Recently,\nattention mechanisms have enhanced many computer vision tasks by effectively\nhighlighting salient visual areas. Inspired by this, we propose using a\nmodified Sigmoid function with learnable slope and shift parameters as an\nattention mechanism to activate and modulate motion signals derived from frame\ndifferencing maps. This approach generates a sequence of attention maps that\nenhance the processing of motion-related video content. To ensure temporally\ncontinuity and smoothness of the attention maps, we apply pair-wise temporal\nattention variation regularization to remove unwanted motions (e.g., noise)\nwhile preserving important ones. We then perform Hadamard product between each\npair of attention maps and the original video frames to highlight the evolving\nmotions of interest over time. These highlighted motions, termed video motion\nprompts, are subsequently used as inputs to the model instead of the original\nvideo frames. We formalize this process as a motion prompt layer and\nincorporate the regularization term into the loss function to learn better\nmotion prompts. This layer serves as an adapter between the model and the video\ndata, bridging the gap between traditional 'blind motion extraction' and the\nextraction of relevant motions of interest.\n","authors":["Qixiang Chen","Lei Wang","Piotr Koniusz","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2407.03179v1.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2309.07087v2","updated":"2024-07-03T14:58:56Z","published":"2023-09-13T16:59:50Z","title":"Developing a Novel Image Marker to Predict the Clinical Outcome of\n Neoadjuvant Chemotherapy (NACT) for Ovarian Cancer Patients","summary":" Objective Neoadjuvant chemotherapy (NACT) is one kind of treatment for\nadvanced stage ovarian cancer patients. However, due to the nature of tumor\nheterogeneity, the clinical outcomes to NACT vary significantly among different\nsubgroups. Partial responses to NACT may lead to suboptimal debulking surgery,\nwhich will result in adverse prognosis. To address this clinical challenge, the\npurpose of this study is to develop a novel image marker to achieve high\naccuracy prognosis prediction of NACT at an early stage. Methods For this\npurpose, we first computed a total of 1373 radiomics features to quantify the\ntumor characteristics, which can be grouped into three categories: geometric,\nintensity, and texture features. Second, all these features were optimized by\nprincipal component analysis algorithm to generate a compact and informative\nfeature cluster. 
This cluster was used as input for developing and optimizing\nsupport vector machine (SVM) based classifiers, which indicated the likelihood\nof receiving suboptimal cytoreduction after the NACT treatment. Two different\nkernels for the SVM algorithm were explored and compared. A total of 42 ovarian\ncancer cases were retrospectively collected to validate the scheme. A nested\nleave-one-out cross-validation framework was adopted for model performance\nassessment. Results The results demonstrated that the model with a Gaussian\nradial basis function kernel SVM yielded an AUC (area under the ROC [receiver\noperating characteristic] curve) of 0.806. Meanwhile, this model achieved an\noverall accuracy (ACC) of 83.3%, positive predictive value (PPV) of 81.8%, and\nnegative predictive value (NPV) of 83.9%. Conclusion This study provides\nmeaningful information for the development of radiomics-based image markers for\nNACT treatment outcome prediction.\n","authors":["Ke Zhang","Neman Abdoli","Patrik Gilley","Youkabed Sadri","Xuxin Chen","Theresa C. Thai","Lauren Dockery","Kathleen Moore","Robert S. Mannel","Yuchen Qiu"],"pdf_url":"https://arxiv.org/pdf/2309.07087v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03178v1","updated":"2024-07-03T14:58:40Z","published":"2024-07-03T14:58:40Z","title":"Relating CNN-Transformer Fusion Network for Change Detection","summary":" While deep learning, particularly convolutional neural networks (CNNs), has\nrevolutionized remote sensing (RS) change detection (CD), existing approaches\noften miss crucial features due to neglecting global context and incomplete\nchange learning. Additionally, transformer networks struggle with low-level\ndetails. RCTNet addresses these limitations by introducing \\textbf{(1)} an\nearly fusion backbone to exploit both spatial and temporal features early on,\n\\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal\nrepresentation, \\textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for\nenriched feature extraction in the decoder, and \\textbf{(4)} an Efficient\nSelf-deciphering Attention (ESA) module utilizing transformers to capture\nglobal information and fine-grained details for accurate change detection.\nExtensive experiments demonstrate RCTNet's clear superiority over traditional\nRS image CD methods, showing significant improvement and an optimal balance\nbetween accuracy and computational cost.\n","authors":["Yuhao Gao","Gensheng Pei","Mengmeng Sheng","Zeren Sun","Tao Chen","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.03178v1.pdf","comment":"accepted by IEEE Conference on Multimedia Expo"},{"id":"http://arxiv.org/abs/2407.01930v2","updated":"2024-07-03T14:51:09Z","published":"2024-07-02T03:49:48Z","title":"Self-Cooperation Knowledge Distillation for Novel Class Discovery","summary":" Novel Class Discovery (NCD) aims to discover unknown and novel classes in an\nunlabeled set by leveraging knowledge already learned about known classes.\nExisting works focus on instance-level or class-level knowledge representation\nand build a shared representation space to achieve performance improvements.\nHowever, a long-neglected issue is the potential imbalanced number of samples\nfrom known and novel classes, pushing the model towards dominant classes.\nTherefore, these methods suffer from a challenging trade-off between reviewing\nknown classes and discovering novel classes. 
Based on this observation, we\npropose a Self-Cooperation Knowledge Distillation (SCKD) method to utilize each\ntraining sample (whether known or novel, labeled or unlabeled) for both review\nand discovery. Specifically, the model's feature representations of known and\nnovel classes are used to construct two disjoint representation spaces. Through\nspatial mutual information, we design a self-cooperation learning to encourage\nmodel learning from the two feature representation spaces from itself.\nExtensive experiments on six datasets demonstrate that our method can achieve\nsignificant performance improvements, achieving state-of-the-art performance.\n","authors":["Yuzheng Wang","Zhaoyu Chen","Dingkang Yang","Yunquan Sun","Lizhe Qi"],"pdf_url":"https://arxiv.org/pdf/2407.01930v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.03172v1","updated":"2024-07-03T14:47:18Z","published":"2024-07-03T14:47:18Z","title":"IMC 2024 Methods & Solutions Review","summary":" For the past three years, Kaggle has been hosting the Image Matching\nChallenge, which focuses on solving a 3D image reconstruction problem using a\ncollection of 2D images. Each year, this competition fosters the development of\ninnovative and effective methodologies by its participants. In this paper, we\nintroduce an advanced ensemble technique that we developed, achieving a score\nof 0.153449 on the private leaderboard and securing the 160th position out of\nover 1,000 participants. Additionally, we conduct a comprehensive review of\nexisting methods and techniques employed by top-performing teams in the\ncompetition. Our solution, alongside the insights gathered from other leading\napproaches, contributes to the ongoing advancement in the field of 3D image\nreconstruction. This research provides valuable knowledge for future\nparticipants and researchers aiming to excel in similar image matching and\nreconstruction challenges.\n","authors":["Shyam Gupta","Dhanisha Sharma","Songling Huang"],"pdf_url":"https://arxiv.org/pdf/2407.03172v1.pdf","comment":"8 Pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.03168v1","updated":"2024-07-03T14:41:39Z","published":"2024-07-03T14:41:39Z","title":"LivePortrait: Efficient Portrait Animation with Stitching and\n Retargeting Control","summary":" Portrait Animation aims to synthesize a lifelike video from a single source\nimage, using it as an appearance reference, with motion (i.e., facial\nexpressions and head pose) derived from a driving video, audio, text, or\ngeneration. Instead of following mainstream diffusion-based methods, we explore\nand extend the potential of the implicit-keypoint-based framework, which\neffectively balances computational efficiency and controllability. Building\nupon this, we develop a video-driven portrait animation framework named\nLivePortrait with a focus on better generalization, controllability, and\nefficiency for practical usage. To enhance the generation quality and\ngeneralization ability, we scale up the training data to about 69 million\nhigh-quality frames, adopt a mixed image-video training strategy, upgrade the\nnetwork architecture, and design better motion transformation and optimization\nobjectives. Additionally, we discover that compact implicit keypoints can\neffectively represent a kind of blendshapes and meticulously propose a\nstitching and two retargeting modules, which utilize a small MLP with\nnegligible computational overhead, to enhance the controllability. 
Experimental\nresults demonstrate the efficacy of our framework even compared to\ndiffusion-based methods. The generation speed reaches a remarkable 12.8 ms on an\nRTX 4090 GPU with PyTorch. The inference code and models are available at\nhttps://github.com/KwaiVGI/LivePortrait\n","authors":["Jianzhu Guo","Dingyun Zhang","Xiaoqiang Liu","Zhizhou Zhong","Yuan Zhang","Pengfei Wan","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03165v1","updated":"2024-07-03T14:40:21Z","published":"2024-07-03T14:40:21Z","title":"Consistent Point Orientation for Manifold Surfaces via Boundary\n Integration","summary":" This paper introduces a new approach for generating globally consistent\nnormals for point clouds sampled from manifold surfaces. Given that the\ngeneralized winding number (GWN) field generated by a point cloud with globally\nconsistent normals is a solution to a PDE with jump boundary conditions and\npossesses harmonic properties, and the Dirichlet energy of the GWN field can be\ndefined as an integral over the boundary surface, we formulate a boundary\nenergy derived from the Dirichlet energy of the GWN. Taking as input a point\ncloud with randomly oriented normals, we optimize this energy to restore the\nglobal harmonicity of the GWN field, thereby recovering the globally consistent\nnormals. Experiments show that our method outperforms state-of-the-art\napproaches, exhibiting enhanced robustness to noise, outliers, complex\ntopologies, and thin structures. Our code can be found at\n\\url{https://github.com/liuweizhou319/BIM}.\n","authors":["Weizhou Liu","Xingce Wang","Haichuan Zhao","Xingfei Xue","Zhongke Wu","Xuequan Lu","Ying He"],"pdf_url":"https://arxiv.org/pdf/2407.03165v1.pdf","comment":"accepted in siggraph2024"},{"id":"http://arxiv.org/abs/2407.03163v1","updated":"2024-07-03T14:36:07Z","published":"2024-07-03T14:36:07Z","title":"Global Context Modeling in YOLOv8 for Pediatric Wrist Fracture Detection","summary":" Children often suffer wrist injuries in daily life, and radiologists usually\nneed to analyze and interpret the fracture X-ray images before surgical\ntreatment by surgeons. The development of deep learning has enabled neural\nnetwork models to work as computer-assisted diagnosis (CAD) tools to help\ndoctors and experts in diagnosis. Since YOLOv8 models have achieved\nsatisfactory success in object detection tasks, they have been applied to fracture\ndetection. The Global Context (GC) block effectively models the global context\nin a lightweight way, and incorporating it into YOLOv8 can greatly improve the\nmodel performance. This paper proposes the YOLOv8+GC model for fracture\ndetection, which is an improved version of the YOLOv8 model with the GC block.\nExperimental results demonstrate that compared to the original YOLOv8 model,\nthe proposed YOLOv8+GC model increases the mean average precision calculated at\nan intersection over union threshold of 0.5 (mAP 50) from 63.58% to 66.32% on the\nGRAZPEDWRI-DX dataset, achieving state-of-the-art (SOTA) performance. 
The\nimplementation code for this work is available on GitHub at\nhttps://github.com/RuiyangJu/YOLOv8_Global_Context_Fracture_Detection.\n","authors":["Rui-Yang Ju","Chun-Tse Chien","Chia-Min Lin","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.03163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03162v1","updated":"2024-07-03T14:35:35Z","published":"2024-07-03T14:35:35Z","title":"Bunny-VisionPro: Real-Time Bimanual Dexterous Teleoperation for\n Imitation Learning","summary":" Teleoperation is a crucial tool for collecting human demonstrations, but\ncontrolling robots with bimanual dexterous hands remains a challenge. Existing\nteleoperation systems struggle to handle the complexity of coordinating two\nhands for intricate manipulations. We introduce Bunny-VisionPro, a real-time\nbimanual dexterous teleoperation system that leverages a VR headset. Unlike\nprevious vision-based teleoperation systems, we design novel low-cost devices\nto provide haptic feedback to the operator, enhancing immersion. Our system\nprioritizes safety by incorporating collision and singularity avoidance while\nmaintaining real-time performance through innovative designs. Bunny-VisionPro\noutperforms prior systems on a standard task suite, achieving higher success\nrates and reduced task completion times. Moreover, the high-quality\nteleoperation demonstrations improve downstream imitation learning performance,\nleading to better generalizability. Notably, Bunny-VisionPro enables imitation\nlearning with challenging multi-stage, long-horizon dexterous manipulation\ntasks, which have rarely been addressed in previous work. Our system's ability\nto handle bimanual manipulations while prioritizing safety and real-time\nperformance makes it a powerful tool for advancing dexterous manipulation and\nimitation learning.\n","authors":["Runyu Ding","Yuzhe Qin","Jiyue Zhu","Chengzhe Jia","Shiqi Yang","Ruihan Yang","Xiaojuan Qi","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03162v1.pdf","comment":"project page: https://dingry.github.io/projects/bunny_visionpro.html"},{"id":"http://arxiv.org/abs/2407.03152v1","updated":"2024-07-03T14:30:47Z","published":"2024-07-03T14:30:47Z","title":"Stereo Risk: A Continuous Modeling Approach to Stereo Matching","summary":" We introduce Stereo Risk, a new deep-learning approach to solve the classical\nstereo-matching problem in computer vision. As it is well-known that stereo\nmatching boils down to a per-pixel disparity estimation problem, the popular\nstate-of-the-art stereo-matching approaches widely rely on regressing the scene\ndisparity values, yet via discretization of scene disparity values. Such\ndiscretization often fails to capture the nuanced, continuous nature of scene\ndepth. Stereo Risk departs from the conventional discretization approach by\nformulating the scene disparity as an optimal solution to a continuous risk\nminimization problem, hence the name \"stereo risk\". We demonstrate that $L^1$\nminimization of the proposed continuous risk function enhances stereo-matching\nperformance for deep networks, particularly for disparities with multi-modal\nprobability distributions. Furthermore, to enable the end-to-end network\ntraining of the non-differentiable $L^1$ risk optimization, we exploited the\nimplicit function theorem, ensuring a fully differentiable network. 
A\ncomprehensive analysis demonstrates our method's theoretical soundness and\nsuperior performance over the state-of-the-art methods across various benchmark\ndatasets, including KITTI 2012, KITTI 2015, ETH3D, SceneFlow, and Middlebury\n2014.\n","authors":["Ce Liu","Suryansh Kumar","Shuhang Gu","Radu Timofte","Yao Yao","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2407.03152v1.pdf","comment":"Accepted as an Oral Paper at ICML 2024. Draft info: 18 pages, 6\n Figure, 16 Tables"},{"id":"http://arxiv.org/abs/2208.10607v4","updated":"2024-07-03T14:23:16Z","published":"2022-08-22T21:26:57Z","title":"Individual Tree Detection in Large-Scale Urban Environments using\n High-Resolution Multispectral Imagery","summary":" We introduce a novel deep learning method for detection of individual trees\nin urban environments using high-resolution multispectral aerial imagery. We\nuse a convolutional neural network to regress a confidence map indicating the\nlocations of individual trees, which are localized using a peak finding\nalgorithm. Our method provides complete spatial coverage by detecting trees in\nboth public and private spaces, and can scale to very large areas. We performed\na thorough evaluation of our method, supported by a new dataset of over 1,500\nimages and almost 100,000 tree annotations, covering eight cities, six climate\nzones, and three image capture years. We trained our model on data from\nSouthern California, and achieved a precision of 73.6% and recall of 73.3%\nusing test data from this region. We generally observed similar precision and\nslightly lower recall when extrapolating to other California climate zones and\nimage capture dates. We used our method to produce a map of trees in the entire\nurban forest of California, and estimated the total number of urban trees in\nCalifornia to be about 43.5 million. Our study indicates the potential for deep\nlearning methods to support future urban forestry studies at unprecedented\nscales.\n","authors":["Jonathan Ventura","Camille Pawlak","Milo Honsberger","Cameron Gonsalves","Julian Rice","Natalie L. R. Love","Skyler Han","Viet Nguyen","Keilana Sugano","Jacqueline Doremus","G. Andrew Fricker","Jenn Yost","Matt Ritter"],"pdf_url":"https://arxiv.org/pdf/2208.10607v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03144v1","updated":"2024-07-03T14:22:51Z","published":"2024-07-03T14:22:51Z","title":"Venomancer: Towards Imperceptible and Target-on-Demand Backdoor Attacks\n in Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning approach that\nmaintains data privacy by training on decentralized data sources. Similar to\ncentralized machine learning, FL is also susceptible to backdoor attacks. Most\nbackdoor attacks in FL assume a predefined target class and require control\nover a large number of clients or knowledge of benign clients' information.\nFurthermore, they are not imperceptible and are easily detected by human\ninspection due to clear artifacts left on the poison data. To overcome these\nchallenges, we propose Venomancer, an effective backdoor attack that is\nimperceptible and allows target-on-demand. Specifically, imperceptibility is\nachieved by using a visual loss function to make the poison data visually\nindistinguishable from the original data. Target-on-demand property allows the\nattacker to choose arbitrary target classes via conditional adversarial\ntraining. 
Additionally, experiments showed that the method is robust against\nstate-of-the-art defenses such as Norm Clipping, Weak DP, Krum, and Multi-Krum.\nThe source code is available at\nhttps://anonymous.4open.science/r/Venomancer-3426.\n","authors":["Son Nguyen","Thinh Nguyen","Khoa Doan","Kok-Seng Wong"],"pdf_url":"https://arxiv.org/pdf/2407.03144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03140v1","updated":"2024-07-03T14:20:24Z","published":"2024-07-03T14:20:24Z","title":"Machine Learning Models for Improved Tracking from Range-Doppler Map\n Images","summary":" Statistical tracking filters depend on accurate target measurements and\nuncertainty estimates for good tracking performance. In this work, we propose\nnovel machine learning models for target detection and uncertainty estimation\nin range-Doppler map (RDM) images for Ground Moving Target Indicator (GMTI)\nradars. We show that by using the outputs of these models, we can significantly\nimprove the performance of a multiple hypothesis tracker for complex\nmulti-target air-to-ground tracking scenarios.\n","authors":["Elizabeth Hou","Ross Greenwood","Piyush Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.03140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03130v1","updated":"2024-07-03T14:12:43Z","published":"2024-07-03T14:12:43Z","title":"Towards Efficient Pixel Labeling for Industrial Anomaly Detection and\n Localization","summary":" In the realm of practical Anomaly Detection (AD) tasks, manual labeling of\nanomalous pixels proves to be a costly endeavor. Consequently, many AD methods\nare crafted as one-class classifiers, tailored for training sets completely\ndevoid of anomalies, ensuring a more cost-effective approach. While some\npioneering work has demonstrated heightened AD accuracy by incorporating real\nanomaly samples in training, this enhancement comes at the price of\nlabor-intensive labeling processes. This paper strikes the balance between AD\naccuracy and labeling expenses by introducing ADClick, a novel Interactive\nImage Segmentation (IIS) algorithm. ADClick efficiently generates\n\"ground-truth\" anomaly masks for real defective images, leveraging innovative\nresidual features and meticulously crafted language prompts. Notably, ADClick\nshowcases a significantly elevated generalization capacity compared to existing\nstate-of-the-art IIS approaches. Functioning as an anomaly labeling tool,\nADClick generates high-quality anomaly labels (AP $= 94.1\\%$ on MVTec AD) based\non only $3$ to $5$ manual click annotations per training image. Furthermore, we\nextend the capabilities of ADClick into ADClick-Seg, an enhanced model designed\nfor anomaly detection and localization. By fine-tuning the ADClick-Seg model\nusing the weak labels inferred by ADClick, we establish the state-of-the-art\nperformances in supervised AD tasks (AP $= 86.4\\%$ on MVTec AD and AP $=\n78.4\\%$, PRO $= 98.6\\%$ on KSDD2).\n","authors":["Hanxi Li","Jingqi Wu","Lin Yuanbo","Hao Chen","Deyin Liu","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2407.03130v1.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.07741v3","updated":"2024-07-03T14:08:22Z","published":"2024-06-11T21:55:20Z","title":"Back to the Color: Learning Depth to Specific Color Transformation for\n Unsupervised Depth Estimation","summary":" Virtual engines can generate dense depth maps for various synthetic scenes,\nmaking them invaluable for training depth estimation models. 
However,\ndiscrepancies between synthetic and real-world colors pose significant\nchallenges for depth estimation in real-world scenes, especially in complex and\nuncertain environments encountered in unsupervised monocular depth estimation\ntasks. To address this issue, we propose Back2Color, a framework that predicts\nrealistic colors from depth using a model trained on real-world data, thus\ntransforming synthetic colors into their real-world counterparts. Additionally,\nwe introduce the Syn-Real CutMix method for joint training with both real-world\nunsupervised and synthetic supervised depth samples, enhancing monocular depth\nestimation performance in real-world scenes. Furthermore, to mitigate the\nimpact of non-rigid motions on depth estimation, we present an auto-learning\nuncertainty temporal-spatial fusion method (Auto-UTSF), which leverages the\nstrengths of unsupervised learning in both temporal and spatial dimensions. We\nalso designed VADepth, based on the Vision Attention Network, which offers\nlower computational complexity and higher accuracy than transformers. Our\nBack2Color framework achieves state-of-the-art performance on the Kitti\ndataset, as evidenced by improvements in performance metrics and the production\nof fine-grained details. This is particularly evident on more challenging\ndatasets such as Cityscapes for unsupervised depth estimation.\n","authors":["Yufan Zhu","Chongzhi Ran","Mingtao Feng","Fangfang Wu","Le Dong","Weisheng Dong","Antonio M. López","Guangming Shi"],"pdf_url":"https://arxiv.org/pdf/2406.07741v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00219v2","updated":"2024-07-03T14:04:25Z","published":"2024-03-01T01:28:10Z","title":"Multi-modal Attribute Prompting for Vision-Language Models","summary":" Large pre-trained Vision-Language Models (VLMs), like CLIP, exhibit strong\ngeneralization ability to downstream tasks but struggle in few-shot scenarios.\nExisting prompting techniques primarily focus on global text and image\nrepresentations, yet overlooking multi-modal attribute characteristics. This\nlimitation hinders the model's ability to perceive fine-grained visual details\nand restricts its generalization ability to a broader range of unseen classes.\nTo address this issue, we propose a Multi-modal Attribute Prompting method\n(MAP) by jointly exploring textual attribute prompting, visual attribute\nprompting, and attribute-level alignment. The proposed MAP enjoys several\nmerits. First, we introduce learnable visual attribute prompts enhanced by\ntextual attribute semantics to adaptively capture visual attributes for images\nfrom unknown categories, boosting fine-grained visual perception capabilities\nfor CLIP. Second, the proposed attribute-level alignment complements the global\nalignment to enhance the robustness of cross-modal alignment for\nopen-vocabulary objects. 
To our knowledge, this is the first work to establish\ncross-modal attribute-level alignment for CLIP-based few-shot adaptation.\nExtensive experimental results on 11 datasets demonstrate that our method\nperforms favorably against state-of-the-art approaches.\n","authors":["Xin Liu","Jiamin Wu","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.00219v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04561v2","updated":"2024-07-03T14:01:52Z","published":"2023-09-08T19:27:01Z","title":"Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding","summary":" 3D visual grounding is the task of localizing the object in a 3D scene which\nis referred by a description in natural language. With a wide range of\napplications ranging from autonomous indoor robotics to AR/VR, the task has\nrecently risen in popularity. A common formulation to tackle 3D visual\ngrounding is grounding-by-detection, where localization is done via bounding\nboxes. However, for real-life applications that require physical interactions,\na bounding box insufficiently describes the geometry of an object. We therefore\ntackle the problem of dense 3D visual grounding, i.e. referral-based 3D\ninstance segmentation. We propose a dense 3D grounding network ConcreteNet,\nfeaturing four novel stand-alone modules that aim to improve grounding\nperformance for challenging repetitive instances, i.e. instances with\ndistractors of the same semantic class. First, we introduce a bottom-up\nattentive fusion module that aims to disambiguate inter-instance relational\ncues, next, we construct a contrastive training scheme to induce separation in\nthe latent space, we then resolve view-dependent utterances via a learned\nglobal camera token, and finally we employ multi-view ensembling to improve\nreferred mask quality. ConcreteNet ranks 1st on the challenging ScanRefer\nonline benchmark and has won the ICCV 3rd Workshop on Language for 3D Scenes\n\"3D Object Localization\" challenge.\n","authors":["Ozan Unal","Christos Sakaridis","Suman Saha","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2309.04561v2.pdf","comment":"Winner of the ICCV 2023 ScanRefer Challenge. Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.03115v1","updated":"2024-07-03T14:00:33Z","published":"2024-07-03T14:00:33Z","title":"$L_p$-norm Distortion-Efficient Adversarial Attack","summary":" Adversarial examples have shown a powerful ability to make a well-trained\nmodel misclassified. Current mainstream adversarial attack methods only\nconsider one of the distortions among $L_0$-norm, $L_2$-norm, and\n$L_\\infty$-norm. $L_0$-norm based methods cause large modification on a single\npixel, resulting in naked-eye visible detection, while $L_2$-norm and\n$L_\\infty$-norm based methods suffer from weak robustness against adversarial\ndefense since they always diffuse tiny perturbations to all pixels. A more\nrealistic adversarial perturbation should be sparse and imperceptible. In this\npaper, we propose a novel $L_p$-norm distortion-efficient adversarial attack,\nwhich not only owns the least $L_2$-norm loss but also significantly reduces\nthe $L_0$-norm distortion. To this aim, we design a new optimization scheme,\nwhich first optimizes an initial adversarial perturbation under $L_2$-norm\nconstraint, and then constructs a dimension unimportance matrix for the initial\nperturbation. Such a dimension unimportance matrix can indicate the adversarial\nunimportance of each dimension of the initial perturbation. 
Furthermore, we\nintroduce a new concept of adversarial threshold for the dimension unimportance\nmatrix. The dimensions of the initial perturbation whose unimportance is higher\nthan the threshold will be all set to zero, greatly decreasing the $L_0$-norm\ndistortion. Experimental results on three benchmark datasets show that under\nthe same query budget, the adversarial examples generated by our method have\nlower $L_0$-norm and $L_2$-norm distortion than the state-of-the-art.\nEspecially for the MNIST dataset, our attack reduces 8.1$\\%$ $L_2$-norm\ndistortion meanwhile remaining 47$\\%$ pixels unattacked. This demonstrates the\nsuperiority of the proposed method over its competitors in terms of adversarial\nrobustness and visual imperceptibility.\n","authors":["Chao Zhou","Yuan-Gen Wang","Zi-jia Wang","Xiangui Kang"],"pdf_url":"https://arxiv.org/pdf/2407.03115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02261v2","updated":"2024-07-03T13:57:47Z","published":"2024-07-02T13:38:06Z","title":"Federated Distillation for Medical Image Classification: Towards\n Trustworthy Computer-Aided Diagnosis","summary":" Medical image classification plays a crucial role in computer-aided clinical\ndiagnosis. While deep learning techniques have significantly enhanced\nefficiency and reduced costs, the privacy-sensitive nature of medical imaging\ndata complicates centralized storage and model training. Furthermore,\nlow-resource healthcare organizations face challenges related to communication\noverhead and efficiency due to increasing data and model scales. This paper\nproposes a novel privacy-preserving medical image classification framework\nbased on federated learning to address these issues, named FedMIC. The\nframework enables healthcare organizations to learn from both global and local\nknowledge, enhancing local representation of private data despite statistical\nheterogeneity. It provides customized models for organizations with diverse\ndata distributions while minimizing communication overhead and improving\nefficiency without compromising performance. Our FedMIC enhances robustness and\npractical applicability under resource-constrained conditions. We demonstrate\nFedMIC's effectiveness using four public medical image datasets for classical\nmedical image classification tasks.\n","authors":["Sufen Ren","Yule Hu","Shengchao Chen","Guanjun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02261v2.pdf","comment":"Work in progress. This paper is the first to introduce intra-client\n knowledge distillation in the context of trustworthy medical image\n classification. arXiv admin note: text overlap with arXiv:2401.01493"},{"id":"http://arxiv.org/abs/2407.03106v1","updated":"2024-07-03T13:44:20Z","published":"2024-07-03T13:44:20Z","title":"Anti-Collapse Loss for Deep Metric Learning Based on Coding Rate Metric","summary":" Deep metric learning (DML) aims to learn a discriminative high-dimensional\nembedding space for downstream tasks like classification, clustering, and\nretrieval. Prior literature predominantly focuses on pair-based and proxy-based\nmethods to maximize inter-class discrepancy and minimize intra-class diversity.\nHowever, these methods tend to suffer from the collapse of the embedding space\ndue to their over-reliance on label information. This leads to sub-optimal\nfeature representation and inferior model performance. To maintain the\nstructure of embedding space and avoid feature collapse, we propose a novel\nloss function called Anti-Collapse Loss. 
Specifically, our proposed loss\nprimarily draws inspiration from the principle of Maximal Coding Rate\nReduction. It promotes the sparseness of feature clusters in the embedding\nspace to prevent collapse by maximizing the average coding rate of sample\nfeatures or class proxies. Moreover, we integrate our proposed loss with\npair-based and proxy-based methods, resulting in notable performance\nimprovement. Comprehensive experiments on benchmark datasets demonstrate that\nour proposed method outperforms existing state-of-the-art methods. Extensive\nablation studies verify the effectiveness of our method in preventing embedding\nspace collapse and promoting generalization performance.\n","authors":["Xiruo Jiang","Yazhou Yao","Xili Dai","Fumin Shen","Xian-Sheng Hua","Heng-Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2407.03106v1.pdf","comment":"accepted by IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2407.03104v1","updated":"2024-07-03T13:41:44Z","published":"2024-07-03T13:41:44Z","title":"KeyVideoLLM: Towards Large-scale Video Keyframe Selection","summary":" Recently, with the rise of web videos, managing and understanding large-scale\nvideo datasets has become increasingly important. Video Large Language Models\n(VideoLLMs) have emerged in recent years due to their strong video\nunderstanding capabilities. However, training and inference processes for\nVideoLLMs demand vast amounts of data, presenting significant challenges to\ndata management, particularly regarding efficiency, robustness, and\neffectiveness. In this work, we present KeyVideoLLM, a text-video frame\nsimilarity-based keyframe selection method designed to manage VideoLLM data\nefficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a\nremarkable data compression rate of up to 60.9 times, substantially lowering\ndisk space requirements, which proves its high efficiency. Additionally, it\nmaintains a 100% selection success rate across all video formats and scales,\nenhances processing speed by up to 200 times compared to existing keyframe\nselection methods, and does not require hyperparameter tuning. Beyond its\noutstanding efficiency and robustness, KeyVideoLLM further improves model\nperformance in video question-answering tasks during both training and\ninference stages. Notably, it consistently achieved the state-of-the-art (SoTA)\nexperimental results on diverse datasets.\n","authors":["Hao Liang","Jiapeng Li","Tianyi Bai","Chong Chen","Conghui He","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05422v2","updated":"2024-07-03T13:37:00Z","published":"2024-05-08T20:46:36Z","title":"EarthMatch: Iterative Coregistration for Fine-grained Localization of\n Astronaut Photography","summary":" Precise, pixel-wise geolocalization of astronaut photography is critical to\nunlocking the potential of this unique type of remotely sensed Earth data,\nparticularly for its use in disaster management and climate change research.\nRecent works have established the Astronaut Photography Localization task, but\nhave either proved too costly for mass deployment or generated too coarse a\nlocalization. Thus, we present EarthMatch, an iterative homography estimation\nmethod that produces fine-grained localization of astronaut photographs while\nmaintaining an emphasis on speed. We refocus the astronaut photography\nbenchmark, AIMS, on the geolocalization task itself, and prove our method's\nefficacy on this dataset. 
In addition, we offer a new, fair method for image\nmatcher comparison, and an extensive evaluation of different matching models\nwithin our localization pipeline. Our method will enable fast and accurate\nlocalization of the 4.5 million and growing collection of astronaut photography\nof Earth. Webpage with code and data at\nhttps://earthloc-and-earthmatch.github.io\n","authors":["Gabriele Berton","Gabriele Goletto","Gabriele Trivigno","Alex Stoken","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2405.05422v2.pdf","comment":"CVPR 2024 IMW - webpage: https://earthloc-and-earthmatch.github.io"},{"id":"http://arxiv.org/abs/2407.02174v2","updated":"2024-07-03T13:17:05Z","published":"2024-07-02T11:28:22Z","title":"BeNeRF: Neural Radiance Fields from a Single Blurry Image and Event\n Stream","summary":" Neural implicit representation of visual scenes has attracted a lot of\nattention in recent research of computer vision and graphics. Most prior\nmethods focus on how to reconstruct 3D scene representation from a set of\nimages. In this work, we demonstrate the possibility to recover the neural\nradiance fields (NeRF) from a single blurry image and its corresponding event\nstream. We model the camera motion with a cubic B-Spline in SE(3) space. Both\nthe blurry image and the brightness change within a time interval, can then be\nsynthesized from the 3D scene representation given the 6-DoF poses interpolated\nfrom the cubic B-Spline. Our method can jointly learn both the implicit neural\nscene representation and recover the camera motion by minimizing the\ndifferences between the synthesized data and the real measurements without\npre-computed camera poses from COLMAP. We evaluate the proposed method with\nboth synthetic and real datasets. The experimental results demonstrate that we\nare able to render view-consistent latent sharp images from the learned NeRF\nand bring a blurry image alive in high quality. Code and data are available at\nhttps://github.com/WU-CVGL/BeNeRF.\n","authors":["Wenpu Li","Pian Wan","Peng Wang","Jinghang Li","Yi Zhou","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02174v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2403.08857v2","updated":"2024-07-03T13:09:53Z","published":"2024-03-13T18:00:01Z","title":"DialogGen: Multi-modal Interactive Dialogue System for Multi-turn\n Text-to-Image Generation","summary":" Text-to-image (T2I) generation models have significantly advanced in recent\nyears. However, effective interaction with these models is challenging for\naverage users due to the need for specialized prompt engineering knowledge and\nthe inability to perform multi-turn image generation, hindering a dynamic and\niterative creation process. Recent attempts have tried to equip Multi-modal\nLarge Language Models (MLLMs) with T2I models to bring the user's natural\nlanguage instructions into reality. Hence, the output modality of MLLMs is\nextended, and the multi-turn generation quality of T2I models is enhanced\nthanks to the strong multi-modal comprehension ability of MLLMs. However, many\nof these works face challenges in identifying correct output modalities and\ngenerating coherent images accordingly as the number of output modalities\nincreases and the conversations go deeper. Therefore, we propose DialogGen, an\neffective pipeline to align off-the-shelf MLLMs and T2I models to build a\nMulti-modal Interactive Dialogue System (MIDS) for multi-turn Text-to-Image\ngeneration. 
It is composed of drawing prompt alignment, careful training data\ncuration, and error correction. Moreover, as the field of MIDS flourishes,\ncomprehensive benchmarks are urgently needed to evaluate MIDS fairly in terms\nof output modality correctness and multi-modal output coherence. To address\nthis issue, we introduce the Multi-modal Dialogue Benchmark (DialogBen), a\ncomprehensive bilingual benchmark designed to assess the ability of MLLMs to\ngenerate accurate and coherent multi-modal content that supports image editing.\nIt contains two evaluation metrics to measure the model's ability to switch\nmodalities and the coherence of the output images. Our extensive experiments on\nDialogBen and user study demonstrate the effectiveness of DialogGen compared\nwith other State-of-the-Art models.\n","authors":["Minbin Huang","Yanxin Long","Xinchi Deng","Ruihang Chu","Jiangfeng Xiong","Xiaodan Liang","Hong Cheng","Qinglin Lu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.08857v2.pdf","comment":"Project page: https://hunyuan-dialoggen.github.io/"},{"id":"http://arxiv.org/abs/2407.01996v2","updated":"2024-07-03T12:42:47Z","published":"2024-07-02T07:10:10Z","title":"ViG-Bias: Visually Grounded Bias Discovery and Mitigation","summary":" The proliferation of machine learning models in critical decision making\nprocesses has underscored the need for bias discovery and mitigation\nstrategies. Identifying the reasons behind a biased system is not\nstraightforward, since in many occasions they are associated with hidden\nspurious correlations which are not easy to spot. Standard approaches rely on\nbias audits performed by analyzing model performance in pre-defined subgroups\nof data samples, usually characterized by common attributes like gender or\nethnicity when it comes to people, or other specific attributes defining\nsemantically coherent groups of images. However, it is not always possible to\nknow a-priori the specific attributes defining the failure modes of visual\nrecognition systems. Recent approaches propose to discover these groups by\nleveraging large vision language models, which enable the extraction of\ncross-modal embeddings and the generation of textual descriptions to\ncharacterize the subgroups where a certain model is underperforming. In this\nwork, we argue that incorporating visual explanations (e.g. heatmaps generated\nvia GradCAM or other approaches) can boost the performance of such bias\ndiscovery and mitigation frameworks. To this end, we introduce Visually\nGrounded Bias Discovery and Mitigation (ViG-Bias), a simple yet effective\ntechnique which can be integrated to a variety of existing frameworks to\nimprove both, discovery and mitigation performance. 
Our comprehensive\nevaluation shows that incorporating visual explanations enhances existing\ntechniques like DOMINO, FACTS and Bias-to-Text, across several challenging\ndatasets, including CelebA, Waterbirds, and NICO++.\n","authors":["Badr-Eddine Marani","Mohamed Hanini","Nihitha Malayarukil","Stergios Christodoulidis","Maria Vakalopoulou","Enzo Ferrante"],"pdf_url":"https://arxiv.org/pdf/2407.01996v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.03056v1","updated":"2024-07-03T12:24:40Z","published":"2024-07-03T12:24:40Z","title":"Improving Zero-shot Generalization of Learned Prompts via Unsupervised\n Knowledge Distillation","summary":" Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization\nto unseen tasks, but fall short of the performance of supervised methods in\ngeneralizing to downstream tasks with limited data. Prompt learning is emerging\nas a parameter-efficient method for adapting VLMs, but state-of-the-art\napproaches require annotated samples. In this paper we propose a novel approach\nto prompt learning based on unsupervised knowledge distillation from more\npowerful models. Our approach, which we call Knowledge Distillation Prompt\nLearning (KDPL), can be integrated into existing prompt learning techniques and\neliminates the need for labeled examples during adaptation. Our experiments on\nmore than ten standard benchmark datasets demonstrate that KDPL is very\neffective at improving generalization of learned prompts for zero-shot domain\ngeneralization, zero-shot cross-dataset generalization, and zero-shot\nbase-to-novel class generalization problems. KDPL requires no ground-truth\nlabels for adaptation, and moreover we show that even in the absence of any\nknowledge of training class names it can be used to effectively transfer\nknowledge. The code is publicly available at https://github.com/miccunifi/KDPL.\n","authors":["Marco Mistretta","Alberto Baldrati","Marco Bertini","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2407.03056v1.pdf","comment":"Accepted for publication at ECCV24"},{"id":"http://arxiv.org/abs/2407.03043v1","updated":"2024-07-03T12:07:36Z","published":"2024-07-03T12:07:36Z","title":"SlerpFace: Face Template Protection via Spherical Linear Interpolation","summary":" Contemporary face recognition systems use feature templates extracted from\nface images to identify persons. To enhance privacy, face template protection\ntechniques are widely employed to conceal sensitive identity and appearance\ninformation stored in the template. This paper identifies an emerging privacy\nattack form utilizing diffusion models that could nullify prior protection,\nreferred to as inversion attacks. The attack can synthesize high-quality,\nidentity-preserving face images from templates, revealing persons' appearance.\nBased on studies of the diffusion model's generative capability, this paper\nproposes a defense to deteriorate the attack, by rotating templates to a\nnoise-like distribution. This is achieved efficiently by spherically and\nlinearly interpolating templates, or slerp, on their located hypersphere. This\npaper further proposes to group-wisely divide and drop out templates' feature\ndimensions, to enhance the irreversibility of rotated templates. The division\nof groups and dropouts within each group are learned in a recognition-favored\nway. The proposed techniques are concretized as a novel face template\nprotection technique, SlerpFace. 
Extensive experiments show that SlerpFace\nprovides satisfactory recognition accuracy and comprehensive privacy protection\nagainst inversion and other attack forms, superior to prior arts.\n","authors":["Zhizhou Zhong","Yuxi Mi","Yuge Huang","Jianqing Xu","Guodong Mu","Shouhong Ding","Jingyun Zhang","Rizen Guo","Yunsheng Wu","Shuigeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.03043v1.pdf","comment":"face template protection"},{"id":"http://arxiv.org/abs/2407.03041v1","updated":"2024-07-03T12:06:34Z","published":"2024-07-03T12:06:34Z","title":"Position and Altitude of the Nao Camera Head from Two Points on the\n Soccer Field plus the Gravitational Direction","summary":" To be able to play soccer, a robot needs a good estimate of its current\nposition on the field. Ideally, multiple features are visible that have known\nlocations. By applying trigonometry we can estimate the viewpoint from where\nthis observation was actually made. Given that the Nao robots of the Standard\nPlatform League have quite a limited field of view, a given camera frame\ntypically only allows for one or two points to be recognized.\n In this paper we propose a method for determining the (x, y) coordinates on\nthe field and the height h of the camera from the geometry of a simplified\ntetrahedron. This configuration is formed by two observed points on the ground\nplane plus the gravitational direction. When the distance between the two\npoints is known, and the directions to the points plus the gravitational\ndirection are measured, all dimensions of the tetrahedron can be determined.\n By performing these calculations with rational trigonometry instead of\nclassical trigonometry, the computations turn out to be 28.7% faster, with\nequal numerical accuracy. The position of the head of the Nao can also be\nexternally measured with the OptiTrack system. The difference between\nexternally measured and internally predicted position from sensor data gives us\nmean absolute errors in the 3-6 centimeters range, when we estimated the\ngravitational direction from the vanishing point of the outer edges of the goal\nposts.\n","authors":["Stijn Oomes","Arnoud Visser"],"pdf_url":"https://arxiv.org/pdf/2407.03041v1.pdf","comment":"to be published in the Proceedings of the RoboCup 2024 symposium - 12\n pages"},{"id":"http://arxiv.org/abs/2308.13900v2","updated":"2024-07-03T11:58:22Z","published":"2023-08-26T15:02:00Z","title":"Semi-Supervised Semantic Segmentation via Marginal Contextual\n Information","summary":" We present a novel confidence refinement scheme that enhances pseudo labels\nin semi-supervised semantic segmentation. Unlike existing methods, which filter\npixels with low-confidence predictions in isolation, our approach leverages the\nspatial correlation of labels in segmentation maps by grouping neighboring\npixels and considering their pseudo labels collectively. With this contextual\ninformation, our method, named S4MC, increases the amount of unlabeled data\nused during training while maintaining the quality of the pseudo labels, all\nwith negligible computational overhead. Through extensive experiments on\nstandard benchmarks, we demonstrate that S4MC outperforms existing\nstate-of-the-art semi-supervised learning approaches, offering a promising\nsolution for reducing the cost of acquiring dense annotations. For example,\nS4MC achieves a 1.39 mIoU improvement over the prior art on PASCAL VOC 12 with\n366 annotated images. 
The code to reproduce our experiments is available at\nhttps://s4mcontext.github.io/\n","authors":["Moshe Kimhi","Shai Kimhi","Evgenii Zheltonozhskii","Or Litany","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2308.13900v2.pdf","comment":"Published at TMLR"},{"id":"http://arxiv.org/abs/2407.03036v1","updated":"2024-07-03T11:56:55Z","published":"2024-07-03T11:56:55Z","title":"SAFT: Towards Out-of-Distribution Generalization in Fine-Tuning","summary":" Handling distribution shifts from training data, known as out-of-distribution\n(OOD) generalization, poses a significant challenge in the field of machine\nlearning. While a pre-trained vision-language model like CLIP has demonstrated\nremarkable zero-shot performance, further adaptation of the model to downstream\ntasks leads to undesirable degradation for OOD data. In this work, we introduce\nSparse Adaptation for Fine-Tuning (SAFT), a method that prevents fine-tuning\nfrom forgetting the general knowledge in the pre-trained model. SAFT only\nupdates a small subset of important parameters whose gradient magnitude is\nlarge, while keeping the other parameters frozen. SAFT is straightforward to\nimplement and conceptually simple. Extensive experiments show that with only\n0.1% of the model parameters, SAFT can significantly improve the performance of\nCLIP. It consistently outperforms baseline methods across several benchmarks.\nOn the few-shot learning benchmark of ImageNet and its variants, SAFT gives a\ngain of 5.15% on average over the conventional fine-tuning method in OOD\nsettings.\n","authors":["Bac Nguyen","Stefan Uhlich","Fabien Cardinaux","Lukas Mauch","Marzieh Edraki","Aaron Courville"],"pdf_url":"https://arxiv.org/pdf/2407.03036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03033v1","updated":"2024-07-03T11:54:17Z","published":"2024-07-03T11:54:17Z","title":"ISWSST: Index-space-wave State Superposition Transformers for\n Multispectral Remotely Sensed Imagery Semantic Segmentation","summary":" Currently the semantic segmentation task of multispectral remotely sensed\nimagery (MSRSI) faces the following problems: 1) Usually, only single domain\nfeature (i.e., space domain or frequency domain) is considered; 2) downsampling\noperation in encoder generally leads to the accuracy loss of edge extraction;\n3) multichannel features of MSRSI are not fully considered; and 4) prior\nknowledge of remote sensing is not fully utilized. To solve the aforementioned\nissues, an index-space-wave state superposition Transformer (ISWSST) is the\nfirst to be proposed for MSRSI semantic segmentation by the inspiration from\nquantum mechanics, whose superiority is as follows: 1) index, space and wave\nstates are superposed or fused to simulate quantum superposition by adaptively\nvoting decision (i.e., ensemble learning idea) for being a stronger classifier\nand improving the segmentation accuracy; 2) a lossless wavelet pyramid\nencoder-decoder module is designed to losslessly reconstruct image and simulate\nquantum entanglement based on wavelet transform and inverse wavelet transform\nfor avoiding the edge extraction loss; 3) combining multispectral features\n(i.e. remote sensing index and channel attention mechanism) is proposed to\naccurately extract ground objects from original resolution images; and 4)\nquantum mechanics are introduced to interpret the underlying superiority of\nISWSST. 
Experiments show that ISWSST is validated and superior to the\nstate-of-the-art architectures for the MSRSI segmentation task, which improves\nthe segmentation and edge extraction accuracy effectively. Codes will be\navailable publicly after our paper is accepted.\n","authors":["Chang Li","Pengfei Zhang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01209v2","updated":"2024-07-03T11:49:33Z","published":"2023-10-02T13:53:55Z","title":"Self-distilled Masked Attention guided masked image modeling with noise\n Regularized Teacher (SMART) for medical image analysis","summary":" Pretraining vision transformers (ViT) with attention guided masked image\nmodeling (MIM) has shown to increase downstream accuracy for natural image\nanalysis. Hierarchical shifted window (Swin) transformer, often used in medical\nimage analysis cannot use attention guided masking as it lacks an explicit\n[CLS] token, needed for computing attention maps for selective masking. We thus\nenhanced Swin with semantic class attention. We developed a co-distilled Swin\ntransformer that combines a noisy momentum updated teacher to guide selective\nmasking for MIM. Our approach called \\textsc{s}e\\textsc{m}antic\n\\textsc{a}ttention guided co-distillation with noisy teacher\n\\textsc{r}egularized Swin \\textsc{T}rans\\textsc{F}ormer (SMARTFormer) was\napplied for analyzing 3D computed tomography datasets with lung nodules and\nmalignant lung cancers (LC). We also analyzed the impact of semantic attention\nand noisy teacher on pretraining and downstream accuracy. SMARTFormer\nclassified lesions (malignant from benign) with a high accuracy of 0.895 of\n1000 nodules, predicted LC treatment response with accuracy of 0.74, and\nachieved high accuracies even in limited data regimes. Pretraining with\nsemantic attention and noisy teacher improved ability to distinguish\nsemantically meaningful structures such as organs in a unsupervised clustering\ntask and localize abnormal structures like tumors. Code, models will be made\navailable through GitHub upon paper acceptance.\n","authors":["Jue Jiang","Aneesh Rangnekar","Chloe Min Seo Choi","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2310.01209v2.pdf","comment":"Paper is under review at TMI"},{"id":"http://arxiv.org/abs/2406.19081v2","updated":"2024-07-03T11:34:39Z","published":"2024-06-27T11:08:42Z","title":"Unsupervised Latent Stain Adaptation for Computational Pathology","summary":" In computational pathology, deep learning (DL) models for tasks such as\nsegmentation or tissue classification are known to suffer from domain shifts\ndue to different staining techniques. Stain adaptation aims to reduce the\ngeneralization error between different stains by training a model on source\nstains that generalizes to target stains. Despite the abundance of target stain\ndata, a key challenge is the lack of annotations. To address this, we propose a\njoint training between artificially labeled and unlabeled data including all\navailable stained images called Unsupervised Latent Stain Adaptation (ULSA).\nOur method uses stain translation to enrich labeled source images with\nsynthetic target images in order to increase the supervised signals. Moreover,\nwe leverage unlabeled target stain images using stain-invariant feature\nconsistency learning. 
With ULSA we present a semi-supervised strategy for\nefficient stain adaptation without access to annotated target stain data.\nRemarkably, ULSA is task agnostic in patch-level analysis for whole slide\nimages (WSIs). Through extensive evaluation on external datasets, we\ndemonstrate that ULSA achieves state-of-the-art (SOTA) performance in kidney\ntissue segmentation and breast cancer classification across a spectrum of\nstaining variations. Our findings suggest that ULSA is an important framework\nfor stain adaptation in computational pathology.\n","authors":["Daniel Reisenbüchler","Lucas Luttner","Nadine S. Schaadt","Friedrich Feuerhake","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2406.19081v2.pdf","comment":"Accepted MICCAI2024"},{"id":"http://arxiv.org/abs/2407.03018v1","updated":"2024-07-03T11:26:09Z","published":"2024-07-03T11:26:09Z","title":"An Organism Starts with a Single Pix-Cell: A Neural Cellular Diffusion\n for High-Resolution Image Synthesis","summary":" Generative modeling seeks to approximate the statistical properties of real\ndata, enabling synthesis of new data that closely resembles the original\ndistribution. Generative Adversarial Networks (GANs) and Denoising Diffusion\nProbabilistic Models (DDPMs) represent significant advancements in generative\nmodeling, drawing inspiration from game theory and thermodynamics,\nrespectively. Nevertheless, the exploration of generative modeling through the\nlens of biological evolution remains largely untapped. In this paper, we\nintroduce a novel family of models termed Generative Cellular Automata (GeCA),\ninspired by the evolution of an organism from a single cell. GeCAs are\nevaluated as an effective augmentation tool for retinal disease classification\nacross two imaging modalities: Fundus and Optical Coherence Tomography (OCT).\nIn the context of OCT imaging, where data is scarce and the distribution of\nclasses is inherently skewed, GeCA significantly boosts the performance of 11\ndifferent ophthalmological conditions, achieving a 12% increase in the average\nF1 score compared to conventional baselines. GeCAs outperform both diffusion\nmethods that incorporate UNet or state-of-the art variants with\ntransformer-based denoising models, under similar parameter constraints. Code\nis available at: https://github.com/xmed-lab/GeCA.\n","authors":["Marawan Elbatel","Konstantinos Kamnitsas","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2407.03018v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.03010v1","updated":"2024-07-03T11:11:16Z","published":"2024-07-03T11:11:16Z","title":"Context-Aware Video Instance Segmentation","summary":" In this paper, we introduce the Context-Aware Video Instance Segmentation\n(CAVIS), a novel framework designed to enhance instance association by\nintegrating contextual information adjacent to each object. To efficiently\nextract and leverage this information, we propose the Context-Aware Instance\nTracker (CAIT), which merges contextual data surrounding the instances with the\ncore instance features to improve tracking accuracy. Additionally, we introduce\nthe Prototypical Cross-frame Contrastive (PCC) loss, which ensures consistency\nin object-level features across frames, thereby significantly enhancing\ninstance matching accuracy. CAVIS demonstrates superior performance over\nstate-of-the-art methods on all benchmark datasets in video instance\nsegmentation (VIS) and video panoptic segmentation (VPS). 
Notably, our method\nexcels on the OVIS dataset, which is known for its particularly challenging\nvideos.\n","authors":["Seunghun Lee","Jiwan Seo","Kiljoon Han","Minwoo Choi","Sunghoon Im"],"pdf_url":"https://arxiv.org/pdf/2407.03010v1.pdf","comment":"Project page: https://seung-hun-lee.github.io/projects/CAVIS/"},{"id":"http://arxiv.org/abs/2407.03009v1","updated":"2024-07-03T11:10:23Z","published":"2024-07-03T11:10:23Z","title":"Model Guidance via Explanations Turns Image Classifiers into\n Segmentation Models","summary":" Heatmaps generated on inputs of image classification networks via explainable\nAI methods like Grad-CAM and LRP have been observed to resemble segmentations\nof input images in many cases. Consequently, heatmaps have also been leveraged\nfor achieving weakly supervised segmentation with image-level supervision. On\nthe other hand, losses can be imposed on differentiable heatmaps, which has\nbeen shown to serve for (1)~improving heatmaps to be more human-interpretable,\n(2)~regularization of networks towards better generalization, (3)~training\ndiverse ensembles of networks, and (4)~for explicitly ignoring confounding\ninput features. Due to the latter use case, the paradigm of imposing losses on\nheatmaps is often referred to as \"Right for the right reasons\". We unify these\ntwo lines of research by investigating semi-supervised segmentation as a novel\nuse case for the Right for the Right Reasons paradigm. First, we show formal\nparallels between differentiable heatmap architectures and standard\nencoder-decoder architectures for image segmentation. Second, we show that such\ndifferentiable heatmap architectures yield competitive results when trained\nwith standard segmentation losses. Third, we show that such architectures allow\nfor training with weak supervision in the form of image-level labels and small\nnumbers of pixel-level labels, outperforming comparable encoder-decoder models.\nCode is available at \\url{https://github.com/Kainmueller-Lab/TW-autoencoder}.\n","authors":["Xiaoyan Yu","Jannik Franzen","Wojciech Samek","Marina M. -C. Höhne","Dagmar Kainmueller"],"pdf_url":"https://arxiv.org/pdf/2407.03009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03008v1","updated":"2024-07-03T11:07:21Z","published":"2024-07-03T11:07:21Z","title":"Align and Aggregate: Compositional Reasoning with Video Alignment and\n Answer Aggregation for Video Question-Answering","summary":" Despite the recent progress made in Video Question-Answering (VideoQA), these\nmethods typically function as black-boxes, making it difficult to understand\ntheir reasoning processes and perform consistent compositional reasoning. To\naddress these challenges, we propose a \\textit{model-agnostic} Video Alignment\nand Answer Aggregation (VA$^{3}$) framework, which is capable of enhancing both\ncompositional consistency and accuracy of existing VidQA methods by integrating\nvideo aligner and answer aggregator modules. The video aligner hierarchically\nselects the relevant video clips based on the question, while the answer\naggregator deduces the answer to the question based on its sub-questions, with\ncompositional consistency ensured by the information flow along question\ndecomposition graph and the contrastive learning strategy. We evaluate our\nframework on three settings of the AGQA-Decomp dataset with three baseline\nmethods, and propose new metrics to measure the compositional consistency of\nVidQA methods more comprehensively. 
Moreover, we propose a large language model\n(LLM) based automatic question decomposition pipeline to apply our framework to\nany VidQA dataset. We extend MSVD and NExT-QA datasets with it to evaluate our\nVA$^3$ framework on broader scenarios. Extensive experiments show that our\nframework improves both compositional consistency and accuracy of existing\nmethods, leading to more interpretable real-world VidQA models.\n","authors":["Zhaohe Liao","Jiangtong Li","Li Niu","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03008v1.pdf","comment":"10 pages,CVPR"},{"id":"http://arxiv.org/abs/2407.03006v1","updated":"2024-07-03T11:05:19Z","published":"2024-07-03T11:05:19Z","title":"Frequency-Controlled Diffusion Model for Versatile Text-Guided\n Image-to-Image Translation","summary":" Recently, large-scale text-to-image (T2I) diffusion models have emerged as a\npowerful tool for image-to-image translation (I2I), allowing open-domain image\ntranslation via user-provided text prompts. This paper proposes\nfrequency-controlled diffusion model (FCDiffusion), an end-to-end\ndiffusion-based framework that contributes a novel solution to text-guided I2I\nfrom a frequency-domain perspective. At the heart of our framework is a\nfeature-space frequency-domain filtering module based on Discrete Cosine\nTransform, which filters the latent features of the source image in the DCT\ndomain, yielding filtered image features bearing different DCT spectral bands\nas different control signals to the pre-trained Latent Diffusion Model. We\nreveal that control signals of different DCT spectral bands bridge the source\nimage and the T2I generated image in different correlations (e.g., style,\nstructure, layout, contour, etc.), and thus enable versatile I2I applications\nemphasizing different I2I correlations, including style-guided content\ncreation, image semantic manipulation, image scene translation, and image style\ntranslation. Different from related approaches, FCDiffusion establishes a\nunified text-guided I2I framework suitable for diverse image translation tasks\nsimply by switching among different frequency control branches at inference\ntime. The effectiveness and superiority of our method for text-guided I2I are\ndemonstrated with extensive experiments both qualitatively and quantitatively.\nThe code is publicly available at: https://github.com/XiangGao1102/FCDiffusion.\n","authors":["Xiang Gao","Zhengbo Xu","Junhan Zhao","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2407.03006v1.pdf","comment":"Proceedings of the 38th AAAI Conference on Artificial Intelligence\n (AAAI 2024)"},{"id":"http://arxiv.org/abs/2407.03000v1","updated":"2024-07-03T10:59:06Z","published":"2024-07-03T10:59:06Z","title":"VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values","summary":" This paper introduces VIVA, a benchmark for VIsion-grounded decision-making\ndriven by human VAlues. While most large vision-language models (VLMs) focus on\nphysical-level skills, our work is the first to examine their multimodal\ncapabilities in leveraging human values to make decisions under a\nvision-depicted situation. VIVA contains 1,062 images depicting diverse\nreal-world situations and the manually annotated decisions grounded in them.\nGiven an image there, the model should select the most appropriate action to\naddress the situation and provide the relevant human values and reason\nunderlying the decision. 
Extensive experiments based on VIVA show the\nlimitation of VLMs in using human values to make multimodal decisions. Further\nanalyses indicate the potential benefits of exploiting action consequences and\npredicted human values.\n","authors":["Zhe Hu","Yixiao Ren","Jing Li","Yu Yin"],"pdf_url":"https://arxiv.org/pdf/2407.03000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00285v2","updated":"2024-07-03T10:51:56Z","published":"2023-11-01T04:36:18Z","title":"Mixture-of-Experts for Open Set Domain Adaptation: A Dual-Space\n Detection Approach","summary":" Open Set Domain Adaptation (OSDA) aims to cope with the distribution and\nlabel shifts between the source and target domains simultaneously, performing\naccurate classification for known classes while identifying unknown class\nsamples in the target domain. Most existing OSDA approaches, depending on the\nfinal image feature space of deep models, require manually-tuned thresholds,\nand may easily misclassify unknown samples as known classes. Mixture-of-Experts\n(MoE) could be a remedy. Within a MoE, different experts handle distinct input\nfeatures, producing unique expert routing patterns for various classes in a\nrouting feature space. As a result, unknown class samples may display different\nexpert routing patterns to known classes. In this paper, we propose Dual-Space\nDetection, which exploits the inconsistencies between the image feature space\nand the routing feature space to detect unknown class samples without any\nthreshold. Graph Router is further introduced to better make use of the spatial\ninformation among image patches. Experiments on three different datasets\nvalidated the effectiveness and superiority of our approach.\n","authors":["Zhenbang Du","Jiayu An","Yunlu Tu","Jiahao Hong","Dongrui Wu"],"pdf_url":"https://arxiv.org/pdf/2311.00285v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.02990v1","updated":"2024-07-03T10:42:09Z","published":"2024-07-03T10:42:09Z","title":"Graph and Skipped Transformer: Exploiting Spatial and Temporal Modeling\n Capacities for Efficient 3D Human Pose Estimation","summary":" In recent years, 2D-to-3D pose uplifting in monocular 3D Human Pose\nEstimation (HPE) has attracted widespread research interest. GNN-based methods\nand Transformer-based methods have become mainstream architectures due to their\nadvanced spatial and temporal feature learning capacities. However, existing\napproaches typically construct joint-wise and frame-wise attention alignments\nin spatial and temporal domains, resulting in dense connections that introduce\nconsiderable local redundancy and computational overhead. In this paper, we\ntake a global approach to exploit spatio-temporal information and realise\nefficient 3D HPE with a concise Graph and Skipped Transformer architecture.\nSpecifically, in Spatial Encoding stage, coarse-grained body parts are deployed\nto construct Spatial Graph Network with a fully data-driven adaptive topology,\nensuring model flexibility and generalizability across various poses. In\nTemporal Encoding and Decoding stages, a simple yet effective Skipped\nTransformer is proposed to capture long-range temporal dependencies and\nimplement hierarchical feature aggregation. A straightforward Data Rolling\nstrategy is also developed to introduce dynamic information into 2D pose\nsequence. 
Extensive experiments are conducted on Human3.6M, MPI-INF-3DHP and\nHuman-Eva benchmarks. G-SFormer series methods achieve superior performances\ncompared with previous state-of-the-arts with only around ten percent of\nparameters and significantly reduced computational complexity. Additionally,\nG-SFormer also exhibits outstanding robustness to inaccuracies in detected 2D\nposes.\n","authors":["Mengmeng Cui","Kunbo Zhang","Zhenan Sun"],"pdf_url":"https://arxiv.org/pdf/2407.02990v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.02988v1","updated":"2024-07-03T10:40:20Z","published":"2024-07-03T10:40:20Z","title":"YOLOv5, YOLOv8 and YOLOv10: The Go-To Detectors for Real-time Vision","summary":" This paper presents a comprehensive review of the evolution of the YOLO (You\nOnly Look Once) object detection algorithm, focusing on YOLOv5, YOLOv8, and\nYOLOv10. We analyze the architectural advancements, performance improvements,\nand suitability for edge deployment across these versions. YOLOv5 introduced\nsignificant innovations such as the CSPDarknet backbone and Mosaic\nAugmentation, balancing speed and accuracy. YOLOv8 built upon this foundation\nwith enhanced feature extraction and anchor-free detection, improving\nversatility and performance. YOLOv10 represents a leap forward with NMS-free\ntraining, spatial-channel decoupled downsampling, and large-kernel\nconvolutions, achieving state-of-the-art performance with reduced computational\noverhead. Our findings highlight the progressive enhancements in accuracy,\nefficiency, and real-time performance, particularly emphasizing their\napplicability in resource-constrained environments. This review provides\ninsights into the trade-offs between model complexity and detection accuracy,\noffering guidance for selecting the most appropriate YOLO version for specific\nedge computing applications.\n","authors":["Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2407.02988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02974v1","updated":"2024-07-03T10:14:33Z","published":"2024-07-03T10:14:33Z","title":"IM-MoCo: Self-supervised MRI Motion Correction using Motion-Guided\n Implicit Neural Representations","summary":" Motion artifacts in Magnetic Resonance Imaging (MRI) arise due to relatively\nlong acquisition times and can compromise the clinical utility of acquired\nimages. Traditional motion correction methods often fail to address severe\nmotion, leading to distorted and unreliable results. Deep Learning (DL)\nalleviated such pitfalls through generalization with the cost of vanishing\nstructures and hallucinations, making it challenging to apply in the medical\nfield where hallucinated structures can tremendously impact the diagnostic\noutcome. In this work, we present an instance-wise motion correction pipeline\nthat leverages motion-guided Implicit Neural Representations (INRs) to mitigate\nthe impact of motion artifacts while retaining anatomical structure. Our method\nis evaluated using the NYU fastMRI dataset with different degrees of simulated\nmotion severity. For the correction alone, we can improve over state-of-the-art\nimage reconstruction methods by $+5\\%$ SSIM, $+5\\:db$ PSNR, and $+14\\%$\nHaarPSI. 
Clinical relevance is demonstrated by a subsequent experiment, where\nour method improves classification outcomes by at least $+1.5$ accuracy\npercentage points compared to motion-corrupted images.\n","authors":["Ziad Al-Haj Hemidi","Christian Weihsbach","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2407.02974v1.pdf","comment":"Submitted to MICCAI 2024 (Before peer review version)"},{"id":"http://arxiv.org/abs/2407.02968v1","updated":"2024-07-03T10:04:48Z","published":"2024-07-03T10:04:48Z","title":"Unified Anomaly Detection methods on Edge Device using Knowledge\n Distillation and Quantization","summary":" With the rapid advances in deep learning and smart manufacturing in Industry\n4.0, there is an imperative for high-throughput, high-performance, and fully\nintegrated visual inspection systems. Most anomaly detection approaches using\ndefect detection datasets, such as MVTec AD, employ one-class models that\nrequire fitting separate models for each class. On the contrary, unified models\neliminate the need for fitting separate models for each class and significantly\nreduce cost and memory requirements. Thus, in this work, we experiment with\nconsidering a unified multi-class setup. Our experimental study shows that\nmulti-class models perform at par with one-class models for the standard MVTec\nAD dataset. Hence, this indicates that there may not be a need to learn\nseparate object/class-wise models when the object classes are significantly\ndifferent from each other, as is the case of the dataset considered.\nFurthermore, we have deployed three different unified lightweight architectures\non the CPU and an edge device (NVIDIA Jetson Xavier NX). We analyze the\nquantized multi-class anomaly detection models in terms of latency and memory\nrequirements for deployment on the edge device while comparing\nquantization-aware training (QAT) and post-training quantization (PTQ) for\nperformance at different precision widths. In addition, we explored two\ndifferent methods of calibration required in post-training scenarios and show\nthat one of them performs notably better, highlighting its importance for\nunsupervised tasks. Due to quantization, the performance drop in PTQ is further\ncompensated by QAT, which yields at par performance with the original 32-bit\nFloating point in two of the models considered.\n","authors":["Sushovan Jena","Arya Pulkit","Kajal Singh","Anoushka Banerjee","Sharad Joshi","Ananth Ganesh","Dinesh Singh","Arnav Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2407.02968v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2407.02159v2","updated":"2024-07-03T10:00:34Z","published":"2024-07-02T11:08:51Z","title":"SparseSSP: 3D Subcellular Structure Prediction from Sparse-View\n Transmitted Light Images","summary":" Traditional fluorescence staining is phototoxic to live cells, slow, and\nexpensive; thus, the subcellular structure prediction (SSP) from transmitted\nlight (TL) images is emerging as a label-free, faster, low-cost alternative.\nHowever, existing approaches utilize 3D networks for one-to-one voxel level\ndense prediction, which necessitates a frequent and time-consuming Z-axis\nimaging process. Moreover, 3D convolutions inevitably lead to significant\ncomputation and GPU memory overhead. Therefore, we propose an efficient\nframework, SparseSSP, predicting fluorescent intensities within the target\nvoxel grid in an efficient paradigm instead of relying entirely on 3D\ntopologies. 
In particular, SparseSSP makes two pivotal improvements to prior\nworks. First, SparseSSP introduces a one-to-many voxel mapping paradigm, which\npermits the sparse TL slices to reconstruct the subcellular structure.\nSecondly, we propose a hybrid dimensions topology, which folds the Z-axis\ninformation into channel features, enabling the 2D network layers to tackle SSP\nunder low computational cost. We conduct extensive experiments to validate the\neffectiveness and advantages of SparseSSP on diverse sparse imaging ratios, and\nour approach achieves a leading performance compared to pure 3D topologies.\nSparseSSP reduces imaging frequencies compared to previous dense-view SSP\n(i.e., the number of imaging is reduced up to 87.5% at most), which is\nsignificant in visualizing rapid biological dynamics on low-cost devices and\nsamples.\n","authors":["Jintu Zheng","Yi Ding","Qizhe Liu","Yi Cao","Ying Hu","Zenan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02159v2.pdf","comment":"Accpeted to ECCV2024"},{"id":"http://arxiv.org/abs/2407.02946v1","updated":"2024-07-03T09:29:46Z","published":"2024-07-03T09:29:46Z","title":"3D Multimodal Image Registration for Plant Phenotyping","summary":" The use of multiple camera technologies in a combined multimodal monitoring\nsystem for plant phenotyping offers promising benefits. Compared to\nconfigurations that only utilize a single camera technology, cross-modal\npatterns can be recorded that allow a more comprehensive assessment of plant\nphenotypes. However, the effective utilization of cross-modal patterns is\ndependent on precise image registration to achieve pixel-accurate alignment, a\nchallenge often complicated by parallax and occlusion effects inherent in plant\ncanopy imaging.\n In this study, we propose a novel multimodal 3D image registration method\nthat addresses these challenges by integrating depth information from a\ntime-of-flight camera into the registration process. By leveraging depth data,\nour method mitigates parallax effects and thus facilitates more accurate pixel\nalignment across camera modalities. Additionally, we introduce an automated\nmechanism to identify and differentiate different types of occlusions, thereby\nminimizing the introduction of registration errors.\n To evaluate the efficacy of our approach, we conduct experiments on a diverse\nimage dataset comprising six distinct plant species with varying leaf\ngeometries. Our results demonstrate the robustness of the proposed registration\nalgorithm, showcasing its ability to achieve accurate alignment across\ndifferent plant types and camera compositions. Compared to previous methods it\nis not reliant on detecting plant specific image features and can thereby be\nutilized for a wide variety of applications in plant sciences. The registration\napproach principally scales to arbitrary numbers of cameras with different\nresolutions and wavelengths. 
Overall, our study contributes to advancing the\nfield of plant phenotyping by offering a robust and reliable solution for\nmultimodal image registration.\n","authors":["Eric Stumpe","Gernot Bodner","Francesco Flagiello","Matthias Zeppelzauer"],"pdf_url":"https://arxiv.org/pdf/2407.02946v1.pdf","comment":"53 pages, 13 Figures, preprint submitted to Computers and Electronics\n in Agriculture"},{"id":"http://arxiv.org/abs/2311.17081v2","updated":"2024-07-03T09:23:57Z","published":"2023-11-28T00:43:52Z","title":"I-MedSAM: Implicit Medical Image Segmentation with Segment Anything","summary":" With the development of Deep Neural Networks (DNNs), many efforts have been\nmade to handle medical image segmentation. Traditional methods such as nnUNet\ntrain specific segmentation models on the individual datasets. Plenty of recent\nmethods have been proposed to adapt the foundational Segment Anything Model\n(SAM) to medical image segmentation. However, they still focus on discrete\nrepresentations to generate pixel-wise predictions, which are spatially\ninflexible and scale poorly to higher resolution. In contrast, implicit methods\nlearn continuous representations for segmentation, which is crucial for medical\nimage segmentation. In this paper, we propose I-MedSAM, which leverages the\nbenefits of both continuous representations and SAM, to obtain better\ncross-domain ability and accurate boundary delineation. Since medical image\nsegmentation needs to predict detailed segmentation boundaries, we designed a\nnovel adapter to enhance the SAM features with high-frequency information\nduring Parameter-Efficient Fine-Tuning (PEFT). To convert the SAM features and\ncoordinates into continuous segmentation output, we utilize Implicit Neural\nRepresentation (INR) to learn an implicit segmentation decoder. We also propose\nan uncertainty-guided sampling strategy for efficient learning of INR.\nExtensive evaluations on 2D medical image segmentation tasks have shown that\nour proposed method with only 1.6M trainable parameters outperforms existing\nmethods including discrete and implicit methods. The code will be available at:\nhttps://github.com/ucwxb/I-MedSAM.\n","authors":["Xiaobao Wei","Jiajun Cao","Yizhu Jin","Ming Lu","Guangyu Wang","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.17081v2.pdf","comment":"accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2407.02945v1","updated":"2024-07-03T09:23:13Z","published":"2024-07-03T09:23:13Z","title":"VEGS: View Extrapolation of Urban Scenes in 3D Gaussian Splatting using\n Learned Priors","summary":" Neural rendering-based urban scene reconstruction methods commonly rely on\nimages collected from driving vehicles with cameras facing and moving forward.\nAlthough these methods can successfully synthesize from views similar to\ntraining camera trajectory, directing the novel view outside the training\ncamera distribution does not guarantee on-par performance. In this paper, we\ntackle the Extrapolated View Synthesis (EVS) problem by evaluating the\nreconstructions on views such as looking left, right or downwards with respect\nto training camera distributions. To improve rendering quality for EVS, we\ninitialize our model by constructing dense LiDAR map, and propose to leverage\nprior scene knowledge such as surface normal estimator and large-scale\ndiffusion model. Qualitative and quantitative comparisons demonstrate the\neffectiveness of our methods on EVS. 
To the best of our knowledge, we are the\nfirst to address the EVS problem in urban scene reconstruction. Link to our\nproject page: https://vegs3d.github.io/.\n","authors":["Sungwon Hwang","Min-Jung Kim","Taewoong Kang","Jayeon Kang","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2407.02945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02942v1","updated":"2024-07-03T09:19:35Z","published":"2024-07-03T09:19:35Z","title":"Recompression Based JPEG Tamper Detection and Localization Using Deep\n Neural Network Eliminating Compression Factor Dependency","summary":" In this work, we deal with the problem of re compression based image forgery\ndetection, where some regions of an image are modified illegitimately, hence\ngiving rise to presence of dual compression characteristics within a single\nimage. There have been some significant researches in this direction, in the\nlast decade. However, almost all existing techniques fail to detect this form\nof forgery, when the first compression factor is greater than the second. We\naddress this problem in re compression based forgery detection, here Recently,\nMachine Learning techniques have started gaining a lot of importance in the\ndomain of digital image forensics. In this work, we propose a Convolution\nNeural Network based deep learning architecture, which is capable of detecting\nthe presence of re compression based forgery in JPEG images. The proposed\narchitecture works equally efficiently, even in cases where the first\ncompression ratio is greater than the second. In this work, we also aim to\nlocalize the regions of image manipulation based on re compression features,\nusing the trained neural network. Our experimental results prove that the\nproposed method outperforms the state of the art, with respect to forgery\ndetection and localization accuracy.\n","authors":["Jamimamul Bakas","Praneta Rawat","Kalyan Kokkalla","Ruchira Naskar"],"pdf_url":"https://arxiv.org/pdf/2407.02942v1.pdf","comment":"24 pages, conference"},{"id":"http://arxiv.org/abs/2310.17316v3","updated":"2024-07-03T09:11:54Z","published":"2023-10-26T11:23:24Z","title":"Defect Spectrum: A Granular Look of Large-Scale Defect Datasets with\n Rich Semantics","summary":" Defect inspection is paramount within the closed-loop manufacturing system.\nHowever, existing datasets for defect inspection often lack precision and\nsemantic granularity required for practical applications. In this paper, we\nintroduce the Defect Spectrum, a comprehensive benchmark that offers precise,\nsemantic-abundant, and large-scale annotations for a wide range of industrial\ndefects. Building on four key industrial benchmarks, our dataset refines\nexisting annotations and introduces rich semantic details, distinguishing\nmultiple defect types within a single image. Furthermore, we introduce\nDefect-Gen, a two-stage diffusion-based generator designed to create\nhigh-quality and diverse defective images, even when working with limited\ndatasets. The synthetic images generated by Defect-Gen significantly enhance\nthe efficacy of defect inspection models. Overall, The Defect Spectrum dataset\ndemonstrates its potential in defect inspection research, offering a solid\nplatform for testing and refining advanced models.\n","authors":["Shuai Yang","Zhifei Chen","Pengguang Chen","Xi Fang","Shu Liu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.17316v3.pdf","comment":"Accepted by ECCV2024. 
Please see our project page at\n https://envision-research.github.io/Defect_Spectrum/"},{"id":"http://arxiv.org/abs/2407.02934v1","updated":"2024-07-03T09:07:14Z","published":"2024-07-03T09:07:14Z","title":"PosMLP-Video: Spatial and Temporal Relative Position Encoding for\n Efficient Video Recognition","summary":" In recent years, vision Transformers and MLPs have demonstrated remarkable\nperformance in image understanding tasks. However, their inherently dense\ncomputational operators, such as self-attention and token-mixing layers, pose\nsignificant challenges when applied to spatio-temporal video data. To address\nthis gap, we propose PosMLP-Video, a lightweight yet powerful MLP-like backbone\nfor video recognition. Instead of dense operators, we use efficient relative\npositional encoding (RPE) to build pairwise token relations, leveraging\nsmall-sized parameterized relative position biases to obtain each relation\nscore. Specifically, to enable spatio-temporal modeling, we extend the image\nPosMLP's positional gating unit to temporal, spatial, and spatio-temporal\nvariants, namely PoTGU, PoSGU, and PoSTGU, respectively. These gating units can\nbe feasibly combined into three types of spatio-temporal factorized positional\nMLP blocks, which not only decrease model complexity but also maintain good\nperformance. Additionally, we enrich relative positional relationships by using\nchannel grouping. Experimental results on three video-related tasks demonstrate\nthat PosMLP-Video achieves competitive speed-accuracy trade-offs compared to\nthe previous state-of-the-art models. In particular, PosMLP-Video pre-trained\non ImageNet1K achieves 59.0%/70.3% top-1 accuracy on Something-Something V1/V2\nand 82.1% top-1 accuracy on Kinetics-400 while requiring much fewer parameters\nand FLOPs than other models. The code is released at\nhttps://github.com/zhouds1918/PosMLP_Video.\n","authors":["Yanbin Hao","Diansong Zhou","Zhicai Wang","Chong-Wah Ngo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02926v1","updated":"2024-07-03T08:57:35Z","published":"2024-07-03T08:57:35Z","title":"Explainable vertebral fracture analysis with uncertainty estimation\n using differentiable rule-based classification","summary":" We present a novel method for explainable vertebral fracture assessment\n(XVFA) in low-dose radiographs using deep neural networks, incorporating\nvertebra detection and keypoint localization with uncertainty estimates. We\nincorporate Genant's semi-quantitative criteria as a differentiable rule-based\nmeans of classifying both vertebra fracture grade and morphology. Unlike\nprevious work, XVFA provides explainable classifications relatable to current\nclinical methodology, as well as uncertainty estimations, while at the same\ntime surpassing state-of-the art methods with a vertebra-level sensitivity of\n93% and end-to-end AUC of 97% in a challenging setting. 
Moreover, we compare\nintra-reader agreement with model uncertainty estimates, with model reliability\non par with human annotators.\n","authors":["Victor Wåhlstrand Skärström","Lisa Johansson","Jennifer Alvén","Mattias Lorentzon","Ida Häggström"],"pdf_url":"https://arxiv.org/pdf/2407.02926v1.pdf","comment":"To be published in MICCAI 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2407.02920v1","updated":"2024-07-03T08:53:50Z","published":"2024-07-03T08:53:50Z","title":"EgoFlowNet: Non-Rigid Scene Flow from Point Clouds with Ego-Motion\n Support","summary":" Recent weakly-supervised methods for scene flow estimation from LiDAR point\nclouds are limited to explicit reasoning on object-level. These methods perform\nmultiple iterative optimizations for each rigid object, which makes them\nvulnerable to clustering robustness. In this paper, we propose our EgoFlowNet -\na point-level scene flow estimation network trained in a weakly-supervised\nmanner and without object-based abstraction. Our approach predicts a binary\nsegmentation mask that implicitly drives two parallel branches for ego-motion\nand scene flow. Unlike previous methods, we provide both branches with all\ninput points and carefully integrate the binary mask into the feature\nextraction and losses. We also use a shared cost volume with local refinement\nthat is updated at multiple scales without explicit clustering or rigidity\nassumptions. On realistic KITTI scenes, we show that our EgoFlowNet performs\nbetter than state-of-the-art methods in the presence of ground surface points.\n","authors":["Ramy Battrawy","René Schuster","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2407.02920v1.pdf","comment":"This paper is published in BMVC2023 (pp. 441-443)"},{"id":"http://arxiv.org/abs/2407.02918v1","updated":"2024-07-03T08:49:35Z","published":"2024-07-03T08:49:35Z","title":"Free-SurGS: SfM-Free 3D Gaussian Splatting for Surgical Scene\n Reconstruction","summary":" Real-time 3D reconstruction of surgical scenes plays a vital role in\ncomputer-assisted surgery, holding a promise to enhance surgeons' visibility.\nRecent advancements in 3D Gaussian Splatting (3DGS) have shown great potential\nfor real-time novel view synthesis of general scenes, which relies on accurate\nposes and point clouds generated by Structure-from-Motion (SfM) for\ninitialization. However, 3DGS with SfM fails to recover accurate camera poses\nand geometry in surgical scenes due to the challenges of minimal textures and\nphotometric inconsistencies. To tackle this problem, in this paper, we propose\nthe first SfM-free 3DGS-based method for surgical scene reconstruction by\njointly optimizing the camera poses and scene representation. Based on the\nvideo continuity, the key of our method is to exploit the immediate optical\nflow priors to guide the projection flow derived from 3D Gaussians. Unlike most\nprevious methods relying on photometric loss only, we formulate the pose\nestimation problem as minimizing the flow loss between the projection flow and\noptical flow. A consistency check is further introduced to filter the flow\noutliers by detecting the rigid and reliable points that satisfy the epipolar\ngeometry. During 3D Gaussian optimization, we randomly sample frames to\noptimize the scene representations to grow the 3D Gaussian progressively.\nExperiments on the SCARED dataset demonstrate our superior performance over\nexisting methods in novel view synthesis and pose estimation with high\nefficiency. 
Code is available at https://github.com/wrld/Free-SurGS.\n","authors":["Jiaxin Guo","Jiangliu Wang","Di Kang","Wenzhen Dong","Wenting Wang","Yun-hui Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02918v1.pdf","comment":"Accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2404.12390v4","updated":"2024-07-03T08:44:45Z","published":"2024-04-18T17:59:54Z","title":"BLINK: Multimodal Large Language Models Can See but Not Perceive","summary":" We introduce Blink, a new benchmark for multimodal language models (LLMs)\nthat focuses on core visual perception abilities not found in other\nevaluations. Most of the Blink tasks can be solved by humans \"within a blink\"\n(e.g., relative depth estimation, visual correspondence, forensics detection,\nand multi-view reasoning). However, we find these perception-demanding tasks\ncast significant challenges for current multimodal LLMs because they resist\nmediation through natural language. Blink reformats 14 classic computer vision\ntasks into 3,807 multiple-choice questions, paired with single or multiple\nimages and visual prompting. While humans get 95.70% accuracy on average, Blink\nis surprisingly challenging for existing multimodal LLMs: even the\nbest-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only\n13.17% and 7.63% higher than random guessing, indicating that such perception\nabilities have not \"emerged\" yet in recent multimodal LLMs. Our analysis also\nhighlights that specialist CV models could solve these problems much better,\nsuggesting potential pathways for future improvements. We believe Blink will\nstimulate the community to help multimodal LLMs catch up with human-level\nvisual perception.\n","authors":["Xingyu Fu","Yushi Hu","Bangzheng Li","Yu Feng","Haoyu Wang","Xudong Lin","Dan Roth","Noah A. Smith","Wei-Chiu Ma","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2404.12390v4.pdf","comment":"Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/,\n ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02911v1","updated":"2024-07-03T08:37:01Z","published":"2024-07-03T08:37:01Z","title":"Non-Adversarial Learning: Vector-Quantized Common Latent Space for\n Multi-Sequence MRI","summary":" Adversarial learning helps generative models translate MRI from source to\ntarget sequence when lacking paired samples. However, implementing MRI\nsynthesis with adversarial learning in clinical settings is challenging due to\ntraining instability and mode collapse. To address this issue, we leverage\nintermediate sequences to estimate the common latent space among multi-sequence\nMRI, enabling the reconstruction of distinct sequences from the common latent\nspace. We propose a generative model that compresses discrete representations\nof each sequence to estimate the Gaussian distribution of vector-quantized\ncommon (VQC) latent space between multiple sequences. Moreover, we improve the\nlatent space consistency with contrastive learning and increase model stability\nby domain augmentation. Experiments using BraTS2021 dataset show that our\nnon-adversarial model outperforms other GAN-based methods, and VQC latent space\naids our model to achieve (1) anti-interference ability, which can eliminate\nthe effects of noise, bias fields, and artifacts, and (2) solid semantic\nrepresentation ability, with the potential of one-shot segmentation. 
Our code\nis publicly available.\n","authors":["Luyi Han","Tao Tan","Tianyu Zhang","Xin Wang","Yuan Gao","Chunyao Lu","Xinglong Liang","Haoran Dou","Yunzhi Huang","Ritse Mann"],"pdf_url":"https://arxiv.org/pdf/2407.02911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02910v1","updated":"2024-07-03T08:35:52Z","published":"2024-07-03T08:35:52Z","title":"Domain-independent detection of known anomalies","summary":" One persistent obstacle in industrial quality inspection is the detection of\nanomalies. In real-world use cases, two problems must be addressed: anomalous\ndata is sparse and the same types of anomalies need to be detected on\npreviously unseen objects. Current anomaly detection approaches can be trained\nwith sparse nominal data, whereas domain generalization approaches enable\ndetecting objects in previously unseen domains. Utilizing those two\nobservations, we introduce the hybrid task of domain generalization on sparse\nclasses. To introduce an accompanying dataset for this task, we present a\nmodification of the well-established MVTec AD dataset by generating three new\ndatasets. In addition to applying existing methods for benchmark, we design two\nembedding-based approaches, Spatial Embedding MLP (SEMLP) and Labeled\nPatchCore. Overall, SEMLP achieves the best performance with an average\nimage-level AUROC of 87.2 % vs. 80.4 % by MIRO. The new and openly available\ndatasets allow for further research to improve industrial anomaly detection.\n","authors":["Jonas Bühler","Jonas Fehrenbach","Lucas Steinmann","Christian Nauck","Marios Koulakis"],"pdf_url":"https://arxiv.org/pdf/2407.02910v1.pdf","comment":"Accepted as extended abstract in CVPR 2024 workshop VAND 2.0"},{"id":"http://arxiv.org/abs/2407.02370v2","updated":"2024-07-03T08:32:51Z","published":"2024-07-02T15:39:08Z","title":"Investigating Event-Based Cameras for Video Frame Interpolation in\n Sports","summary":" Slow-motion replays provide a thrilling perspective on pivotal moments within\nsports games, offering a fresh and captivating visual experience. However,\ncapturing slow-motion footage typically demands high-tech, expensive cameras\nand infrastructures. Deep learning Video Frame Interpolation (VFI) techniques\nhave emerged as a promising avenue, capable of generating high-speed footage\nfrom regular camera feeds. Moreover, the utilization of event-based cameras has\nrecently gathered attention as they provide valuable motion information between\nframes, further enhancing the VFI performances. In this work, we present a\nfirst investigation of event-based VFI models for generating sports slow-motion\nvideos. Particularly, we design and implement a bi-camera recording setup,\nincluding an RGB and an event-based camera to capture sports videos, to\ntemporally align and spatially register both cameras. Our experimental\nvalidation demonstrates that TimeLens, an off-the-shelf event-based VFI model,\ncan effectively generate slow-motion footage for sports videos. 
This first\ninvestigation underscores the practical utility of event-based cameras in\nproducing sports slow-motion content and lays the groundwork for future\nresearch endeavors in this domain.\n","authors":["Antoine Deckyvere","Anthony Cioppa","Silvio Giancola","Bernard Ghanem","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2407.02370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10086v2","updated":"2024-07-03T08:31:45Z","published":"2024-02-08T09:08:44Z","title":"Explainable AI for Safe and Trustworthy Autonomous Driving: A Systematic\n Review","summary":" Artificial Intelligence (AI) shows promising applications for the perception\nand planning tasks in autonomous driving (AD) due to its superior performance\ncompared to conventional methods. However, inscrutable AI systems exacerbate\nthe existing challenge of safety assurance of AD. One way to mitigate this\nchallenge is to utilize explainable AI (XAI) techniques. To this end, we\npresent the first comprehensive systematic literature review of explainable\nmethods for safe and trustworthy AD. We begin by analyzing the requirements for\nAI in the context of AD, focusing on three key aspects: data, model, and\nagency. We find that XAI is fundamental to meeting these requirements. Based on\nthis, we explain the sources of explanations in AI and describe a taxonomy of\nXAI. We then identify five key contributions of XAI for safe and trustworthy AI\nin AD, which are interpretable design, interpretable surrogate models,\ninterpretable monitoring, auxiliary explanations, and interpretable validation.\nFinally, we propose a modular framework called SafeX to integrate these\ncontributions, enabling explanation delivery to users while simultaneously\nensuring the safety of AI models.\n","authors":["Anton Kuznietsov","Balint Gyevnar","Cheng Wang","Steven Peters","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2402.10086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02906v1","updated":"2024-07-03T08:25:02Z","published":"2024-07-03T08:25:02Z","title":"Single Image Rolling Shutter Removal with Diffusion Models","summary":" We present RS-Diffusion, the first Diffusion Models-based method for\nsingle-frame Rolling Shutter (RS) correction. RS artifacts compromise visual\nquality of frames due to the row wise exposure of CMOS sensors. Most previous\nmethods have focused on multi-frame approaches, using temporal information from\nconsecutive frames for the motion rectification. However, few approaches\naddress the more challenging but important single frame RS correction. In this\nwork, we present an ``image-to-motion'' framework via diffusion techniques,\nwith a designed patch-attention module. In addition, we present the RS-Real\ndataset, comprised of captured RS frames alongside their corresponding Global\nShutter (GS) ground-truth pairs. The GS frames are corrected from the RS ones,\nguided by the corresponding Inertial Measurement Unit (IMU) gyroscope data\nacquired during capture. Experiments show that our RS-Diffusion surpasses\nprevious single RS correction methods. 
Our method and proposed RS-Real dataset\nlay a solid foundation for advancing the field of RS correction.\n","authors":["Zhanglei Yang","Haipeng Li","Mingbo Hong","Bing Zeng","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02900v1","updated":"2024-07-03T08:20:27Z","published":"2024-07-03T08:20:27Z","title":"Self-supervised Vision Transformer are Scalable Generative Models for\n Domain Generalization","summary":" Despite notable advancements, the integration of deep learning (DL)\ntechniques into impactful clinical applications, particularly in the realm of\ndigital histopathology, has been hindered by challenges associated with\nachieving robust generalization across diverse imaging domains and\ncharacteristics. Traditional mitigation strategies in this field such as data\naugmentation and stain color normalization have proven insufficient in\naddressing this limitation, necessitating the exploration of alternative\nmethodologies. To this end, we propose a novel generative method for domain\ngeneralization in histopathology images. Our method employs a generative,\nself-supervised Vision Transformer to dynamically extract characteristics of\nimage patches and seamlessly infuse them into the original images, thereby\ncreating novel, synthetic images with diverse attributes. By enriching the\ndataset with such synthesized images, we aim to enhance its holistic nature,\nfacilitating improved generalization of DL models to unseen domains. Extensive\nexperiments conducted on two distinct histopathology datasets demonstrate the\neffectiveness of our proposed approach, outperforming the state of the art\nsubstantially, on the Camelyon17-wilds challenge dataset (+2%) and on a second\nepithelium-stroma dataset (+26%). Furthermore, we emphasize our method's\nability to readily scale with increasingly available unlabeled data samples and\nmore complex, higher parametric architectures. Source code is available at\nhttps://github.com/sdoerrich97/vits-are-generative-models .\n","authors":["Sebastian Doerrich","Francesco Di Salvo","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2407.02900v1.pdf","comment":"Accepted at MICCAI 2024. This is the submitted manuscript with added\n link to github repo and funding acknowledgements. No further post submission\n improvements or corrections were integrated. Final version not published yet"},{"id":"http://arxiv.org/abs/2407.02893v1","updated":"2024-07-03T08:13:16Z","published":"2024-07-03T08:13:16Z","title":"An Uncertainty-guided Tiered Self-training Framework for Active\n Source-free Domain Adaptation in Prostate Segmentation","summary":" Deep learning models have exhibited remarkable efficacy in accurately\ndelineating the prostate for diagnosis and treatment of prostate diseases, but\nchallenges persist in achieving robust generalization across different medical\ncenters. Source-free Domain Adaptation (SFDA) is a promising technique to adapt\ndeep segmentation models to address privacy and security concerns while\nreducing domain shifts between source and target domains. However, recent\nliterature indicates that the performance of SFDA remains far from satisfactory\ndue to unpredictable domain gaps. Annotating a few target domain samples is\nacceptable, as it can lead to significant performance improvement with a low\nannotation cost. Nevertheless, due to extremely limited annotation budgets,\ncareful consideration is needed in selecting samples for annotation. 
Inspired\nby this, our goal is to develop Active Source-free Domain Adaptation (ASFDA)\nfor medical image segmentation. Specifically, we propose a novel\nUncertainty-guided Tiered Self-training (UGTST) framework, consisting of\nefficient active sample selection via entropy-based primary local peak\nfiltering to aggregate global uncertainty and diversity-aware redundancy\nfilter, coupled with a tiered self-learning strategy, achieves stable domain\nadaptation. Experimental results on cross-center prostate MRI segmentation\ndatasets revealed that our method yielded marked advancements, with a mere 5%\nannotation, exhibiting an average Dice score enhancement of 9.78% and 7.58% in\ntwo target domains compared with state-of-the-art methods, on par with fully\nsupervised learning. Code is available at:https://github.com/HiLab-git/UGTST\n","authors":["Zihao Luo","Xiangde Luo","Zijun Gao","Guotai Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02893v1.pdf","comment":"11 pages, 3 figures, 2 tables, accept to MICCAI 2024"},{"id":"http://arxiv.org/abs/2210.01302v3","updated":"2024-07-03T08:06:56Z","published":"2022-10-04T01:40:31Z","title":"Nuisances via Negativa: Adjusting for Spurious Correlations via Data\n Augmentation","summary":" In prediction tasks, there exist features that are related to the label in\nthe same way across different settings for that task; these are semantic\nfeatures or semantics. Features with varying relationships to the label are\nnuisances. For example, in detecting cows from natural images, the shape of the\nhead is semantic but because images of cows often have grass backgrounds but\nnot always, the background is a nuisance. Models that exploit nuisance-label\nrelationships face performance degradation when these relationships change.\nBuilding models robust to such changes requires additional knowledge beyond\nsamples of the features and labels. For example, existing work uses annotations\nof nuisances or assumes ERM-trained models depend on nuisances. Approaches to\nintegrate new kinds of additional knowledge enlarge the settings where robust\nmodels can be built. We develop an approach to use knowledge about the\nsemantics by corrupting them in data, and then using the corrupted data to\nproduce models which identify correlations between nuisances and the label.\nOnce these correlations are identified, they can be used to adjust for where\nnuisances drive predictions. We study semantic corruptions in powering\ndifferent spurious-correlation avoiding methods on multiple out-of-distribution\n(OOD) tasks like classifying waterbirds, natural language inference (NLI), and\ndetecting cardiomegaly in chest X-rays.\n","authors":["Aahlad Puli","Nitish Joshi","Yoav Wald","He He","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2210.01302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02887v1","updated":"2024-07-03T08:03:56Z","published":"2024-07-03T08:03:56Z","title":"Explicitly Guided Information Interaction Network for Cross-modal Point\n Cloud Completion","summary":" Corresponding author}In this paper, we explore a novel framework, EGIInet\n(Explicitly Guided Information Interaction Network), a model for View-guided\nPoint cloud Completion (ViPC) task, which aims to restore a complete point\ncloud from a partial one with a single view image. In comparison with previous\nmethods that relied on the global semantics of input images, EGIInet\nefficiently combines the information from two modalities by leveraging the\ngeometric nature of the completion task. 
Specifically, we propose an explicitly\nguided information interaction strategy supported by modal alignment for point\ncloud completion. First, in contrast to previous methods which simply use 2D\nand 3D backbones to encode features respectively, we unified the encoding\nprocess to promote modal alignment. Second, we propose a novel explicitly\nguided information interaction strategy that could help the network identify\ncritical information within images, thus achieving better guidance for\ncompletion. Extensive experiments demonstrate the effectiveness of our\nframework, and we achieved a new state-of-the-art (+16\\% CD over XMFnet) in\nbenchmark datasets despite using fewer parameters than the previous methods.\nThe pre-trained model and code and are available at\nhttps://github.com/WHU-USI3DV/EGIInet.\n","authors":["Hang Xu","Chen Long","Wenxiao Zhang","Yuan Liu","Zhen Cao","Zhen Dong","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02887v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2406.09410v3","updated":"2024-07-03T08:00:31Z","published":"2024-06-13T17:59:51Z","title":"STAR: A First-Ever Dataset and A Large-Scale Benchmark for Scene Graph\n Generation in Large-Size Satellite Imagery","summary":" Scene graph generation (SGG) in satellite imagery (SAI) benefits promoting\nunderstanding of geospatial scenarios from perception to cognition. In SAI,\nobjects exhibit great variations in scales and aspect ratios, and there exist\nrich relationships between objects (even between spatially disjoint objects),\nwhich makes it attractive to holistically conduct SGG in large-size\nvery-high-resolution (VHR) SAI. However, there lack such SGG datasets. Due to\nthe complexity of large-size SAI, mining triplets heavily relies on long-range contextual reasoning. Consequently, SGG\nmodels designed for small-size natural imagery are not directly applicable to\nlarge-size SAI. This paper constructs a large-scale dataset for SGG in\nlarge-size VHR SAI with image sizes ranging from 512 x 768 to 27,860 x 31,096\npixels, named STAR (Scene graph generaTion in lArge-size satellite imageRy),\nencompassing over 210K objects and over 400K triplets. To realize SGG in\nlarge-size SAI, we propose a context-aware cascade cognition (CAC) framework to\nunderstand SAI regarding object detection (OBD), pair pruning and relationship\nprediction for SGG. We also release a SAI-oriented SGG toolkit with about 30\nOBD and 10 SGG methods which need further adaptation by our devised modules on\nour challenging STAR dataset. The dataset and toolkit are available at:\nhttps://linlin-dev.github.io/project/STAR.\n","authors":["Yansheng Li","Linlin Wang","Tingzhu Wang","Xue Yang","Junwei Luo","Qi Wang","Youming Deng","Wenbin Wang","Xian Sun","Haifeng Li","Bo Dang","Yongjun Zhang","Yi Yu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2406.09410v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.20076v2","updated":"2024-07-03T07:59:52Z","published":"2024-06-28T17:38:18Z","title":"EVF-SAM: Early Vision-Language Fusion for Text-Prompted Segment Anything\n Model","summary":" Segment Anything Model (SAM) has attracted widespread attention for its\nsuperior interactive segmentation capabilities with visual prompts while\nlacking further exploration of text prompts. In this paper, we empirically\ninvestigate what text prompt encoders (e.g., CLIP or LLM) are good for adapting\nSAM for referring expression segmentation and introduce the Early\nVision-language Fusion-based SAM (EVF-SAM). 
EVF-SAM is a simple yet effective\nreferring segmentation method which exploits multimodal prompts (i.e., image\nand text) and comprises a pre-trained vision-language model to generate\nreferring prompts and a SAM model for segmentation. Surprisingly, we observe\nthat: (1) multimodal prompts and (2) vision-language models with early fusion\n(e.g., BEIT-3) are beneficial for prompting SAM for accurate referring\nsegmentation. Our experiments show that the proposed EVF-SAM based on BEIT-3\ncan obtain state-of-the-art performance on RefCOCO/+/g for referring expression\nsegmentation and demonstrate the superiority of prompting SAM with early\nvision-language fusion. In addition, the proposed EVF-SAM with 1.32B parameters\nachieves remarkably higher performance while reducing nearly 82% of parameters\ncompared to previous SAM methods based on large multimodal models.\n","authors":["Yuxuan Zhang","Tianheng Cheng","Rui Hu","Lei Liu","Heng Liu","Longjin Ran","Xiaoxin Chen","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2406.20076v2.pdf","comment":"Preprint. Code and models are available at:\n https://github.com/hustvl/EVF-SAM"},{"id":"http://arxiv.org/abs/2401.08847v2","updated":"2024-07-03T07:57:53Z","published":"2024-01-16T21:45:08Z","title":"RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and\n Efficiency Assessment of Medical Image Segmentation Models","summary":" Deep learning techniques hold immense promise for advancing medical image\nanalysis, particularly in tasks like image segmentation, where precise\nannotation of regions or volumes of interest within medical images is crucial\nbut manually laborious and prone to interobserver and intraobserver biases. As\nsuch, deep learning approaches could provide automated solutions for such\napplications. However, the potential of these techniques is often undermined by\nchallenges in reproducibility and generalizability, which are key barriers to\ntheir clinical adoption. This paper introduces the RIDGE checklist, a\ncomprehensive framework designed to assess the Reproducibility, Integrity,\nDependability, Generalizability, and Efficiency of deep learning-based medical\nimage segmentation models. The RIDGE checklist is not just a tool for\nevaluation but also a guideline for researchers striving to improve the quality\nand transparency of their work. By adhering to the principles outlined in the\nRIDGE checklist, researchers can ensure that their developed segmentation\nmodels are robust, scientifically valid, and applicable in a clinical setting.\n","authors":["Farhad Maleki","Linda Moy","Reza Forghani","Tapotosh Ghosh","Katie Ovens","Steve Langer","Pouria Rouzrokh","Bardia Khosravi","Ali Ganjizadeh","Daniel Warren","Roxana Daneshjou","Mana Moassefi","Atlas Haddadi Avval","Susan Sotardi","Neil Tenenholtz","Felipe Kitamura","Timothy Kline"],"pdf_url":"https://arxiv.org/pdf/2401.08847v2.pdf","comment":"24 pages, 1 Figure, 2 Table"},{"id":"http://arxiv.org/abs/2307.02129v5","updated":"2024-07-03T07:57:00Z","published":"2023-07-05T09:11:09Z","title":"How Deep Neural Networks Learn Compositional Data: The Random Hierarchy\n Model","summary":" Deep learning algorithms demonstrate a surprising ability to learn\nhigh-dimensional tasks from limited examples. This is commonly attributed to\nthe depth of neural networks, enabling them to build a hierarchy of abstract,\nlow-dimensional data representations. However, how many training examples are\nrequired to learn such representations remains unknown. 
To quantitatively study\nthis question, we introduce the Random Hierarchy Model: a family of synthetic\ntasks inspired by the hierarchical structure of language and images. The model\nis a classification task where each class corresponds to a group of high-level\nfeatures, chosen among several equivalent groups associated with the same\nclass. In turn, each feature corresponds to a group of sub-features chosen\namong several equivalent ones and so on, following a hierarchy of composition\nrules. We find that deep networks learn the task by developing internal\nrepresentations invariant to exchanging equivalent groups. Moreover, the number\nof data required corresponds to the point where correlations between low-level\nfeatures and classes become detectable. Overall, our results indicate how deep\nnetworks overcome the curse of dimensionality by building invariant\nrepresentations, and provide an estimate of the number of data required to\nlearn a hierarchical task.\n","authors":["Francesco Cagnetta","Leonardo Petrini","Umberto M. Tomasini","Alessandro Favero","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2307.02129v5.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.02881v1","updated":"2024-07-03T07:56:51Z","published":"2024-07-03T07:56:51Z","title":"ShiftAddAug: Augment Multiplication-Free Tiny Neural Network with Hybrid\n Computation","summary":" Operators devoid of multiplication, such as Shift and Add, have gained\nprominence for their compatibility with hardware. However, neural networks\n(NNs) employing these operators typically exhibit lower accuracy compared to\nconventional NNs with identical structures. ShiftAddAug uses costly\nmultiplication to augment efficient but less powerful multiplication-free\noperators, improving performance without any inference overhead. It puts a\nShiftAdd tiny NN into a large multiplicative model and encourages it to be\ntrained as a sub-model to obtain additional supervision. In order to solve the\nweight discrepancy problem between hybrid operators, a new weight sharing\nmethod is proposed. Additionally, a novel two stage neural architecture search\nis used to obtain better augmentation effects for smaller but stronger\nmultiplication-free tiny neural networks. The superiority of ShiftAddAug is\nvalidated through experiments in image classification and semantic\nsegmentation, consistently delivering noteworthy enhancements. Remarkably, it\nsecures up to a 4.95% increase in accuracy on the CIFAR100 compared to its\ndirectly trained counterparts, even surpassing the performance of\nmultiplicative NNs.\n","authors":["Yipin Guo","Zihao Li","Yilin Lang","Qinyuan Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02881v1.pdf","comment":"Accepted by 2024 CVPR Workshop : Efficient Deep Learning for Computer\n Vision"},{"id":"http://arxiv.org/abs/2407.02880v1","updated":"2024-07-03T07:54:08Z","published":"2024-07-03T07:54:08Z","title":"Knowledge Composition using Task Vectors with Learned Anisotropic\n Scaling","summary":" Pre-trained models produce strong generic representations that can be adapted\nvia fine-tuning. The learned weight difference relative to the pre-trained\nmodel, known as a task vector, characterises the direction and stride of\nfine-tuning. The significance of task vectors is such that simple arithmetic\noperations on them can be used to combine diverse representations from\ndifferent domains. 
This paper builds on these properties of task vectors and\naims to answer (1) whether components of task vectors, particularly parameter\nblocks, exhibit similar characteristics, and (2) how such blocks can be used to\nenhance knowledge composition and transfer. To this end, we introduce aTLAS, an\nalgorithm that linearly combines parameter blocks with different learned\ncoefficients, resulting in anisotropic scaling at the task vector level. We\nshow that such linear combinations explicitly exploit the low intrinsic\ndimensionality of pre-trained models, with only a few coefficients being the\nlearnable parameters. Furthermore, composition of parameter blocks leverages\nthe already learned representations, thereby reducing the dependency on large\namounts of data. We demonstrate the effectiveness of our method in task\narithmetic, few-shot recognition and test-time adaptation, with supervised or\nunsupervised objectives. In particular, we show that (1) learned anisotropic\nscaling allows task vectors to be more disentangled, causing less interference\nin composition; (2) task vector composition excels with scarce or no labeled\ndata and is less prone to domain shift, thus leading to better\ngeneralisability; (3) mixing the most informative parameter blocks across\ndifferent task vectors prior to training can reduce the memory footprint and\nimprove the flexibility of knowledge transfer. Moreover, we show the potential\nof aTLAS as a PEFT method, particularly with less data, and demonstrate\nits scalability.\n","authors":["Frederic Z. Zhang","Paul Albert","Cristian Rodriguez-Opazo","Anton van den Hengel","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.02880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02871v1","updated":"2024-07-03T07:37:09Z","published":"2024-07-03T07:37:09Z","title":"LMBF-Net: A Lightweight Multipath Bidirectional Focal Attention Network\n for Multifeatures Segmentation","summary":" Retinal diseases can cause irreversible vision loss in both eyes if not\ndiagnosed and treated early. Since retinal diseases are so complicated, retinal\nimaging is likely to show two or more abnormalities. Current deep learning\ntechniques for segmenting retinal images with many labels and attributes have\npoor detection accuracy and generalisability. This paper presents a multipath\nconvolutional neural network for multifeature segmentation. The proposed\nnetwork is lightweight and spatially sensitive to information. A patch-based\nimplementation is used to extract local image features, and focal modulation\nattention blocks are incorporated between the encoder and the decoder for\nimproved segmentation. Filter optimisation is used to prevent filter overlaps\nand speed up model convergence. A combination of convolution operations and\ngroup convolution operations is used to reduce computational costs. This is the\nfirst robust and generalisable network capable of segmenting multiple features\nof fundus images (including retinal vessels, microaneurysms, optic discs,\nhaemorrhages, hard exudates, and soft exudates). The results of our\nexperimental evaluation on more than ten publicly available datasets with\nmultiple features show that the proposed network outperforms recent networks\ndespite having a small number of learnable parameters.\n","authors":["Tariq M Khan","Shahzaib Iqbal","Syed S. 
Naqvi","Imran Razzak","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2407.02871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04032v3","updated":"2024-07-03T07:36:43Z","published":"2023-02-08T13:08:51Z","title":"A Systematic Performance Analysis of Deep Perceptual Loss Networks:\n Breaking Transfer Learning Conventions","summary":" In recent years, deep perceptual loss has been widely and successfully used\nto train machine learning models for many computer vision tasks, including\nimage synthesis, segmentation, and autoencoding. Deep perceptual loss is a type\nof loss function for images that computes the error between two images as the\ndistance between deep features extracted from a neural network. Most\napplications of the loss use pretrained networks called loss networks for deep\nfeature extraction. However, despite increasingly widespread use, the effects\nof loss network implementation on the trained models have not been studied.\n This work rectifies this through a systematic evaluation of the effect of\ndifferent pretrained loss networks on four different application areas.\nSpecifically, the work evaluates 14 different pretrained architectures with\nfour different feature extraction layers. The evaluation reveals that VGG\nnetworks without batch normalization have the best performance and that the\nchoice of feature extraction layer is at least as important as the choice of\narchitecture. The analysis also reveals that deep perceptual loss does not\nadhere to the transfer learning conventions that better ImageNet accuracy\nimplies better downstream performance and that feature extraction from the\nlater layers provides better performance.\n","authors":["Gustav Grund Pihlgren","Konstantina Nikolaidou","Prakash Chandra Chhipa","Nosheen Abid","Rajkumar Saini","Fredrik Sandin","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2302.04032v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16537v3","updated":"2024-07-03T07:26:35Z","published":"2024-06-24T11:16:37Z","title":"Character-Adapter: Prompt-Guided Region Control for High-Fidelity\n Character Customization","summary":" Customized image generation, which seeks to synthesize images with consistent\ncharacters, holds significant relevance for applications such as storytelling,\nportrait generation, and character design. However, previous approaches have\nencountered challenges in preserving characters with high-fidelity consistency\ndue to inadequate feature extraction and concept confusion of reference\ncharacters. Therefore, we propose Character-Adapter, a plug-and-play framework\ndesigned to generate images that preserve the details of reference characters,\nensuring high-fidelity consistency. Character-Adapter employs prompt-guided\nsegmentation to ensure fine-grained regional features of reference characters\nand dynamic region-level adapters to mitigate concept confusion. Extensive\nexperiments are conducted to validate the effectiveness of Character-Adapter.\nBoth quantitative and qualitative results demonstrate that Character-Adapter\nachieves the state-of-the-art performance of consistent character generation,\nwith an improvement of 24.8% compared with other methods. 
Our code will be\nreleased at https://github.com/Character-Adapter/Character-Adapter\n","authors":["Yuhang Ma","Wenting Xu","Jiji Tang","Qinfeng Jin","Rongsheng Zhang","Zeng Zhao","Changjie Fan","Zhipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2406.16537v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02863v1","updated":"2024-07-03T07:22:21Z","published":"2024-07-03T07:22:21Z","title":"Fast maneuver recovery from aerial observation: trajectory clustering\n and outliers rejection","summary":" The implementation of road user models that realistically reproduce\ncredible behavior in a multi-agent simulation is still an open problem. A\ndata-driven approach consists of deducing behaviors that may exist in real\nsituations in order to obtain different types of trajectories from a large set of\nobservations. The data, and its classification, could then be used to train\nmodels capable of extrapolating such behavior. Cars and two different types of\nVulnerable Road Users (VRU) will be considered by the trajectory clustering\nmethods proposed: pedestrians and cyclists. The results reported here evaluate\nmethods to extract well-defined trajectory classes from raw data without the\nuse of map information while also separating ''eccentric'' or incomplete\ntrajectories from the ones that are complete and representative in any\nscenario. Two types of environment will serve as test cases for the methods developed: three\ndifferent intersections and one roundabout. The resulting clusters of\ntrajectories can then be used for prediction or learning tasks, or discarded if\nthey are composed of outliers.\n","authors":["Nelson de Moura","Augustin Gervreau-Mercier","Fernando Garrido","Fawzi Nashashibi"],"pdf_url":"https://arxiv.org/pdf/2407.02863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16020v2","updated":"2024-07-03T07:21:18Z","published":"2024-03-24T05:50:00Z","title":"PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for\n Faster Inference","summary":" As deep neural networks evolve from convolutional neural networks (ConvNets)\nto advanced vision transformers (ViTs), there is an increased need to eliminate\nredundant data for faster processing without compromising accuracy. Previous\nmethods are often architecture-specific or necessitate re-training, restricting\ntheir applicability with frequent model updates. To solve this, we first\nintroduce a novel property of lightweight ConvNets: their ability to identify\nkey discriminative patch regions in images, irrespective of the model's final\naccuracy or size. We demonstrate that fully-connected layers are the primary\nbottleneck for ConvNets' performance, and their suppression with simple weight\nrecalibration markedly enhances discriminative patch localization performance.\nUsing this insight, we introduce PaPr, a method for substantially pruning\nredundant patches with minimal accuracy loss using lightweight ConvNets across\na variety of deep learning architectures, including ViTs, ConvNets, and hybrid\ntransformers, without any re-training. Moreover, the simple early-stage\none-step patch pruning with PaPr enhances existing patch reduction methods.\nThrough extensive testing on diverse architectures, PaPr achieves significantly\nhigher accuracy over state-of-the-art patch reduction methods with similar FLOP\ncount reduction. More specifically, PaPr reduces about 70% of redundant patches\nin videos with less than 0.8% drop in accuracy, and up to 3.7x FLOPs reduction,\nwhich is a 15% more reduction with 2.5% higher accuracy. 
Code is released at\nhttps://github.com/tanvir-utexas/PaPr.\n","authors":["Tanvir Mahmud","Burhaneddin Yaman","Chun-Hao Liu","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2403.16020v2.pdf","comment":"Accepted in ECCV 2024. Code: https://github.com/tanvir-utexas/PaPr"},{"id":"http://arxiv.org/abs/2312.12223v4","updated":"2024-07-03T07:15:51Z","published":"2023-12-19T15:11:46Z","title":"Self-Supervised Detection of Perfect and Partial Input-Dependent\n Symmetries","summary":" Group equivariance can overly constrain models if the symmetries in the group\ndiffer from those observed in data. While common methods address this by\ndetermining the appropriate level of symmetry at the dataset level, they are\nlimited to supervised settings and ignore scenarios in which multiple levels of\nsymmetry co-exist in the same dataset. In this paper, we propose a method able\nto detect the level of symmetry of each input without the need for labels. Our\nframework is general enough to accommodate different families of both\ncontinuous and discrete symmetry distributions, such as arbitrary unimodal,\nsymmetric distributions and discrete groups. We validate the effectiveness of\nour approach on synthetic datasets with different per-class levels of\nsymmetries, and demonstrate practical applications such as the detection of\nout-of-distribution symmetries. Our code is publicly available at\nhttps://github.com/aurban0/ssl-sym.\n","authors":["Alonso Urbano","David W. Romero"],"pdf_url":"https://arxiv.org/pdf/2312.12223v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02854v1","updated":"2024-07-03T07:12:36Z","published":"2024-07-03T07:12:36Z","title":"Universal Gloss-level Representation for Gloss-free Sign Language\n Translation and Production","summary":" Sign language, essential for the deaf and hard-of-hearing, presents unique\nchallenges in translation and production due to its multimodal nature and the\ninherent ambiguity in mapping sign language motion to spoken language words.\nPrevious methods often rely on gloss annotations, requiring time-intensive\nlabor and specialized expertise in sign language. Gloss-free methods have\nemerged to address these limitations, but they often depend on external sign\nlanguage data or dictionaries, failing to completely eliminate the need for\ngloss annotations. There is a clear demand for a comprehensive approach that\ncan supplant gloss annotations and be utilized for both Sign Language\nTranslation (SLT) and Sign Language Production (SLP). We introduce Universal\nGloss-level Representation (UniGloR), a unified and self-supervised solution\nfor both SLT and SLP, trained on multiple datasets including PHOENIX14T,\nHow2Sign, and NIASL2021. Our results demonstrate UniGloR's effectiveness in the\ntranslation and production tasks. We further report an encouraging result for\nthe Sign Language Recognition (SLR) on previously unseen data. Our study\nsuggests that self-supervised learning can be made in a unified manner, paving\nthe way for innovative and practical applications in future research.\n","authors":["Eui Jun Hwang","Sukmin Cho","Huije Lee","Youngwoo Yoon","Jong C. 
Park"],"pdf_url":"https://arxiv.org/pdf/2407.02854v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.02280v2","updated":"2024-07-03T07:12:34Z","published":"2024-07-02T14:08:55Z","title":"FedIA: Federated Medical Image Segmentation with Heterogeneous\n Annotation Completeness","summary":" Federated learning has emerged as a compelling paradigm for medical image\nsegmentation, particularly in light of increasing privacy concerns. However,\nmost of the existing research relies on relatively stringent assumptions\nregarding the uniformity and completeness of annotations across clients.\nContrary to this, this paper highlights a prevalent challenge in medical\npractice: incomplete annotations. Such annotations can introduce incorrectly\nlabeled pixels, potentially undermining the performance of neural networks in\nsupervised learning. To tackle this issue, we introduce a novel solution, named\nFedIA. Our insight is to conceptualize incomplete annotations as noisy data\n(i.e., low-quality data), with a focus on mitigating their adverse effects. We\nbegin by evaluating the completeness of annotations at the client level using a\ndesigned indicator. Subsequently, we enhance the influence of clients with more\ncomprehensive annotations and implement corrections for incomplete ones,\nthereby ensuring that models are trained on accurate data. Our method's\neffectiveness is validated through its superior performance on two extensively\nused medical image segmentation datasets, outperforming existing solutions. The\ncode is available at https://github.com/HUSTxyy/FedIA.\n","authors":["Yangyang Xiang","Nannan Wu","Li Yu","Xin Yang","Kwang-Ting Cheng","Zengqiang Yan"],"pdf_url":"https://arxiv.org/pdf/2407.02280v2.pdf","comment":"Early accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.02853v1","updated":"2024-07-03T07:11:18Z","published":"2024-07-03T07:11:18Z","title":"Plant Doctor: A hybrid machine learning and image segmentation software\n to quantify plant damage in video footage","summary":" Artificial intelligence has significantly advanced the automation of\ndiagnostic processes, benefiting various fields including agriculture. This\nstudy introduces an AI-based system for the automatic diagnosis of urban street\nplants using video footage obtained with accessible camera devices. The system\naims to monitor plant health on a day-to-day basis, aiding in the control of\ndisease spreading in urban areas. By combining two machine vision algorithms,\nYOLOv8 and DeepSORT, the system efficiently identifies and tracks individual\nleaves, extracting the optimal images for health analysis. YOLOv8, chosen for\nits speed and computational efficiency, locates leaves, while DeepSORT ensures\nrobust tracking in complex environments. For detailed health assessment,\nDeepLabV3Plus, a convolutional neural network, is employed to segment and\nquantify leaf damage caused by bacteria, pests, and fungi. The hybrid system,\nnamed Plant Doctor, has been trained and validated using a diverse dataset\nincluding footage from Tokyo urban plants. The results demonstrate the\nrobustness and accuracy of the system in diagnosing leaf damage, with potential\napplications in large scale urban flora illness monitoring. 
This approach\nprovides a non-invasive, efficient, and scalable solution for urban tree health\nmanagement, supporting sustainable urban ecosystems.\n","authors":["Marc Josep Montagut Marques","Liu Mingxin","Kuri Thomas Shiojiri","Tomika Hagiwara","Kayo Hirose","Kaori Shiojiri","Shinjiro Umezu"],"pdf_url":"https://arxiv.org/pdf/2407.02853v1.pdf","comment":"29 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.01851v2","updated":"2024-07-03T07:01:30Z","published":"2024-07-01T23:32:25Z","title":"Meerkat: Audio-Visual Large Language Model for Grounding in Space and\n Time","summary":" Leveraging Large Language Models' remarkable proficiency in text-based tasks,\nrecent works on Multi-modal LLMs (MLLMs) extend them to other modalities like\nvision and audio. However, the progress in these directions has been mostly\nfocused on tasks that only require a coarse-grained understanding of the\naudio-visual semantics. We present Meerkat, an audio-visual LLM equipped with a\nfine-grained understanding of image and audio both spatially and temporally.\nWith a new modality alignment module based on optimal transport and a\ncross-attention module that enforces audio-visual consistency, Meerkat can\ntackle challenging tasks such as audio referred image grounding, image guided\naudio temporal localization, and audio-visual fact-checking. Moreover, we\ncarefully curate a large dataset AVFIT that comprises 3M instruction tuning\nsamples collected from open-source datasets, and introduce MeerkatBench that\nunifies five challenging audio-visual tasks. We achieve state-of-the-art\nperformance on all these downstream tasks with a relative improvement of up to\n37.12%.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Subhrajyoti Dasgupta","Jun Chen","Mohamed Elhoseiny","Ruohan Gao","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2407.01851v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2208.05617v2","updated":"2024-07-03T06:50:39Z","published":"2022-08-11T02:57:30Z","title":"Language-Guided Face Animation by Recurrent StyleGAN-based Generator","summary":" Recent works on language-guided image manipulation have shown great power of\nlanguage in providing rich semantics, especially for face images. However, the\nother natural information, motions, in language is less explored. In this\npaper, we leverage the motion information and study a novel task,\nlanguage-guided face animation, that aims to animate a static face image with\nthe help of languages. To better utilize both semantics and motions from\nlanguages, we propose a simple yet effective framework. Specifically, we\npropose a recurrent motion generator to extract a series of semantic and motion\ninformation from the language and feed it along with visual information to a\npre-trained StyleGAN to generate high-quality frames. To optimize the proposed\nframework, three carefully designed loss functions are proposed including a\nregularization loss to keep the face identity, a path length regularization\nloss to ensure motion smoothness, and a contrastive loss to enable video\nsynthesis with various language guidance in one single model. Extensive\nexperiments with both qualitative and quantitative evaluations on diverse\ndomains (\\textit{e.g.,} human face, anime face, and dog face) demonstrate the\nsuperiority of our model in generating high-quality and realistic videos from\none still image with the guidance of language. 
Code will be available at\nhttps://github.com/TiankaiHang/language-guided-animation.git.\n","authors":["Tiankai Hang","Huan Yang","Bei Liu","Jianlong Fu","Xin Geng","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2208.05617v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02846v1","updated":"2024-07-03T06:47:58Z","published":"2024-07-03T06:47:58Z","title":"Multi-Task Domain Adaptation for Language Grounding with 3D Objects","summary":" The existing works on object-level language grounding with 3D objects mostly\nfocus on improving performance by utilizing the off-the-shelf pre-trained\nmodels to capture features, such as viewpoint selection or geometric priors.\nHowever, they have failed to consider exploring the cross-modal representation\nof language-vision alignment in the cross-domain field. To answer this problem,\nwe propose a novel method called Domain Adaptation for Language Grounding\n(DA4LG) with 3D objects. Specifically, the proposed DA4LG consists of a visual\nadapter module with multi-task learning to realize vision-language alignment by\ncomprehensive multimodal feature representation. Experimental results\ndemonstrate that DA4LG competitively performs across visual and non-visual\nlanguage descriptions, independent of the completeness of observation. DA4LG\nachieves state-of-the-art performance in the single-view setting and multi-view\nsetting with the accuracy of 83.8% and 86.8% respectively in the language\ngrounding benchmark SNARE. The simulation experiments show the well-practical\nand generalized performance of DA4LG compared to the existing methods. Our\nproject is available at https://sites.google.com/view/da4lg.\n","authors":["Penglei Sun","Yaoxian Song","Xinglin Pan","Peijie Dong","Xiaofei Yang","Qiang Wang","Zhixu Li","Tiefeng Li","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2407.02846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01926v2","updated":"2024-07-03T06:40:52Z","published":"2024-07-02T03:43:39Z","title":"Chemical Shift Encoding based Double Bonds Quantification in\n Triglycerides using Deep Image Prior","summary":" This study evaluated a deep learning-based method using Deep Image Prior\n(DIP) to quantify triglyceride double bonds from chemical-shift encoded\nmulti-echo gradient echo images without network training. We employed a cost\nfunction based on signal constraints to iteratively update the neural network\non a single dataset. The method was validated using phantom experiments and in\nvivo scans. Results showed close alignment between measured and reference\ndouble bond values, with phantom experiments yielding a Pearson correlation\ncoefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in\nsubcutaneous fat. We conclude that Deep Image Prior shows feasibility for\nquantifying double bonds and fatty acid content from chemical-shift encoded\nmulti-echo MRI.\n","authors":["Chaoxing Huang","Ziqiang Yu","Zijian Gao","Qiuyi Shen","Queenie Chan","Vincent Wai-Sun Wong","Winnie Chiu-Wing Chu","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2407.01926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02844v1","updated":"2024-07-03T06:40:26Z","published":"2024-07-03T06:40:26Z","title":"Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast\n Cancer Segmentation and Identification","summary":" Breast cancer poses a profound threat to lives globally, claiming numerous\nlives each year. Therefore, timely detection is crucial for early intervention\nand improved chances of survival. 
Accurately diagnosing and classifying breast\ntumors using ultrasound images is a persistent challenge in medicine, demanding\ncutting-edge solutions for improved treatment strategies. This research\nintroduces multiattention-enhanced deep learning (DL) frameworks designed for\nthe classification and segmentation of breast cancer tumors from ultrasound\nimages. A spatial channel attention mechanism is proposed for segmenting tumors\nfrom ultrasound images, utilizing a novel LinkNet DL framework with an\nInceptionResNet backbone. Following this, the paper proposes a deep\nconvolutional neural network with an integrated multi-attention framework\n(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal.\nFrom experimental results, it is observed that the segmentation model has\nrecorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also\nachieved high Intersection over Union (IoU) and Dice Coefficient scores of\n96.9% and 97.2%, respectively. Similarly, the classification model has attained\nan accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification\nframework has achieved outstanding F1-Score, precision, and recall values of\n99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early\ndetection and accurate classification of breast cancer, this proposed work\nsignificantly advances the field of medical image analysis, potentially\nimproving diagnostic precision and patient outcomes.\n","authors":["Pandiyaraju V","Shravan Venkatraman","Pavan Kumar S","Santhosh Malarvannan","Kannan A"],"pdf_url":"https://arxiv.org/pdf/2407.02844v1.pdf","comment":"32 pages, 18 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.02842v1","updated":"2024-07-03T06:39:18Z","published":"2024-07-03T06:39:18Z","title":"MindBench: A Comprehensive Benchmark for Mind Map Structure Recognition\n and Analysis","summary":" Multimodal Large Language Models (MLLM) have made significant progress in the\nfield of document analysis. Despite this, existing benchmarks typically focus\nonly on extracting text and simple layout information, neglecting the complex\ninteractions between elements in structured documents such as mind maps and\nflowcharts. To address this issue, we introduce the new benchmark named\nMindBench, which not only includes meticulously constructed bilingual authentic\nor synthetic images, detailed annotations, evaluation metrics and baseline\nmodels, but also specifically designs five types of structured understanding\nand parsing tasks. These tasks include full parsing, partial parsing,\nposition-related parsing, structured Visual Question Answering (VQA), and\nposition-related VQA, covering key areas such as text recognition, spatial\nawareness, relationship discernment, and structured parsing. Extensive\nexperimental results demonstrate the substantial potential and significant room\nfor improvement in current models' ability to handle structured document\ninformation. We anticipate that the launch of MindBench will significantly\nadvance research and application development in structured document analysis\ntechnology. 
MindBench is available at:\nhttps://miasanlei.github.io/MindBench.github.io/.\n","authors":["Lei Chen","Feng Yan","Yujie Zhong","Shaoxiang Chen","Zequn Jie","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2407.02842v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2407.02835v1","updated":"2024-07-03T06:25:20Z","published":"2024-07-03T06:25:20Z","title":"A Pairwise DomMix Attentive Adversarial Network for Unsupervised Domain\n Adaptive Object Detection","summary":" Unsupervised Domain Adaptive Object Detection (DAOD) could adapt a model\ntrained on a source domain to an unlabeled target domain for object detection.\nExisting unsupervised DAOD methods usually perform feature alignments from the\ntarget to the source. Unidirectional domain transfer would omit information\nabout the target samples and result in suboptimal adaptation when there are\nlarge domain shifts. Therefore, we propose a pairwise attentive adversarial\nnetwork with a Domain Mixup (DomMix) module to mitigate the aforementioned\nchallenges. Specifically, a deep-level mixup is employed to construct an\nintermediate domain that allows features from both domains to share their\ndifferences. Then a pairwise attentive adversarial network is applied with\nattentive encoding on both image-level and instance-level features at different\nscales and optimizes domain alignment by adversarial learning. This allows the\nnetwork to focus on regions with disparate contextual information and learn\ntheir similarities between different domains. Extensive experiments are\nconducted on several benchmark datasets, demonstrating the superiority of our\nproposed method.\n","authors":["Jie Shao","Jiacheng Wu","Wenzhong Shen","Cheng Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02835v1.pdf","comment":"has published on IEEE Signal Processing Letters, 2023"},{"id":"http://arxiv.org/abs/2407.02832v1","updated":"2024-07-03T06:19:42Z","published":"2024-07-03T06:19:42Z","title":"Style Alignment based Dynamic Observation Method for UAV-View\n Geo-localization","summary":" The task of UAV-view geo-localization is to estimate the localization of a\nquery satellite/drone image by matching it against a reference dataset\nconsisting of drone/satellite images. Though tremendous strides have been made\nin feature alignment between satellite and drone views, vast inter- and\nintra-class differences due to changes in viewpoint, altitude, and lighting\nremain a huge challenge. In this paper, a style alignment based dynamic\nobservation method for UAV-view geo-localization is proposed to meet the above\nchallenges from two perspectives: visual style transformation and surrounding\nnoise control. Specifically, we introduce a style alignment strategy to\ntransform the diverse visual styles of drone-view images into a unified\nsatellite-image visual style. Then a dynamic observation module is designed to\nevaluate the spatial distribution of images by mimicking human observation\nhabits. It features a hierarchical attention block (HAB) with a\ndual-square-ring stream structure to reduce surrounding noise and geographical\ndeformation. In addition, we propose a deconstruction loss to push away\nfeatures of different geo-tags and squeeze knowledge from unmatched images by\ncorrelation calculation. The experimental results demonstrate the\nstate-of-the-art performance of our model on benchmarked datasets. 
In\nparticular, when compared to the prior art on University-1652, our results\nsurpass the best of them (FSRA), while only requiring 2x fewer parameters. Code\nwill be released at https://github.com/Xcco1/SA\\_DOM\n","authors":["Jie Shao","LingHao Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.02832v1.pdf","comment":"has published on IEEE Transactions on Geoscience and Remote Sensing,\n 2023"},{"id":"http://arxiv.org/abs/2407.02830v1","updated":"2024-07-03T06:17:41Z","published":"2024-07-03T06:17:41Z","title":"A Radiometric Correction based Optical Modeling Approach to Removing\n Reflection Noise in TLS Point Clouds of Urban Scenes","summary":" Point clouds are vital in computer vision tasks such as 3D reconstruction,\nautonomous driving, and robotics. However, TLS-acquired point clouds often\ncontain virtual points from reflective surfaces, causing disruptions. This\nstudy presents a reflection noise elimination algorithm for TLS point clouds.\nOur innovative reflection plane detection algorithm, based on geometry-optical\nmodels and physical properties, identifies and categorizes reflection points\nper optical reflection theory. We've adapted the LSFH feature descriptor to\nretain reflection features, mitigating interference from symmetrical\narchitectural structures. By incorporating the Hausdorff feature distance, the\nalgorithm enhances resilience to ghosting and deformation, improving virtual\npoint detection accuracy. Extensive experiments on the 3DRN benchmark dataset,\nfeaturing diverse urban environments with virtual TLS reflection noise, show\nour algorithm improves precision and recall rates for 3D points in reflective\nregions by 57.03\\% and 31.80\\%, respectively. Our method achieves a 9.17\\%\nbetter outlier detection rate and 5.65\\% higher accuracy than leading methods.\nAccess the 3DRN dataset at (https://github.com/Tsuiky/3DRN).\n","authors":["Li Fang","Tianyu Li","Yanghong Lin","Shudong Zhou","Wei Yao"],"pdf_url":"https://arxiv.org/pdf/2407.02830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17376v3","updated":"2024-07-03T06:16:31Z","published":"2024-02-27T10:13:30Z","title":"Accelerating Diffusion Sampling with Optimized Time Steps","summary":" Diffusion probabilistic models (DPMs) have shown remarkable performance in\nhigh-resolution image synthesis, but their sampling efficiency is still to be\ndesired due to the typically large number of sampling steps. Recent\nadvancements in high-order numerical ODE solvers for DPMs have enabled the\ngeneration of high-quality images with much fewer sampling steps. While this is\na significant development, most sampling methods still employ uniform time\nsteps, which is not optimal when using a small number of steps. To address this\nissue, we propose a general framework for designing an optimization problem\nthat seeks more appropriate time steps for a specific numerical ODE solver for\nDPMs. This optimization problem aims to minimize the distance between the\nground-truth solution to the ODE and an approximate solution corresponding to\nthe numerical solver. It can be efficiently solved using the constrained trust\nregion method, taking less than $15$ seconds. 
Our extensive experiments on both\nunconditional and conditional sampling using pixel- and latent-space DPMs\ndemonstrate that, when combined with the state-of-the-art sampling method\nUniPC, our optimized time steps significantly improve image generation\nperformance in terms of FID scores for datasets such as CIFAR-10 and ImageNet,\ncompared to using uniform time steps.\n","authors":["Shuchen Xue","Zhaoqiang Liu","Fei Chen","Shifeng Zhang","Tianyang Hu","Enze Xie","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2402.17376v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2403.11107v3","updated":"2024-07-03T06:10:52Z","published":"2024-03-17T06:21:21Z","title":"Self-supervised co-salient object detection via feature correspondence\n at multiple scales","summary":" Our paper introduces a novel two-stage self-supervised approach for detecting\nco-occurring salient objects (CoSOD) in image groups without requiring\nsegmentation annotations. Unlike existing unsupervised methods that rely solely\non patch-level information (e.g. clustering patch descriptors) or on\ncomputation-heavy off-the-shelf components for CoSOD, our lightweight model\nleverages feature correspondences at both patch and region levels,\nsignificantly improving prediction performance. In the first stage, we train a\nself-supervised network that detects co-salient regions by computing local\npatch-level feature correspondences across images. We obtain the segmentation\npredictions using confidence-based adaptive thresholding. In the next stage, we\nrefine these intermediate segmentations by eliminating the detected regions\n(within each image) whose averaged feature representations are dissimilar to\nthe foreground feature representation averaged across all the cross-attention\nmaps (from the previous stage). Extensive experiments on three CoSOD benchmark\ndatasets show that our self-supervised model outperforms the corresponding\nstate-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model\nhas a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably,\nour self-supervised model also outperforms several recent fully supervised\nCoSOD models on the three test datasets (e.g., on the CoCA dataset, our model\nhas a 4.6% F-measure gain over a recent supervised CoSOD model).\n","authors":["Souradeep Chakraborty","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2403.11107v3.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2406.05184v2","updated":"2024-07-03T06:00:50Z","published":"2024-06-07T18:04:21Z","title":"The Unmet Promise of Synthetic Training Images: Using Retrieved Real\n Images Performs Better","summary":" Generative text-to-image models enable us to synthesize unlimited amounts of\nimages in a controllable manner, spurring many recent efforts to train vision\nmodels with synthetic data. However, every synthetic image ultimately\noriginates from the upstream data used to train the generator. What additional\nvalue does the intermediate generator provide over directly training on\nrelevant parts of the upstream data? Grounding this question in the setting of\nimage classification, we compare finetuning on task-relevant, targeted\nsynthetic data generated by Stable Diffusion -- a generative model trained on\nthe LAION-2B dataset -- against finetuning on targeted real images retrieved\ndirectly from LAION-2B. 
We show that while synthetic data can benefit some\ndownstream tasks, it is universally matched or outperformed by real data from\nour simple retrieval baseline. Our analysis suggests that this underperformance\nis partially due to generator artifacts and inaccurate task-relevant visual\ndetails in the synthetic images. Overall, we argue that retrieval is a critical\nbaseline to consider when training with synthetic data -- a baseline that\ncurrent methods do not yet surpass. We release code, data, and models at\nhttps://github.com/scottgeng00/unmet-promise.\n","authors":["Scott Geng","Cheng-Yu Hsieh","Vivek Ramanujan","Matthew Wallingford","Chun-Liang Li","Pang Wei Koh","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2406.05184v2.pdf","comment":"Correspondence to sgeng at cs dot washington dot edu. RK and PWK\n equally advised the project"},{"id":"http://arxiv.org/abs/2407.02394v2","updated":"2024-07-03T05:36:23Z","published":"2024-07-02T16:12:37Z","title":"Similarity Distance-Based Label Assignment for Tiny Object Detection","summary":" Tiny object detection is becoming one of the most challenging tasks in\ncomputer vision because of the limited object size and lack of information. The\nlabel assignment strategy is a key factor affecting the accuracy of object\ndetection. Although there are some effective label assignment strategies for\ntiny objects, most of them focus on reducing the sensitivity to the bounding\nboxes to increase the number of positive samples and have some fixed\nhyperparameters that need to be set. However, more positive samples may not necessarily\nlead to better detection results; in fact, excessive positive samples may lead\nto more false positives. In this paper, we introduce a simple but effective\nstrategy named the Similarity Distance (SimD) to evaluate the similarity\nbetween bounding boxes. This proposed strategy not only considers both location\nand shape similarity but also learns hyperparameters adaptively, ensuring that\nit can adapt to different datasets and various object sizes in a dataset. Our\napproach can be simply applied in common anchor-based detectors in place of the\nIoU for label assignment and Non Maximum Suppression (NMS). Extensive\nexperiments on four mainstream tiny object detection datasets demonstrate the\nsuperior performance of our method; in particular, it achieves 1.8 AP points\nhigher overall and 4.1 AP points higher on very tiny objects than the state-of-the-art competitors on AI-TOD. Code is\navailable at: \\url{https://github.com/cszzshi/SimD}.\n","authors":["Shuohao Shi","Qiang Fang","Tong Zhao","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02394v2.pdf","comment":"8 pages, 4 figures, this paper has been accepted by IEEE/RSJ\n International Conference on Intelligent Robots and Systems"},{"id":"http://arxiv.org/abs/2407.01012v3","updated":"2024-07-03T05:36:00Z","published":"2024-07-01T06:52:34Z","title":"Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural\n Network Performance","summary":" We propose the Swish-T family, an enhancement of the existing non-monotonic\nactivation function Swish. Swish-T is defined by adding a Tanh bias to the\noriginal Swish function. This modification creates a family of Swish-T\nvariants, each designed to excel in different tasks, showcasing specific\nadvantages depending on the application context. The Tanh bias allows for\nbroader acceptance of negative values during initial training stages, offering\na smoother non-monotonic curve than the original Swish. 
We ultimately propose\nthe Swish-T$_{\\textbf{C}}$ function, while Swish-T and Swish-T$_{\\textbf{B}}$,\nbyproducts of Swish-T$_{\\textbf{C}}$, also demonstrate satisfactory\nperformance. Furthermore, our ablation study shows that using\nSwish-T$_{\\textbf{C}}$ as a non-parametric function can still achieve high\nperformance. The superiority of the Swish-T family has been empirically\ndemonstrated across various models and benchmark datasets, including MNIST,\nFashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at\nhttps://github.com/ictseoyoungmin/Swish-T-pytorch.\n","authors":["Youngmin Seo","Jinha Kim","Unsang Park"],"pdf_url":"https://arxiv.org/pdf/2407.01012v3.pdf","comment":"11 pages, 6 figures Revised the derivative of the sigmoid function\n from 1-sigmoid to sigmoid(1-sigmoid) for correctness.Updated related\n equations in Section 3.2. Conclusions to Conclusion in Section 6"},{"id":"http://arxiv.org/abs/2406.11445v3","updated":"2024-07-03T05:32:37Z","published":"2024-06-17T11:57:14Z","title":"Solving the Inverse Problem of Electrocardiography for Cardiac Digital\n Twins: A Survey","summary":" Cardiac digital twins are personalized virtual representations used to\nunderstand complex heart mechanisms. Solving the ECG inverse problem is crucial\nfor accurate virtual heart modelling, enabling the derivation of internal\nelectrical activity information from recorded surface potentials. Despite\nchallenges from cardiac complexity, noisy ECG data, and computational\nefficiency, recent advancements hold significant promise for enhancing virtual\nheart modelling, ultimately advancing precision medicine in cardiology. This\npaper aims to provide a comprehensive review of the methods of solving ECG\ninverse problem, the validation strategies, the clinical applications, and\nfuture perspectives. For the computing methodologies, we broadly classify\nstate-of-the-art approaches into two categories: deterministic and\nprobabilistic methods, including conventional and deep learning-based\ntechniques. Integrating physics laws with deep learning models holds promise,\nbut challenges such as capturing dynamic electrophysiology accurately,\naccessing accurate domain knowledge, and quantifying prediction uncertainty\npersist. Integrating models into clinical workflows while ensuring\ninterpretability and usability for healthcare professionals is essential.\nOvercoming these challenges will drive further research in cardiac digital\ntwins.\n","authors":["Lei Li","Julia Camps","Blanca Rodriguez","Vicente Grau"],"pdf_url":"https://arxiv.org/pdf/2406.11445v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14760v3","updated":"2024-07-03T05:21:38Z","published":"2024-03-21T18:02:20Z","title":"Can 3D Vision-Language Models Truly Understand Natural Language?","summary":" Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new\navenues for human interaction with embodied agents or robots using natural\nlanguage. Despite this progress, we find a notable limitation: existing 3D-VL\nmodels exhibit sensitivity to the styles of language input, struggling to\nunderstand sentences with the same semantic meaning but written in different\nvariants. This observation raises a critical question: Can 3D vision-language\nmodels truly understand natural language? 
To test the language\nunderstandability of 3D-VL models, we first propose a language robustness task\nfor systematically assessing 3D-VL models across various tasks, benchmarking\ntheir performance when presented with different language style variants.\nImportantly, these variants are commonly encountered in applications requiring\ndirect interaction with humans, such as embodied robotics, given the diversity\nand unpredictability of human language. We propose a 3D Language Robustness\nDataset, designed based on the characteristics of human language, to facilitate\nthe systematic study of robustness. Our comprehensive evaluation uncovers a\nsignificant drop in the performance of all existing models across various 3D-VL\ntasks. Even the state-of-the-art 3D-LLM fails to understand some variants of\nthe same sentences. Further in-depth analysis suggests that the existing models\nhave a fragile and biased fusion module, which stems from the low diversity of\nthe existing dataset. Finally, we propose a training-free module driven by LLM,\nwhich improves language robustness. Datasets and code will be available at\ngithub.\n","authors":["Weipeng Deng","Jihan Yang","Runyu Ding","Jiahui Liu","Yijiang Li","Xiaojuan Qi","Edith Ngai"],"pdf_url":"https://arxiv.org/pdf/2403.14760v3.pdf","comment":"https://github.com/VincentDENGP/3D-LR"},{"id":"http://arxiv.org/abs/2407.02814v1","updated":"2024-07-03T05:19:45Z","published":"2024-07-03T05:19:45Z","title":"Images Speak Louder than Words: Understanding and Mitigating Bias in\n Vision-Language Model from a Causal Mediation Perspective","summary":" Vision-language models (VLMs) pre-trained on extensive datasets can\ninadvertently learn biases by correlating gender information with specific\nobjects or scenarios. Current methods, which focus on modifying inputs and\nmonitoring changes in the model's output probability scores, often struggle to\ncomprehensively understand bias from the perspective of model components. We\npropose a framework that incorporates causal mediation analysis to measure and\nmap the pathways of bias generation and propagation within VLMs. This approach\nallows us to identify the direct effects of interventions on model bias and the\nindirect effects of interventions on bias mediated through different model\ncomponents. Our results show that image features are the primary contributors\nto bias, with significantly higher impacts than text features, specifically\naccounting for 32.57% and 12.63% of the bias in the MSCOCO and PASCAL-SENTENCE\ndatasets, respectively. Notably, the image encoder's contribution surpasses\nthat of the text encoder and the deep fusion encoder. Further experimentation\nconfirms that contributions from both language and vision modalities are\naligned and non-conflicting. 
Consequently, focusing on blurring gender\nrepresentations within the image encoder, which contributes most to the model\nbias, reduces bias efficiently by 22.03% and 9.04% in the MSCOCO and\nPASCAL-SENTENCE datasets, respectively, with minimal performance loss or\nincreased computational demands.\n","authors":["Zhaotian Weng","Zijun Gao","Jerone Andrews","Jieyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.02814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02813v1","updated":"2024-07-03T05:17:26Z","published":"2024-07-03T05:17:26Z","title":"Data Overfitting for On-Device Super-Resolution with Dynamic Algorithm\n and Compiler Co-Design","summary":" Deep neural networks (DNNs) are frequently employed in a variety of computer\nvision applications. Nowadays, an emerging trend in the current video\ndistribution system is to take advantage of DNN's overfitting properties to\nperform video resolution upscaling. By splitting videos into chunks and\napplying a super-resolution (SR) model to overfit each chunk, this scheme of SR\nmodels plus video chunks is able to replace traditional video transmission to\nenhance video quality and transmission efficiency. However, many models and\nchunks are needed to guarantee high performance, which leads to tremendous\noverhead on model switching and memory footprints at the user end. To resolve\nsuch problems, we propose a Dynamic Deep neural network assisted by a\nContent-Aware data processing pipeline to reduce the model number down to one\n(Dy-DCA), which helps promote performance while conserving computational\nresources. Additionally, to achieve real acceleration on the user end, we\ndesigned a framework that optimizes dynamic features (e.g., dynamic shapes,\nsizes, and control flow) in Dy-DCA to enable a series of compilation\noptimizations, including fused code generation, static execution planning, etc.\nBy employing such techniques, our method achieves better PSNR and real-time\nperformance (33 FPS) on an off-the-shelf mobile phone. Meanwhile, assisted by\nour compilation optimization, we achieve a 1.7$\\times$ speedup while saving up\nto 1.61$\\times$ memory consumption. Code available in\nhttps://github.com/coulsonlee/Dy-DCA-ECCV2024.\n","authors":["Gen Li","Zhihao Shu","Jie Ji","Minghai Qin","Fatemeh Afghah","Wei Niu","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2407.02813v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2401.01456v3","updated":"2024-07-03T04:18:49Z","published":"2024-01-02T22:46:12Z","title":"ColorizeDiffusion: Adjustable Sketch Colorization with Reference Image\n and Text","summary":" Diffusion models have recently demonstrated their effectiveness in generating\nextremely high-quality images and are now utilized in a wide range of\napplications, including automatic sketch colorization. Although many methods\nhave been developed for guided sketch colorization, there has been limited\nexploration of the potential conflicts between image prompts and sketch inputs,\nwhich can lead to severe deterioration in the results. Therefore, this paper\nexhaustively investigates reference-based sketch colorization models that aim\nto colorize sketch images using reference color images. We specifically\ninvestigate two critical aspects of reference-based diffusion models: the\n\"distribution problem\", which is a major shortcoming compared to text-based\ncounterparts, and the capability in zero-shot sequential text-based\nmanipulation. 
We introduce two variations of an image-guided latent diffusion\nmodel utilizing different image tokens from the pre-trained CLIP image encoder\nand propose corresponding manipulation methods to adjust their results\nsequentially using weighted text inputs. We conduct comprehensive evaluations\nof our models through qualitative and quantitative experiments as well as a\nuser study.\n","authors":["Dingkun Yan","Liang Yuan","Erwin Wu","Yuma Nishioka","Issei Fujishiro","Suguru Saito"],"pdf_url":"https://arxiv.org/pdf/2401.01456v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02797v1","updated":"2024-07-03T03:57:05Z","published":"2024-07-03T03:57:05Z","title":"Solving Motion Planning Tasks with a Scalable Generative Model","summary":" As autonomous driving systems being deployed to millions of vehicles, there\nis a pressing need of improving the system's scalability, safety and reducing\nthe engineering cost. A realistic, scalable, and practical simulator of the\ndriving world is highly desired. In this paper, we present an efficient\nsolution based on generative models which learns the dynamics of the driving\nscenes. With this model, we can not only simulate the diverse futures of a\ngiven driving scenario but also generate a variety of driving scenarios\nconditioned on various prompts. Our innovative design allows the model to\noperate in both full-Autoregressive and partial-Autoregressive modes,\nsignificantly improving inference and training speed without sacrificing\ngenerative capability. This efficiency makes it ideal for being used as an\nonline reactive environment for reinforcement learning, an evaluator for\nplanning policies, and a high-fidelity simulator for testing. We evaluated our\nmodel against two real-world datasets: the Waymo motion dataset and the nuPlan\ndataset. On the simulation realism and scene generation benchmark, our model\nachieves the state-of-the-art performance. And in the planning benchmarks, our\nplanner outperforms the prior arts. We conclude that the proposed generative\nmodel may serve as a foundation for a variety of motion planning tasks,\nincluding data generation, simulation, planning, and online training. Source\ncode is public at https://github.com/HorizonRobotics/GUMP/\n","authors":["Yihan Hu","Siqi Chai","Zhening Yang","Jingyu Qian","Kun Li","Wenxin Shao","Haichao Zhang","Wei Xu","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02797v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2406.16851v2","updated":"2024-07-03T03:55:59Z","published":"2024-06-24T17:58:03Z","title":"Losing Visual Needles in Image Haystacks: Vision Language Models are\n Easily Distracted in Short and Long Contexts","summary":" We present LoCoVQA, a dynamic benchmark generator for evaluating long-context\nextractive reasoning in vision language models (VLMs). LoCoVQA augments test\nexamples for mathematical reasoning, VQA, and character recognition tasks with\nincreasingly long visual contexts composed of both in-distribution and\nout-of-distribution distractor images.\n Across these tasks, a diverse set of VLMs rapidly lose performance as the\nvisual context length grows, often exhibiting a striking logarithmic decay\ntrend. 
This test assesses how well VLMs can ignore irrelevant information when\nanswering queries -- a task that is quite easy for language models (LMs) in the\ntext domain -- demonstrating that current state-of-the-art VLMs lack this\nessential capability for many long-context applications.\n","authors":["Aditya Sharma","Michael Saxon","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2406.16851v2.pdf","comment":"Under review. Minor errata correction in revision"},{"id":"http://arxiv.org/abs/2407.02411v2","updated":"2024-07-03T03:48:18Z","published":"2024-07-02T16:34:14Z","title":"Video Watermarking: Safeguarding Your Video from (Unauthorized)\n Annotations by Video-based LLMs","summary":" The advent of video-based Large Language Models (LLMs) has significantly\nenhanced video understanding. However, it has also raised some safety concerns\nregarding data protection, as videos can be more easily annotated, even without\nauthorization. This paper introduces Video Watermarking, a novel technique to\nprotect videos from unauthorized annotations by such video-based LLMs,\nespecially concerning the video content and description, in response to\nspecific queries. By imperceptibly embedding watermarks into key video frames\nwith multi-modal flow-based losses, our method preserves the viewing experience\nwhile preventing misuse by video-based LLMs. Extensive experiments show that\nVideo Watermarking significantly reduces the comprehensibility of videos with\nvarious video-based LLMs, demonstrating both stealth and robustness. In\nessence, our method provides a solution for securing video content, ensuring\nits integrity and confidentiality in the face of evolving video-based LLMs\ntechnologies.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2407.02411v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.13507"},{"id":"http://arxiv.org/abs/2407.02794v1","updated":"2024-07-03T03:42:33Z","published":"2024-07-03T03:42:33Z","title":"Euler's Elastica Based Cartoon-Smooth-Texture Image Decomposition","summary":" We propose a novel model for decomposing grayscale images into three distinct\ncomponents: the structural part, representing sharp boundaries and regions with\nstrong light-to-dark transitions; the smooth part, capturing soft shadows and\nshades; and the oscillatory part, characterizing textures and noise. To capture\nthe homogeneous structures, we introduce a combination of $L^0$-gradient and\ncurvature regularization on level lines. This new regularization term enforces\nstrong sparsity on the image gradient while reducing the undesirable staircase\neffects as well as preserving the geometry of contours. For the smoothly\nvarying component, we utilize the $L^2$-norm of the Laplacian that favors\nisotropic smoothness. To capture the oscillation, we use the inverse Sobolev\nseminorm. To solve the associated minimization problem, we design an efficient\noperator-splitting algorithm. Our algorithm effectively addresses the\nchallenging non-convex non-smooth problem by separating it into sub-problems.\nEach sub-problem can be solved either directly using closed-form solutions or\nefficiently using the Fast Fourier Transform (FFT). We provide systematic\nexperiments, including ablation and comparison studies, to analyze our model's\nbehaviors and demonstrate its effectiveness as well as efficiency.\n","authors":["Roy Y. 
He","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02793v1","updated":"2024-07-03T03:42:13Z","published":"2024-07-03T03:42:13Z","title":"Learning Positional Attention for Sequential Recommendation","summary":" Self-attention-based networks have achieved remarkable performance in\nsequential recommendation tasks. A crucial component of these models is\npositional encoding. In this study, we delve into the learned positional\nembedding, demonstrating that it often captures the distance between tokens.\nBuilding on this insight, we introduce novel attention models that directly\nlearn positional relations. Extensive experiments reveal that our proposed\nmodels, \\textbf{PARec} and \\textbf{FPARec} outperform previous\nself-attention-based approaches.Our code is available at the link for anonymous\nreview: https://anonymous.4open.science/ r/FPARec-2C55/\n","authors":["Fan Luo","Juan Zhang","Shenghui Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06116v3","updated":"2024-07-03T03:17:06Z","published":"2024-05-09T21:47:46Z","title":"Rethinking Efficient and Effective Point-based Networks for Event Camera\n Classification and Regression: EventMamba","summary":" Event cameras, drawing inspiration from biological systems, efficiently\ndetect changes in ambient light with low latency and high dynamic range while\nconsuming minimal power. The most current approach to processing event data\noften involves converting it into frame-based representations, which is\nwell-established in traditional vision. However, this approach neglects the\nsparsity of event data, loses fine-grained temporal information during the\ntransformation process, and increases the computational burden, making it\nineffective for characterizing event camera properties. In contrast, Point\nCloud is a popular representation for 3D processing and is better suited to\nmatch the sparse and asynchronous nature of the event camera. Nevertheless,\ndespite the theoretical compatibility of point-based methods with event\ncameras, the results show a performance gap that is not yet satisfactory\ncompared to frame-based methods. In order to bridge the performance gap, we\npropose EventMamba, an efficient and effective Point Cloud framework that\nachieves competitive results even compared to the state-of-the-art (SOTA)\nframe-based method in both classification and regression tasks. This notable\naccomplishment is facilitated by our rethinking of the distinction between\nEvent Cloud and Point Cloud, emphasizing effective temporal information\nextraction through optimized network structures. Specifically, EventMamba\nleverages temporal aggregation and State Space Model (SSM) based Mamba boasting\nenhanced temporal information extraction capabilities. Through a hierarchical\nstructure, EventMamba is adept at abstracting local and global spatial features\nand implicit and explicit temporal features. 
By adhering to the lightweight\ndesign principle, EventMamba delivers impressive results with minimal\ncomputational resource utilization, demonstrating its efficiency and\neffectiveness.\n","authors":["Hongwei Ren","Yue Zhou","Jiadong Zhu","Haotian Fu","Yulong Huang","Xiaopeng Lin","Yuetong Fang","Fei Ma","Hao Yu","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.06116v3.pdf","comment":"Extension Journal of TTPOINT and PEPNet, modify the dataset split\n method"},{"id":"http://arxiv.org/abs/2407.02778v1","updated":"2024-07-03T03:10:24Z","published":"2024-07-03T03:10:24Z","title":"Foster Adaptivity and Balance in Learning with Noisy Labels","summary":" Label noise is ubiquitous in real-world scenarios, posing a practical\nchallenge to supervised models due to its effect in hurting the generalization\nperformance of deep neural networks. Existing methods primarily employ the\nsample selection paradigm and usually rely on dataset-dependent prior knowledge\n(\\eg, a pre-defined threshold) to cope with label noise, inevitably degrading\nthe adaptivity. Moreover, existing methods tend to neglect the class balance in\nselecting samples, leading to biased model performance. To this end, we propose\na simple yet effective approach named \\textbf{SED} to deal with label noise in\na \\textbf{S}elf-adaptiv\\textbf{E} and class-balance\\textbf{D} manner.\nSpecifically, we first design a novel sample selection strategy to empower\nself-adaptivity and class balance when identifying clean and noisy data. A\nmean-teacher model is then employed to correct labels of noisy samples.\nSubsequently, we propose a self-adaptive and class-balanced sample re-weighting\nmechanism to assign different weights to detected noisy samples. Finally, we\nadditionally employ consistency regularization on selected clean samples to\nimprove model generalization performance. Extensive experimental results on\nsynthetic and real-world datasets demonstrate the effectiveness and superiority\nof our proposed method. The source code has been made available at\nhttps://github.com/NUST-Machine-Intelligence-Laboratory/SED.\n","authors":["Mengmeng Sheng","Zeren Sun","Tao Chen","Shuchao Pang","Yucheng Wang","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.02778v1.pdf","comment":"accepted by the European Conference on Computer Vision (ECCV), 2024"},{"id":"http://arxiv.org/abs/2406.16449v2","updated":"2024-07-03T03:02:35Z","published":"2024-06-24T08:42:42Z","title":"Evaluating and Analyzing Relationship Hallucinations in LVLMs","summary":" The issue of hallucinations is a prevalent concern in existing Large\nVision-Language Models (LVLMs). Previous efforts have primarily focused on\ninvestigating object hallucinations, which can be easily alleviated by\nintroducing object detectors. However, these efforts neglect hallucinations in\ninter-object relationships, which is essential for visual comprehension. In\nthis work, we introduce R-Bench, a novel benchmark for evaluating Vision\nRelationship Hallucination. R-Bench features image-level questions that focus\non the existence of relationships and instance-level questions that assess\nlocal visual comprehension. We identify three types of relationship\nco-occurrences that lead to hallucinations: relationship-relationship,\nsubject-relationship, and relationship-object. The visual instruction tuning\ndataset's long-tail distribution significantly impacts LVLMs' understanding of\nvisual relationships. 
Furthermore, our analysis reveals that current LVLMs tend\nto disregard visual content and overly rely on the common sense knowledge of\nLarge Language Models. They also struggle with reasoning about spatial\nrelationships based on contextual information.\n","authors":["Mingrui Wu","Jiayi Ji","Oucheng Huang","Jiale Li","Yuhang Wu","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2406.16449v2.pdf","comment":"ICML2024; Project Page:https://github.com/mrwu-mac/R-Bench"},{"id":"http://arxiv.org/abs/2407.02772v1","updated":"2024-07-03T03:01:43Z","published":"2024-07-03T03:01:43Z","title":"Automatic gradient descent with generalized Newton's method","summary":" We propose the generalized Newton's method (GeN) -- a Hessian-informed\napproach that applies to any optimizer such as SGD and Adam, and covers the\nNewton-Raphson method as a sub-case. Our method automatically and dynamically\nselects the learning rate that accelerates the convergence, without the\nintensive tuning of the learning rate scheduler. In practice, out method is\neasily implementable, since it only requires additional forward passes with\nalmost zero computational overhead (in terms of training time and memory cost),\nif the overhead is amortized over many iterations. We present extensive\nexperiments on language and vision tasks (e.g. GPT and ResNet) to showcase that\nGeN optimizers match the state-of-the-art performance, which was achieved with\ncarefully tuned learning rate schedulers. Code to be released at\n\\url{https://github.com/ShiyunXu/AutoGeN}.\n","authors":["Zhiqi Bu","Shiyun Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02769v1","updated":"2024-07-03T02:57:14Z","published":"2024-07-03T02:57:14Z","title":"Fine-Grained Scene Image Classification with Modality-Agnostic Adapter","summary":" When dealing with the task of fine-grained scene image classification, most\nprevious works lay much emphasis on global visual features when doing\nmulti-modal feature fusion. In other words, models are deliberately designed\nbased on prior intuitions about the importance of different modalities. In this\npaper, we present a new multi-modal feature fusion approach named MAA\n(Modality-Agnostic Adapter), trying to make the model learn the importance of\ndifferent modalities in different cases adaptively, without giving a prior\nsetting in the model architecture. More specifically, we eliminate the modal\ndifferences in distribution and then use a modality-agnostic Transformer\nencoder for a semantic-level feature fusion. Our experiments demonstrate that\nMAA achieves state-of-the-art results on benchmarks by applying the same\nmodalities with previous methods. Besides, it is worth mentioning that new\nmodalities can be easily added when using MAA and further boost the\nperformance. 
Code is available at https://github.com/quniLcs/MAA.\n","authors":["Yiqun Wang","Zhao Zhou","Xiangcheng Du","Xingjiao Wu","Yingbin Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2407.02769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02768v1","updated":"2024-07-03T02:54:33Z","published":"2024-07-03T02:54:33Z","title":"Knowledge Transfer with Simulated Inter-Image Erasing for Weakly\n Supervised Semantic Segmentation","summary":" Though adversarial erasing has prevailed in weakly supervised semantic\nsegmentation to help activate integral object regions, existing approaches\nstill suffer from the dilemma of under-activation and over-expansion due to the\ndifficulty in determining when to stop erasing. In this paper, we propose a\n\\textbf{K}nowledge \\textbf{T}ransfer with \\textbf{S}imulated Inter-Image\n\\textbf{E}rasing (KTSE) approach for weakly supervised semantic segmentation to\nalleviate the above problem. In contrast to existing erasing-based methods that\nremove the discriminative part for more object discovery, we propose a\nsimulated inter-image erasing scenario to weaken the original activation by\nintroducing extra object information. Then, object knowledge is transferred\nfrom the anchor image to the consequent less activated localization map to\nstrengthen network localization ability. Considering the adopted bidirectional\nalignment will also weaken the anchor image activation if appropriate\nconstraints are missing, we propose a self-supervised regularization module to\nmaintain the reliable activation in discriminative regions and improve the\ninter-class object boundary recognition for complex images with multiple\ncategories of objects. In addition, we resort to intra-image erasing and\npropose a multi-granularity alignment module to gently enlarge the object\nactivation to boost the object knowledge transfer. Extensive experiments and\nablation studies on PASCAL VOC 2012 and COCO datasets demonstrate the\nsuperiority of our proposed approach. Source codes and models are available at\nhttps://github.com/NUST-Machine-Intelligence-Laboratory/KTSE.\n","authors":["Tao Chen","XiRuo Jiang","Gensheng Pei","Zeren Sun","Yucheng Wang","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.02768v1.pdf","comment":"accepted by the European Conference on Computer Vision (ECCV), 2024"},{"id":"http://arxiv.org/abs/2402.18844v2","updated":"2024-07-03T02:52:33Z","published":"2024-02-29T04:30:39Z","title":"Deep learning for 3D human pose estimation and mesh recovery: A survey","summary":" 3D human pose estimation and mesh recovery have attracted widespread research\ninterest in many areas, such as computer vision, autonomous driving, and\nrobotics. Deep learning on 3D human pose estimation and mesh recovery has\nrecently thrived, with numerous methods proposed to address different problems\nin this area. In this paper, to stimulate future research, we present a\ncomprehensive review of recent progress over the past five years in deep\nlearning methods for this area by delving into over 200 references. To the best\nof our knowledge, this survey is arguably the first to comprehensively cover\ndeep learning methods for 3D human pose estimation, including both\nsingle-person and multi-person approaches, as well as human mesh recovery,\nencompassing methods based on explicit models and implicit representations. 
We\nalso present comparative results on several publicly available datasets,\ntogether with insightful observations and inspiring future research directions.\nA regularly updated project page can be found at\nhttps://github.com/liuyangme/SOTA-3DHPE-HMR.\n","authors":["Yang Liu","Changzhen Qiu","Zhiyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.18844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02763v1","updated":"2024-07-03T02:41:59Z","published":"2024-07-03T02:41:59Z","title":"ADFQ-ViT: Activation-Distribution-Friendly Post-Training Quantization\n for Vision Transformers","summary":" Vision Transformers (ViTs) have exhibited exceptional performance across\ndiverse computer vision tasks, while their substantial parameter size incurs\nsignificantly increased memory and computational demands, impeding effective\ninference on resource-constrained devices. Quantization has emerged as a\npromising solution to mitigate these challenges, yet existing methods still\nsuffer from significant accuracy loss at low-bit. We attribute this issue to\nthe distinctive distributions of post-LayerNorm and post-GELU activations\nwithin ViTs, rendering conventional hardware-friendly quantizers ineffective,\nparticularly in low-bit scenarios. To address this issue, we propose a novel\nframework called Activation-Distribution-Friendly post-training Quantization\nfor Vision Transformers, ADFQ-ViT. Concretely, we introduce the Per-Patch\nOutlier-aware Quantizer to tackle irregular outliers in post-LayerNorm\nactivations. This quantizer refines the granularity of the uniform quantizer to\na per-patch level while retaining a minimal subset of values exceeding a\nthreshold at full-precision. To handle the non-uniform distributions of\npost-GELU activations between positive and negative regions, we design the\nShift-Log2 Quantizer, which shifts all elements to the positive region and then\napplies log2 quantization. Moreover, we present the Attention-score enhanced\nModule-wise Optimization which adjusts the parameters of each quantizer by\nreconstructing errors to further mitigate quantization error. Extensive\nexperiments demonstrate ADFQ-ViT provides significant improvements over various\nbaselines in image classification, object detection, and instance segmentation\ntasks at 4-bit. Specifically, when quantizing the ViT-B model to 4-bit, we\nachieve a 10.23% improvement in Top-1 accuracy on the ImageNet dataset.\n","authors":["Yanfeng Jiang","Ning Sun","Xueshuo Xie","Fei Yang","Tao Li"],"pdf_url":"https://arxiv.org/pdf/2407.02763v1.pdf","comment":"28 pages,9 figures"},{"id":"http://arxiv.org/abs/2406.02559v2","updated":"2024-07-03T02:38:39Z","published":"2024-04-18T03:53:33Z","title":"ShadowRefiner: Towards Mask-free Shadow Removal via Fast Fourier\n Transformer","summary":" Shadow-affected images often exhibit pronounced spatial discrepancies in\ncolor and illumination, consequently degrading various vision applications\nincluding object detection and segmentation systems. To effectively eliminate\nshadows in real-world images while preserving intricate details and producing\nvisually compelling outcomes, we introduce a mask-free Shadow Removal and\nRefinement network (ShadowRefiner) via Fast Fourier Transformer. Specifically,\nthe Shadow Removal module in our method aims to establish effective mappings\nbetween shadow-affected and shadow-free images via spatial and frequency\nrepresentation learning. 
To mitigate the pixel misalignment and further improve\nthe image quality, we propose a novel Fast-Fourier Attention based Transformer\n(FFAT) architecture, where an innovative attention mechanism is designed for\nmeticulous refinement. Our method wins the championship in the Perceptual Track\nand achieves the second best performance in the Fidelity Track of NTIRE 2024\nImage Shadow Removal Challenge. Besides, comprehensive experiment result also\ndemonstrate the compelling effectiveness of our proposed method. The code is\npublicly available: https://github.com/movingforward100/Shadow_R.\n","authors":["Wei Dong","Han Zhou","Yuqiong Tian","Jingke Sun","Xiaohong Liu","Guangtao Zhai","Jun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.02559v2.pdf","comment":"Accepted by CVPR workshop 2024 (NTIRE 2024); Corrected references"},{"id":"http://arxiv.org/abs/2407.02758v1","updated":"2024-07-03T02:23:33Z","published":"2024-07-03T02:23:33Z","title":"Differential Encoding for Improved Representation Learning over Graphs","summary":" Combining the message-passing paradigm with the global attention mechanism\nhas emerged as an effective framework for learning over graphs. The\nmessage-passing paradigm and the global attention mechanism fundamentally\ngenerate node embeddings based on information aggregated from a node's local\nneighborhood or from the whole graph. The most basic and commonly used\naggregation approach is to take the sum of information from a node's local\nneighbourhood or from the whole graph. However, it is unknown if the dominant\ninformation is from a node itself or from the node's neighbours (or the rest of\nthe graph nodes). Therefore, there exists information lost at each layer of\nembedding generation, and this information lost could be accumulated and become\nmore serious when more layers are used in the model. In this paper, we present\na differential encoding method to address the issue of information lost. The\nidea of our method is to encode the differential representation between the\ninformation from a node's neighbours (or the rest of the graph nodes) and that\nfrom the node itself. The obtained differential encoding is then combined with\nthe original aggregated local or global representation to generate the updated\nnode embedding. By integrating differential encodings, the representational\nability of generated node embeddings is improved. The differential encoding\nmethod is empirically evaluated on different graph tasks on seven benchmark\ndatasets. The results show that it is a general method that improves the\nmessage-passing update and the global attention update, advancing the\nstate-of-the-art performance for graph representation learning on these\ndatasets.\n","authors":["Haimin Zhang","Jiahao Xia","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02038v2","updated":"2024-07-03T01:50:48Z","published":"2024-07-02T08:10:37Z","title":"Camera-LiDAR Cross-modality Gait Recognition","summary":" Gait recognition is a crucial biometric identification technique.\nCamera-based gait recognition has been widely applied in both research and\nindustrial fields. LiDAR-based gait recognition has also begun to evolve most\nrecently, due to the provision of 3D structural information. 
However, in\ncertain applications, cameras fail to recognize persons, such as in low-light\nenvironments and long-distance recognition scenarios, where LiDARs work well.\nOn the other hand, the deployment cost and complexity of LiDAR systems limit\nits wider application. Therefore, it is essential to consider cross-modality\ngait recognition between cameras and LiDARs for a broader range of\napplications. In this work, we propose the first cross-modality gait\nrecognition framework between Camera and LiDAR, namely CL-Gait. It employs a\ntwo-stream network for feature embedding of both modalities. This poses a\nchallenging recognition task due to the inherent matching between 3D and 2D\ndata, exhibiting significant modality discrepancy. To align the feature spaces\nof the two modalities, i.e., camera silhouettes and LiDAR points, we propose a\ncontrastive pre-training strategy to mitigate modality discrepancy. To make up\nfor the absence of paired camera-LiDAR data for pre-training, we also introduce\na strategy for generating data on a large scale. This strategy utilizes\nmonocular depth estimated from single RGB images and virtual cameras to\ngenerate pseudo point clouds for contrastive pre-training. Extensive\nexperiments show that the cross-modality gait recognition is very challenging\nbut still contains potential and feasibility with our proposed model and\npre-training strategy. To the best of our knowledge, this is the first work to\naddress cross-modality gait recognition.\n","authors":["Wenxuan Guo","Yingping Liang","Zhiyu Pan","Ziheng Xi","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02038v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2305.17401v4","updated":"2024-07-03T01:46:32Z","published":"2023-05-27T07:59:49Z","title":"A Framework For Refining Text Classification and Object Recognition from\n Academic Articles","summary":" With the widespread use of the internet, it has become increasingly crucial\nto extract specific information from vast amounts of academic articles\nefficiently. Data mining techniques are generally employed to solve this issue.\nHowever, data mining for academic articles is challenging since it requires\nautomatically extracting specific patterns in complex and unstructured layout\ndocuments. Current data mining methods for academic articles employ\nrule-based(RB) or machine learning(ML) approaches. However, using rule-based\nmethods incurs a high coding cost for complex typesetting articles. On the\nother hand, simply using machine learning methods requires annotation work for\ncomplex content types within the paper, which can be costly. Furthermore, only\nusing machine learning can lead to cases where patterns easily recognized by\nrule-based methods are mistakenly extracted. To overcome these issues, from the\nperspective of analyzing the standard layout and typesetting used in the\nspecified publication, we emphasize implementing specific methods for specific\ncharacteristics in academic articles. We have developed a novel Text Block\nRefinement Framework (TBRF), a machine learning and rule-based scheme hybrid.\nWe used the well-known ACL proceeding articles as experimental data for the\nvalidation experiment. 
The experiment shows that our approach achieved over 95%\nclassification accuracy and 90% detection accuracy for tables and figures.\n","authors":["Jinghong Li","Koichi Ota","Wen Gu","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2305.17401v4.pdf","comment":"This paper has been accepted at 'The International Symposium on\n Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)'"},{"id":"http://arxiv.org/abs/2309.05180v2","updated":"2024-07-03T01:43:41Z","published":"2023-09-11T00:32:26Z","title":"What's color got to do with it? Face recognition in grayscale","summary":" State-of-the-art deep CNN face matchers are typically created using extensive\ntraining sets of color face images. Our study reveals that such matchers attain\nvirtually identical accuracy when trained on either grayscale or color versions\nof the training set, even when the evaluation is done using color test images.\nFurthermore, we demonstrate that shallower models, lacking the capacity to\nmodel complex representations, rely more heavily on low-level features such as\nthose associated with color. As a result, they display diminished accuracy when\ntrained with grayscale images. We then consider possible causes for deeper CNN\nface matchers \"not seeing color\". Popular web-scraped face datasets actually\nhave 30 to 60% of their identities with one or more grayscale images. We\nanalyze whether this grayscale element in the training set impacts the accuracy\nachieved, and conclude that it does not. We demonstrate that using only\ngrayscale images for both training and testing achieves accuracy comparable to\nthat achieved using only color images for deeper models. This holds true for\nboth real and synthetic training datasets. HSV color space, which separates\nchroma and luma information, does not improve the network's learning about\ncolor any more than in the RGB color space. We then show that the skin region\nof an individual's images in a web-scraped training set exhibits significant\nvariation in their mapping to color space. This suggests that color carries\nlimited identity-specific information. We also show that when the first\nconvolution layer is restricted to a single filter, models learn a grayscale\nconversion filter and pass a grayscale version of the input color image to the\nnext layer. Finally, we demonstrate that leveraging the lower per-image storage\nfor grayscale to increase the number of images in the training set can improve\naccuracy of the face recognition model.\n","authors":["Aman Bhatta","Domingo Mery","Haiyu Wu","Joyce Annan","Micheal C. King","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2309.05180v2.pdf","comment":"This is replacement version of the previous arxiv submission:\n 2309.05180 (Our Deep CNN Face Matchers Have Developed Achromatopsia). The\n past version is published in CVPRW and available in IEEE proceedings. This\n submitted version is an extension of the conference paper"},{"id":"http://arxiv.org/abs/2407.02744v1","updated":"2024-07-03T01:37:56Z","published":"2024-07-03T01:37:56Z","title":"Highly Accelerated MRI via Implicit Neural Representation Guided\n Posterior Sampling of Diffusion Models","summary":" Reconstructing high-fidelity magnetic resonance (MR) images from\nunder-sampled k-space is a commonly used strategy to reduce scan time. The\nposterior sampling of diffusion models based on the real measurement data holds\nsignificant promise of improved reconstruction accuracy. 
However, traditional\nposterior sampling methods often lack effective data consistency guidance,\nleading to inaccurate and unstable reconstructions. Implicit neural\nrepresentation (INR) has emerged as a powerful paradigm for solving inverse\nproblems by modeling a signal's attributes as a continuous function of spatial\ncoordinates. In this study, we present a novel posterior sampler for diffusion\nmodels using INR, named DiffINR. The INR-based component incorporates both the\ndiffusion prior distribution and the MRI physical model to ensure high data\nfidelity. DiffINR demonstrates superior performance on experimental datasets\nwith remarkable accuracy, even under high acceleration factors (up to R=12 in\nsingle-channel reconstruction). Notably, our proposed framework can be a\ngeneralizable framework to solve inverse problems in other medical imaging\ntasks.\n","authors":["Jiayue Chu","Chenhe Du","Xiyue Lin","Yuyao Zhang","Hongjiang Wei"],"pdf_url":"https://arxiv.org/pdf/2407.02744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02264v2","updated":"2024-07-03T01:24:37Z","published":"2024-07-02T13:40:56Z","title":"SOAF: Scene Occlusion-aware Neural Acoustic Field","summary":" This paper tackles the problem of novel view audio-visual synthesis along an\narbitrary trajectory in an indoor scene, given the audio-video recordings from\nother known trajectories of the scene. Existing methods often overlook the\neffect of room geometry, particularly wall occlusion to sound propagation,\nmaking them less accurate in multi-room environments. In this work, we propose\na new approach called Scene Occlusion-aware Acoustic Field (SOAF) for accurate\nsound generation. Our approach derives a prior for sound energy field using\ndistance-aware parametric sound-propagation modelling and then transforms it\nbased on scene transmittance learned from the input video. We extract features\nfrom the local acoustic field centred around the receiver using a Fibonacci\nSphere to generate binaural audio for novel views with a direction-aware\nattention mechanism. Extensive experiments on the real dataset RWAVS and the\nsynthetic dataset SoundSpaces demonstrate that our method outperforms previous\nstate-of-the-art techniques in audio generation. Project page:\nhttps://github.com/huiyu-gao/SOAF/.\n","authors":["Huiyu Gao","Jiahao Ma","David Ahmedt-Aristizabal","Chuong Nguyen","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02738v1","updated":"2024-07-03T01:20:56Z","published":"2024-07-03T01:20:56Z","title":"ZEAL: Surgical Skill Assessment with Zero-shot Tool Inference Using\n Unified Foundation Model","summary":" Surgical skill assessment is paramount for ensuring patient safety and\nenhancing surgical outcomes. This study addresses the need for efficient and\nobjective evaluation methods by introducing ZEAL (surgical skill assessment\nwith Zero-shot surgical tool segmentation with a unifiEd foundAtion modeL).\nZEAL uses segmentation masks of surgical instruments obtained through a unified\nfoundation model for proficiency assessment. Through zero-shot inference with\ntext prompts, ZEAL predicts segmentation masks, capturing essential features of\nboth instruments and surroundings. Utilizing sparse convolutional neural\nnetworks and segmentation masks, ZEAL extracts feature vectors for foreground\n(instruments) and background. 
Long Short-Term Memory (LSTM) networks encode\ntemporal dynamics, modeling sequential data and dependencies in surgical\nvideos. Combining LSTM-encoded vectors, ZEAL produces a surgical skill score,\noffering an objective measure of proficiency. Comparative analysis with\nconventional methods using open datasets demonstrates ZEAL's superiority,\naffirming its potential in advancing surgical training and evaluation. This\ninnovative approach to surgical skill assessment addresses challenges in\ntraditional supervised learning techniques, paving the way for enhanced\nsurgical care quality and patient outcomes.\n","authors":["Satoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2407.02738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02730v1","updated":"2024-07-03T00:59:03Z","published":"2024-07-03T00:59:03Z","title":"MedVH: Towards Systematic Evaluation of Hallucination for Large Vision\n Language Models in the Medical Context","summary":" Large Vision Language Models (LVLMs) have recently achieved superior\nperformance in various tasks on natural image and text data, which inspires a\nlarge amount of studies for LVLMs fine-tuning and training. Despite their\nadvancements, there has been scant research on the robustness of these models\nagainst hallucination when fine-tuned on smaller datasets. In this study, we\nintroduce a new benchmark dataset, the Medical Visual Hallucination Test\n(MedVH), to evaluate the hallucination of domain-specific LVLMs. MedVH\ncomprises five tasks to evaluate hallucinations in LVLMs within the medical\ncontext, which includes tasks for comprehensive understanding of textual and\nvisual input, as well as long textual response generation. Our extensive\nexperiments with both general and medical LVLMs reveal that, although medical\nLVLMs demonstrate promising performance on standard medical tasks, they are\nparticularly susceptible to hallucinations, often more so than the general\nmodels, raising significant concerns about the reliability of these\ndomain-specific models. For medical LVLMs to be truly valuable in real-world\napplications, they must not only accurately integrate medical knowledge but\nalso maintain robust reasoning abilities to prevent hallucination. Our work\npaves the way for future evaluations of these studies.\n","authors":["Zishan Gu","Changchang Yin","Fenglin Liu","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06302v2","updated":"2024-07-03T00:51:21Z","published":"2024-06-10T14:18:56Z","title":"Unveiling the Safety of GPT-4o: An Empirical Study using Jailbreak\n Attacks","summary":" The recent release of GPT-4o has garnered widespread attention due to its\npowerful general capabilities. While its impressive performance is widely\nacknowledged, its safety aspects have not been sufficiently explored. Given the\npotential societal impact of risky content generated by advanced generative AI\nsuch as GPT-4o, it is crucial to rigorously evaluate its safety. In response to\nthis question, this paper for the first time conducts a rigorous evaluation of\nGPT-4o against jailbreak attacks. Specifically, this paper adopts a series of\nmulti-modal and uni-modal jailbreak attacks on 4 commonly used benchmarks\nencompassing three modalities (ie, text, speech, and image), which involves the\noptimization of over 4,000 initial text queries and the analysis and\nstatistical evaluation of nearly 8,000+ response on GPT-4o. 
Our extensive\nexperiments reveal several novel observations: (1) In contrast to the previous\nversion (such as GPT-4V), GPT-4o has enhanced safety in the context of text\nmodality jailbreak; (2) The newly introduced audio modality opens up new attack\nvectors for jailbreak attacks on GPT-4o; (3) Existing black-box multimodal\njailbreak attack methods are largely ineffective against GPT-4o and GPT-4V.\nThese findings provide critical insights into the safety implications of GPT-4o\nand underscore the need for robust alignment guardrails in large models. Our\ncode is available at \\url{https://github.com/NY1024/Jailbreak_GPT4o}.\n","authors":["Zonghao Ying","Aishan Liu","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.06302v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.18470v2","updated":"2024-07-03T13:32:34Z","published":"2024-06-26T16:28:24Z","title":"UFRec: Integrating Uniformity and Frequency to Enhance Sequential\n Recommendations","summary":" Effective representation learning in sequential recommendation systems is\npivotal for precisely capturing user interaction patterns and enhancing\nrecommendation accuracy. Nonetheless, current methodologies largely focus on\nitem-to-item transitions, frequently overlooking the time intervals between\ninteractions, which are integral to understanding behavior pattern shifts.\nMoreover, critical interaction attributes like item frequency are often\nneglected. Our research indicates that sequences with more consistent time\nintervals and items with higher interaction frequency result in superior\npredictive performance. In contrast, sequences with non-uniform intervals\ncontribute to user interest drift, and infrequently interacted items are\nchallenging to model due to sparse data, posing unique challenges that existing\nmethods fail to adequately address. In this study, we introduce UFRec, an\ninnovative bidirectional enhancement method for sequential recommendations.\nUFRec harnesses sequence uniformity and item frequency to boost performance,\nparticularly improving the representation of non-uniform sequences and\nless-frequent items. These two components synergistically enhance each other,\ndriving holistic performance optimization in intricate sequential\nrecommendation scenarios. Additionally, we introduce a multidimensional time\nmodule to further augment adaptability. To the best of our knowledge, UFRec is\nthe pioneering method to exploit the properties of uniformity and frequency for\nfeature augmentation. Through comparisons with eleven state-of-the-art models\nacross four datasets, we demonstrate that UFRec significantly surpasses current\nleading models.\n","authors":["Yang Liu","Yitong Wang","Chenyue Feng"],"pdf_url":"https://arxiv.org/pdf/2406.18470v2.pdf","comment":"15 pages, 8 figures, for source code, see\n https://github.com/Linxi000/UniRec"},{"id":"http://arxiv.org/abs/2404.03788v2","updated":"2024-07-03T10:52:01Z","published":"2024-04-04T19:59:47Z","title":"Understanding Language Modeling Paradigm Adaptations in Recommender\n Systems: Lessons Learned and Open Challenges","summary":" The emergence of Large Language Models (LLMs) has achieved tremendous success\nin the field of Natural Language Processing owing to diverse training paradigms\nthat empower LLMs to effectively capture intricate linguistic patterns and\nsemantic representations. 
In particular, the recent \"pre-train, prompt and\npredict\" training paradigm has attracted significant attention as an approach\nfor learning generalizable models with limited labeled data. In line with this\nadvancement, these training paradigms have recently been adapted to the\nrecommendation domain and are seen as a promising direction in both academia\nand industry. This half-day tutorial aims to provide a thorough understanding\nof extracting and transferring knowledge from pre-trained models learned\nthrough different training paradigms to improve recommender systems from\nvarious perspectives, such as generality, sparsity, effectiveness and\ntrustworthiness. In this tutorial, we first introduce the basic concepts and a\ngeneric architecture of the language modeling paradigm for recommendation\npurposes. Then, we focus on recent advancements in adapting LLM-related\ntraining strategies and optimization objectives for different recommendation\ntasks. After that, we will systematically introduce ethical issues in LLM-based\nrecommender systems and discuss possible approaches to assessing and mitigating\nthem. We will also summarize the relevant datasets, evaluation metrics, and an\nempirical study on the recommendation performance of training paradigms.\nFinally, we will conclude the tutorial with a discussion of open challenges and\nfuture directions.\n","authors":["Lemei Zhang","Peng Liu","Yashar Deldjoo","Yong Zheng","Jon Atle Gulla"],"pdf_url":"https://arxiv.org/pdf/2404.03788v2.pdf","comment":"Tutorial held at the 27th European Conference on Artificial\n Intelligence (ECAI) in Santiago de Compostela, Spain, on October 19-24, 2024"},{"id":"http://arxiv.org/abs/2406.14017v2","updated":"2024-07-03T10:00:26Z","published":"2024-06-20T06:21:56Z","title":"EAGER: Two-Stream Generative Recommender with Behavior-Semantic\n Collaboration","summary":" Generative retrieval has recently emerged as a promising approach to\nsequential recommendation, framing candidate item retrieval as an\nautoregressive sequence generation problem. However, existing generative\nmethods typically focus solely on either behavioral or semantic aspects of item\ninformation, neglecting their complementary nature and thus resulting in\nlimited effectiveness. To address this limitation, we introduce EAGER, a novel\ngenerative recommendation framework that seamlessly integrates both behavioral\nand semantic information. Specifically, we identify three key challenges in\ncombining these two types of information: a unified generative architecture\ncapable of handling two feature types, ensuring sufficient and independent\nlearning for each type, and fostering subtle interactions that enhance\ncollaborative information utilization. To achieve these goals, we propose (1) a\ntwo-stream generation architecture leveraging a shared encoder and two separate\ndecoders to decode behavior tokens and semantic tokens with a confidence-based\nranking strategy; (2) a global contrastive task with summary tokens to achieve\ndiscriminative decoding for each type of information; and (3) a semantic-guided\ntransfer task designed to implicitly promote cross-interactions through\nreconstruction and estimation objectives. 
We validate the effectiveness of\nEAGER on four public benchmarks, demonstrating its superior performance\ncompared to existing methods.\n","authors":["Ye Wang","Jiahao Xun","Minjie Hong","Jieming Zhu","Tao Jin","Wang Lin","Haoyuan Li","Linjun Li","Yan Xia","Zhou Zhao","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2406.14017v2.pdf","comment":"Accepted by KDD 2024. Code available at\n https://reczoo.github.io/EAGER"},{"id":"http://arxiv.org/abs/2404.00621v2","updated":"2024-07-03T09:53:45Z","published":"2024-03-31T09:20:30Z","title":"Multimodal Pretraining, Adaptation, and Generation for Recommendation: A\n Survey","summary":" Personalized recommendation serves as a ubiquitous channel for users to\ndiscover information tailored to their interests. However, traditional\nrecommendation models primarily rely on unique IDs and categorical features for\nuser-item matching, potentially overlooking the nuanced essence of raw item\ncontents across multiple modalities such as text, image, audio, and video. This\nunderutilization of multimodal data poses a limitation to recommender systems,\nespecially in multimedia services like news, music, and short-video platforms.\nThe recent advancements in large multimodal models offer new opportunities and\nchallenges in developing content-aware recommender systems. This survey seeks\nto provide a comprehensive exploration of the latest advancements and future\ntrajectories in multimodal pretraining, adaptation, and generation techniques,\nas well as their applications in enhancing recommender systems. Furthermore, we\ndiscuss current open challenges and opportunities for future research in this\ndynamic domain. We believe that this survey, alongside the curated resources,\nwill provide valuable insights to inspire further advancements in this evolving\nlandscape.\n","authors":["Qijiong Liu","Jieming Zhu","Yanting Yang","Quanyu Dai","Zhaocheng Du","Xiao-Ming Wu","Zhou Zhao","Rui Zhang","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00621v2.pdf","comment":"Accepted by KDD 2024. See our tutorial materials at\n https://mmrec.github.io"},{"id":"http://arxiv.org/abs/2407.02883v1","updated":"2024-07-03T07:58:20Z","published":"2024-07-03T07:58:20Z","title":"CoIR: A Comprehensive Benchmark for Code Information Retrieval Models","summary":" Despite the substantial success of Information Retrieval (IR) in various NLP\ntasks, most IR systems predominantly handle queries and corpora in natural\nlanguage, neglecting the domain of code retrieval. Code retrieval is critically\nimportant yet remains under-explored, with existing methods and benchmarks\ninadequately representing the diversity of code in various domains and tasks.\nAddressing this gap, we present \\textbf{\\name} (\\textbf{Co}de\n\\textbf{I}nformation \\textbf{R}etrieval Benchmark), a robust and comprehensive\nbenchmark specifically designed to assess code retrieval capabilities. \\name\ncomprises \\textbf{ten} meticulously curated code datasets, spanning\n\\textbf{eight} distinctive retrieval tasks across \\textbf{seven} diverse\ndomains. We first discuss the construction of \\name and its diverse dataset\ncomposition. Further, we evaluate nine widely used retrieval models using\n\\name, uncovering significant difficulties in performing code retrieval tasks\neven with state-of-the-art systems. To facilitate easy adoption and integration\nwithin existing research workflows, \\name has been developed as a user-friendly\nPython framework, readily installable via pip. 
It shares same data schema as\nother popular benchmarks like MTEB and BEIR, enabling seamless cross-benchmark\nevaluations. Through \\name, we aim to invigorate research in the code retrieval\ndomain, providing a versatile benchmarking tool that encourages further\ndevelopment and exploration of code retrieval systems\\footnote{\\url{\nhttps://github.com/CoIR-team/coir}}.\n","authors":["Xiangyang Li","Kuicai Dong","Yi Quan Lee","Wei Xia","Yichun Yin","Hao Zhang","Yong Liu","Yasheng Wang","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2407.02883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00072v2","updated":"2024-07-03T06:54:16Z","published":"2024-06-21T08:52:11Z","title":"Pistis-RAG: A Scalable Cascading Framework Towards Content-Centric\n Retrieval-Augmented Generation","summary":" In Greek mythology, Pistis symbolized good faith, trust, and reliability.\nDrawing inspiration from these principles, Pistis-RAG is a scalable multi-stage\nframework designed to address the challenges of large-scale retrieval-augmented\ngeneration (RAG) systems. This framework consists of distinct stages: matching,\npre-ranking, ranking, reasoning, and aggregating. Each stage contributes to\nnarrowing the search space, prioritizing semantically relevant documents,\naligning with the large language model's (LLM) preferences, supporting complex\nchain-of-thought (CoT) methods, and combining information from multiple\nsources.\n Our ranking stage introduces a significant innovation by recognizing that\nsemantic relevance alone may not lead to improved generation quality, due to\nthe sensitivity of the few-shot prompt order, as noted in previous research.\nThis critical aspect is often overlooked in current RAG frameworks.\n We argue that the alignment issue between LLMs and external knowledge ranking\nmethods is tied to the model-centric paradigm dominant in RAG systems. We\npropose a content-centric approach, emphasizing seamless integration between\nLLMs and external information sources to optimize content transformation for\nspecific tasks.\n Our novel ranking stage is designed specifically for RAG systems,\nincorporating principles of information retrieval while considering the unique\nbusiness scenarios reflected in LLM preferences and user feedback. We simulated\nfeedback signals on the MMLU benchmark, resulting in a 9.3% performance\nimprovement. Our model and code will be open-sourced on GitHub. Additionally,\nexperiments on real-world, large-scale data validate the scalability of our\nframework.\n","authors":["Yu Bai","Yukai Miao","Li Chen","Dan Li","Yanyu Ren","Hongtao Xie","Ce Yang","Xuhui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.00072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02839v1","updated":"2024-07-03T06:34:56Z","published":"2024-07-03T06:34:56Z","title":"CRUISE on Quantum Computing for Feature Selection in Recommender Systems","summary":" Using Quantum Computers to solve problems in Recommender Systems that\nclassical computers cannot address is a worthwhile research topic. In this\npaper, we use Quantum Annealers to address the feature selection problem in\nrecommendation algorithms. This feature selection problem is a Quadratic\nUnconstrained Binary Optimization(QUBO) problem. By incorporating\nCounterfactual Analysis, we significantly improve the performance of the\nitem-based KNN recommendation algorithm compared to using pure Mutual\nInformation. 
Extensive experiments have demonstrated that the use of\nCounterfactual Analysis holds great promise for addressing such problems.\n","authors":["Jiayang Niu","Jie Li","Ke Deng","Yongli Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02839v1.pdf","comment":"accepted by QuantumCLEF 2024"},{"id":"http://arxiv.org/abs/2407.02833v1","updated":"2024-07-03T06:20:31Z","published":"2024-07-03T06:20:31Z","title":"LANE: Logic Alignment of Non-tuning Large Language Models and Online\n Recommendation Systems for Explainable Reason Generation","summary":" The explainability of recommendation systems is crucial for enhancing user\ntrust and satisfaction. Leveraging large language models (LLMs) offers new\nopportunities for comprehensive recommendation logic generation. However, in\nexisting related studies, fine-tuning LLM models for recommendation tasks\nincurs high computational costs and alignment issues with existing systems,\nlimiting the application potential of proven proprietary/closed-source LLM\nmodels, such as GPT-4. In this work, our proposed effective strategy LANE\naligns LLMs with online recommendation systems without additional LLMs tuning,\nreducing costs and improving explainability. This innovative approach addresses\nkey challenges in integrating language models with recommendation systems while\nfully utilizing the capabilities of powerful proprietary models. Specifically,\nour strategy operates through several key components: semantic embedding, user\nmulti-preference extraction using zero-shot prompting, semantic alignment, and\nexplainable recommendation generation using Chain of Thought (CoT) prompting.\nBy embedding item titles instead of IDs and utilizing multi-head attention\nmechanisms, our approach aligns the semantic features of user preferences with\nthose of candidate items, ensuring coherent and user-aligned recommendations.\nSufficient experimental results including performance comparison, questionnaire\nvoting, and visualization cases prove that our method can not only ensure\nrecommendation performance, but also provide easy-to-understand and reasonable\nrecommendation logic.\n","authors":["Hongke Zhao","Songming Zheng","Likang Wu","Bowen Yu","Jing Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02793v1","updated":"2024-07-03T03:42:13Z","published":"2024-07-03T03:42:13Z","title":"Learning Positional Attention for Sequential Recommendation","summary":" Self-attention-based networks have achieved remarkable performance in\nsequential recommendation tasks. A crucial component of these models is\npositional encoding. In this study, we delve into the learned positional\nembedding, demonstrating that it often captures the distance between tokens.\nBuilding on this insight, we introduce novel attention models that directly\nlearn positional relations. 
Extensive experiments reveal that our proposed\nmodels, \\textbf{PARec} and \\textbf{FPARec} outperform previous\nself-attention-based approaches.Our code is available at the link for anonymous\nreview: https://anonymous.4open.science/ r/FPARec-2C55/\n","authors":["Fan Luo","Juan Zhang","Shenghui Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02732v1","updated":"2024-07-03T01:09:36Z","published":"2024-07-03T01:09:36Z","title":"Supporting Cross-language Cross-project Bug Localization Using\n Pre-trained Language Models","summary":" Automatically locating a bug within a large codebase remains a significant\nchallenge for developers. Existing techniques often struggle with\ngeneralizability and deployment due to their reliance on application-specific\ndata and large model sizes. This paper proposes a novel pre-trained language\nmodel (PLM) based technique for bug localization that transcends project and\nlanguage boundaries. Our approach leverages contrastive learning to enhance the\nrepresentation of bug reports and source code. It then utilizes a novel ranking\napproach that combines commit messages and code segments. Additionally, we\nintroduce a knowledge distillation technique that reduces model size for\npractical deployment without compromising performance.\n This paper presents several key benefits. By incorporating code segment and\ncommit message analysis alongside traditional file-level examination, our\ntechnique achieves better bug localization accuracy. Furthermore, our model\nexcels at generalizability - trained on code from various projects and\nlanguages, it can effectively identify bugs in unseen codebases. To address\ncomputational limitations, we propose a CPU-compatible solution. In essence,\nproposed work presents a highly effective, generalizable, and efficient bug\nlocalization technique with the potential to real-world deployment.\n","authors":["Mahinthan Chandramohan","Dai Quoc Nguyen","Padmanabhan Krishnan","Jovan Jancic"],"pdf_url":"https://arxiv.org/pdf/2407.02732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08487v4","updated":"2024-07-03T23:28:10Z","published":"2023-08-15T05:48:44Z","title":"Temporal Interest Network for User Response Prediction","summary":" User response prediction is essential in industrial recommendation systems,\nsuch as online display advertising. Among all the features in recommendation\nmodels, user behaviors are among the most critical. Many works have revealed\nthat a user's behavior reflects her interest in the candidate item, owing to\nthe semantic or temporal correlation between behaviors and the candidate. While\nthe literature has individually examined each of these correlations,\nresearchers have yet to analyze them in combination, that is, the\nsemantic-temporal correlation. We empirically measure this correlation and\nobserve intuitive yet robust patterns. We then examine several popular user\ninterest models and find that, surprisingly, none of them learn such\ncorrelation well.\n To fill this gap, we propose a Temporal Interest Network (TIN) to capture the\nsemantic-temporal correlation simultaneously between behaviors and the target.\nWe achieve this by incorporating target-aware temporal encoding, in addition to\nsemantic encoding, to represent behaviors and the target. 
Furthermore, we\nconduct explicit 4-way interaction by deploying target-aware attention and\ntarget-aware representation to capture both semantic and temporal correlation.\nWe conduct comprehensive evaluations on two popular public datasets, and our\nproposed TIN outperforms the best-performing baselines by 0.43% and 0.29% on\nGAUC, respectively. During online A/B testing in Tencent's advertising\nplatform, TIN achieves 1.65% cost lift and 1.93% GMV lift over the base model.\nIt has been successfully deployed in production since October 2023, serving the\nWeChat Moments traffic. We have released our code at\nhttps://github.com/zhouxy1003/TIN.\n","authors":["Haolin Zhou","Junwei Pan","Xinyi Zhou","Xihua Chen","Jie Jiang","Xiaofeng Gao","Guihai Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08487v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.06297v3","updated":"2024-07-03T19:44:31Z","published":"2022-05-12T18:19:24Z","title":"Improving Sequential Query Recommendation with Immediate User Feedback","summary":" We propose an algorithm for next query recommendation in interactive data\nexploration settings, like knowledge discovery for information gathering. The\nstate-of-the-art query recommendation algorithms are based on\nsequence-to-sequence learning approaches that exploit historical interaction\ndata. Due to the supervision involved in the learning process, such approaches\nfail to adapt to immediate user feedback. We propose to augment the\ntransformer-based causal language models for query recommendations to adapt to\nthe immediate user feedback using multi-armed bandit (MAB) framework. We\nconduct a large-scale experimental study using log files from a popular online\nliterature discovery service and demonstrate that our algorithm improves the\nper-round regret substantially, with respect to the state-of-the-art\ntransformer-based query recommendation models, which do not make use of\nimmediate user feedback. Our data model and source code are available at\nhttps://github.com/shampp/exp3_ss\n","authors":["Shameem A Puthiya Parambath","Christos Anagnostopoulos","Roderick Murray-Smith"],"pdf_url":"https://arxiv.org/pdf/2205.06297v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.03321v1","updated":"2024-07-03T17:59:53Z","published":"2024-07-03T17:59:53Z","title":"Planetarium: A Rigorous Benchmark for Translating Text to Structured\n Planning Languages","summary":" Many recent works have explored using language models for planning problems.\nOne line of research focuses on translating natural language descriptions of\nplanning tasks into structured planning languages, such as the planning domain\ndefinition language (PDDL). While this approach is promising, accurately\nmeasuring the quality of generated PDDL code continues to pose significant\nchallenges. First, generated PDDL code is typically evaluated using planning\nvalidators that check whether the problem can be solved with a planner. This\nmethod is insufficient because a language model might generate valid PDDL code\nthat does not align with the natural language description of the task. Second,\nexisting evaluation sets often have natural language descriptions of the\nplanning task that closely resemble the ground truth PDDL, reducing the\nchallenge of the task. To bridge this gap, we introduce \\benchmarkName, a\nbenchmark designed to evaluate language models' ability to generate PDDL code\nfrom natural language descriptions of planning tasks. 
We begin by creating a\nPDDL equivalence algorithm that rigorously evaluates the correctness of PDDL\ncode generated by language models by flexibly comparing it against a ground\ntruth PDDL. Then, we present a dataset of $132,037$ text-to-PDDL pairs across\n13 different tasks, with varying levels of difficulty. Finally, we evaluate\nseveral API-access and open-weight language models that reveal this task's\ncomplexity. For example, $87.6\\%$ of the PDDL problem descriptions generated by\nGPT-4o are syntactically parseable, $82.2\\%$ are valid, solve-able problems,\nbut only $35.1\\%$ are semantically correct, highlighting the need for a more\nrigorous benchmark for this problem.\n","authors":["Max Zuo","Francisco Piedrahita Velez","Xiaochen Li","Michael L. Littman","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2407.03321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03311v1","updated":"2024-07-03T17:54:11Z","published":"2024-07-03T17:54:11Z","title":"Value-Penalized Auxiliary Control from Examples for Learning without\n Rewards or Demonstrations","summary":" Learning from examples of success is an appealing approach to reinforcement\nlearning that eliminates many of the disadvantages of using hand-crafted reward\nfunctions or full expert-demonstration trajectories, both of which can be\ndifficult to acquire, biased, or suboptimal. However, learning from examples\nalone dramatically increases the exploration challenge, especially for complex\ntasks. This work introduces value-penalized auxiliary control from examples\n(VPACE); we significantly improve exploration in example-based control by\nadding scheduled auxiliary control and examples of auxiliary tasks.\nFurthermore, we identify a value-calibration problem, where policy value\nestimates can exceed their theoretical limits based on successful data. We\nresolve this problem, which is exacerbated by learning auxiliary tasks, through\nthe addition of an above-success-level value penalty. Across three simulated\nand one real robotic manipulation environment, and 21 different main tasks, we\nshow that our approach substantially improves learning efficiency. Videos,\ncode, and datasets are available at https://papers.starslab.ca/vpace.\n","authors":["Trevor Ablett","Bryan Chan","Jayce Haoran Wang","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2407.03311v1.pdf","comment":"Submitted to the Conference on Robot Learning (CoRL'24), Munich,\n Germany, Nov. 6-9, 2024"},{"id":"http://arxiv.org/abs/2407.03310v1","updated":"2024-07-03T17:53:44Z","published":"2024-07-03T17:53:44Z","title":"Universal Length Generalization with Turing Programs","summary":" Length generalization refers to the ability to extrapolate from short\ntraining sequences to long test sequences and is a challenge for current large\nlanguage models. While prior work has proposed some architecture or data format\nchanges to achieve length generalization, these proposals typically apply to a\nlimited set of tasks. Building on prior scratchpad and Chain-of-Thought (CoT)\ntechniques, we propose Turing Programs, a novel CoT strategy that decomposes an\nalgorithmic task into steps mimicking the computation of a Turing Machine. 
This\nframework is both universal, as it can accommodate any algorithmic task, and\nsimple, requiring only copying text from the context with small modifications.\nWe show that by using Turing Programs, we obtain robust length generalization\non a range of algorithmic tasks: addition, multiplication and in-context SGD.\nWe then demonstrate that transformers achieve length generalization on random\nTuring Programs, suggesting that length generalization is possible for any\nalgorithmic task. Finally, we theoretically prove that transformers can\nimplement Turing Programs, constructing a simple RASP (Weiss et al.) program\nthat simulates an arbitrary Turing machine.\n","authors":["Kaiying Hou","David Brandfonbrener","Sham Kakade","Samy Jelassi","Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2407.03310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03300v1","updated":"2024-07-03T17:42:46Z","published":"2024-07-03T17:42:46Z","title":"DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents","summary":" Diffusion models (DMs) have revolutionized generative learning. They utilize\na diffusion process to encode data into a simple Gaussian distribution.\nHowever, encoding a complex, potentially multimodal data distribution into a\nsingle continuous Gaussian distribution arguably represents an unnecessarily\nchallenging learning problem. We propose Discrete-Continuous Latent Variable\nDiffusion Models (DisCo-Diff) to simplify this task by introducing\ncomplementary discrete latent variables. We augment DMs with learnable discrete\nlatents, inferred with an encoder, and train DM and encoder end-to-end.\nDisCo-Diff does not rely on pre-trained networks, making the framework\nuniversally applicable. The discrete latents significantly simplify learning\nthe DM's complex noise-to-data mapping by reducing the curvature of the DM's\ngenerative ODE. An additional autoregressive transformer models the\ndistribution of the discrete latents, a simple step because DisCo-Diff requires\nonly few discrete variables with small codebooks. We validate DisCo-Diff on toy\ndata, several image synthesis tasks as well as molecular docking, and find that\nintroducing discrete latents consistently improves model performance. For\nexample, DisCo-Diff achieves state-of-the-art FID scores on class-conditioned\nImageNet-64/128 datasets with ODE sampler.\n","authors":["Yilun Xu","Gabriele Corso","Tommi Jaakkola","Arash Vahdat","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2407.03300v1.pdf","comment":"project page: https://research.nvidia.com/labs/lpr/disco-diff"},{"id":"http://arxiv.org/abs/2310.05988v2","updated":"2024-07-03T17:42:25Z","published":"2023-10-07T19:35:07Z","title":"Dual Latent State Learning: Exploiting Regional Network Similarities for\n QoS Prediction","summary":" Individual objects, whether users or services, within a specific region often\nexhibit similar network states due to their shared origin from the same city or\nautonomous system (AS). Despite this regional network similarity, many existing\ntechniques overlook its potential, resulting in subpar performance arising from\nchallenges such as data sparsity and label imbalance. In this paper, we\nintroduce the regional-based dual latent state learning network(R2SL), a novel\ndeep learning framework designed to overcome the pitfalls of traditional\nindividual object-based prediction techniques in Quality of Service (QoS)\nprediction. 
Unlike its predecessors, R2SL captures the nuances of regional\nnetwork behavior by deriving two distinct regional network latent states: the\ncity-network latent state and the AS-network latent state. These states are\nconstructed utilizing aggregated data from common regions rather than\nindividual object data. Furthermore, R2SL adopts an enhanced Huber loss\nfunction that adjusts its linear loss component, providing a remedy for\nprevalent label imbalance issues. To cap off the prediction process, a\nmulti-scale perception network is leveraged to interpret the integrated feature\nmap, a fusion of regional network latent features and other pertinent\ninformation, ultimately accomplishing the QoS prediction. Through rigorous\ntesting on real-world QoS datasets, R2SL demonstrates superior performance\ncompared to prevailing state-of-the-art methods. Our R2SL approach ushers in an\ninnovative avenue for precise QoS predictions by fully harnessing the regional\nnetwork similarities inherent in objects.\n","authors":["Ziliang Wang","Xiaohong Zhang","Kechi Zhang","Ze Shi Li","Meng Yan"],"pdf_url":"https://arxiv.org/pdf/2310.05988v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16008v2","updated":"2024-07-03T17:40:00Z","published":"2024-06-23T04:35:42Z","title":"Found in the Middle: Calibrating Positional Attention Bias Improves Long\n Context Utilization","summary":" Large language models (LLMs), even when specifically trained to process long\ninput contexts, struggle to capture relevant information located in the middle\nof their input. This phenomenon has been known as the lost-in-the-middle\nproblem. In this work, we make three contributions. First, we set out to\nunderstand the factors that cause this phenomenon. In doing so, we establish a\nconnection between lost-in-the-middle to LLMs' intrinsic attention bias: LLMs\nexhibit a U-shaped attention bias where the tokens at the beginning and at the\nend of its input receive higher attention, regardless of their relevance.\nSecond, we mitigate this positional bias through a calibration mechanism,\nfound-in-the-middle, that allows the model to attend to contexts faithfully\naccording to their relevance, even though when they are in the middle. Third,\nwe show found-in-the-middle not only achieves better performance in locating\nrelevant information within a long context, but also eventually leads to\nimproved retrieval-augmented generation (RAG) performance across various tasks,\noutperforming existing methods by up to 15 percentage points. These findings\nopen up future directions in understanding LLM attention bias and its potential\nconsequences.\n","authors":["Cheng-Yu Hsieh","Yung-Sung Chuang","Chun-Liang Li","Zifeng Wang","Long T. Le","Abhishek Kumar","James Glass","Alexander Ratner","Chen-Yu Lee","Ranjay Krishna","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2406.16008v2.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2303.15975v3","updated":"2024-07-03T17:36:19Z","published":"2023-03-28T13:47:16Z","title":"Large-scale Pre-trained Models are Surprisingly Strong in Incremental\n Novel Class Discovery","summary":" Discovering novel concepts in unlabelled datasets and in a continuous manner\nis an important desideratum of lifelong learners. 
In the literature such\nproblems have been partially addressed under very restricted settings, where\nnovel classes are learned by jointly accessing a related labelled set (e.g.,\nNCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD).\nIn this work we challenge the status quo in class-iNCD and propose a learning\nparadigm where class discovery occurs continuously and truly unsupervisedly,\nwithout needing any related labelled set. In detail, we propose to exploit the\nricher priors from strong self-supervised pre-trained models (PTM). To this\nend, we propose simple baselines, composed of a frozen PTM backbone and a\nlearnable linear classifier, that are not only simple to implement but also\nresilient under longer learning scenarios. We conduct extensive empirical\nevaluation on a multitude of benchmarks and show the effectiveness of our\nproposed baselines when compared with sophisticated state-of-the-art methods.\nThe code is open source.\n","authors":["Mingxuan Liu","Subhankar Roy","Zhun Zhong","Nicu Sebe","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.15975v3.pdf","comment":"Accepted as a conference paper to ICPR 2024"},{"id":"http://arxiv.org/abs/2405.09062v4","updated":"2024-07-03T17:33:58Z","published":"2024-05-15T03:26:01Z","title":"Naturalistic Music Decoding from EEG Data via Latent Diffusion Models","summary":" In this article, we explore the potential of using latent diffusion models, a\nfamily of powerful generative models, for the task of reconstructing\nnaturalistic music from electroencephalogram (EEG) recordings. Unlike simpler\nmusic with limited timbres, such as MIDI-generated tunes or monophonic pieces,\nthe focus here is on intricate music featuring a diverse array of instruments,\nvoices, and effects, rich in harmonics and timbre. This study represents an\ninitial foray into achieving general music reconstruction of high-quality using\nnon-invasive EEG data, employing an end-to-end training approach directly on\nraw data without the need for manual pre-processing and channel selection. We\ntrain our models on the public NMED-T dataset and perform quantitative\nevaluation proposing neural embedding-based metrics. We additionally perform\nsong classification based on the generated tracks. Our work contributes to the\nongoing research in neural decoding and brain-computer interfaces, offering\ninsights into the feasibility of using EEG data for complex auditory\ninformation reconstruction.\n","authors":["Emilian Postolache","Natalia Polouliakh","Hiroaki Kitano","Akima Connelly","Emanuele Rodolà","Luca Cosmo","Taketo Akama"],"pdf_url":"https://arxiv.org/pdf/2405.09062v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03294v1","updated":"2024-07-03T17:28:17Z","published":"2024-07-03T17:28:17Z","title":"Vertex Exchange Method for a Class of Quadratic Programming Problems","summary":" A vertex exchange method is proposed for solving the strongly convex\nquadratic program subject to the generalized simplex constraint. We conduct\nrigorous convergence analysis for the proposed algorithm and demonstrate its\nessential roles in solving some important classes of constrained convex\noptimization. To get a feasible initial point to execute the algorithm, we also\npresent and analyze a highly efficient semismooth Newton method for computing\nthe projection onto the generalized simplex. The excellent practical\nperformance of the proposed algorithms is demonstrated by a set of extensive\nnumerical experiments. 
Our theoretical and numerical results further motivate\nthe potential applications of the considered model and the proposed algorithms.\n","authors":["Ling Liang","Kim-Chuan Toh","Haizhao Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03294v1.pdf","comment":"32 pages, 5 tables"},{"id":"http://arxiv.org/abs/2407.03289v1","updated":"2024-07-03T17:22:33Z","published":"2024-07-03T17:22:33Z","title":"Correlated Privacy Mechanisms for Differentially Private Distributed\n Mean Estimation","summary":" Differentially private distributed mean estimation (DP-DME) is a fundamental\nbuilding block in privacy-preserving federated learning, where a central server\nestimates the mean of $d$-dimensional vectors held by $n$ users while ensuring\n$(\\epsilon,\\delta)$-DP. Local differential privacy (LDP) and distributed DP\nwith secure aggregation (SecAgg) are the most common notions of DP used in\nDP-DME settings with an untrusted server. LDP provides strong resilience to\ndropouts, colluding users, and malicious server attacks, but suffers from poor\nutility. In contrast, SecAgg-based DP-DME achieves an $O(n)$ utility gain over\nLDP in DME, but requires increased communication and computation overheads and\ncomplex multi-round protocols to handle dropouts and malicious attacks. In this\nwork, we propose CorDP-DME, a novel DP-DME mechanism that spans the gap between\nDME with LDP and distributed DP, offering a favorable balance between utility\nand resilience to dropout and collusion. CorDP-DME is based on correlated\nGaussian noise, ensuring DP without the perfect conditional privacy guarantees\nof SecAgg-based approaches. We provide an information-theoretic analysis of\nCorDP-DME, and derive theoretical guarantees for utility under any given\nprivacy parameters and dropout/colluding user thresholds. Our results\ndemonstrate that (anti) correlated Gaussian DP mechanisms can significantly\nimprove utility in mean estimation tasks compared to LDP -- even in adversarial\nsettings -- while maintaining better resilience to dropouts and attacks\ncompared to distributed DP.\n","authors":["Sajani Vithana","Viveck R. Cadambe","Flavio P. Calmon","Haewon Jeong"],"pdf_url":"https://arxiv.org/pdf/2407.03289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03266v1","updated":"2024-07-03T16:56:08Z","published":"2024-07-03T16:56:08Z","title":"Do Quantum Neural Networks have Simplicity Bias?","summary":" One hypothesis for the success of deep neural networks (DNNs) is that they\nare highly expressive, which enables them to be applied to many problems, and\nthey have a strong inductive bias towards solutions that are simple, known as\nsimplicity bias, which allows them to generalise well on unseen data because\nmost real-world data is structured (i.e. simple). In this work, we explore the\ninductive bias and expressivity of quantum neural networks (QNNs), which gives\nus a way to compare their performance to those of DNNs. Our results show that\nit is possible to have simplicity bias with certain QNNs, but we prove that\nthis type of QNN limits the expressivity of the QNN. We also show that it is\npossible to have QNNs with high expressivity, but they either have no inductive\nbias or a poor inductive bias and result in a worse generalisation performance\ncompared to DNNs. We demonstrate that an artificial (restricted) inductive bias\ncan be produced by intentionally restricting the expressivity of a QNN. Our\nresults suggest a bias-expressivity tradeoff. 
Our conclusion is that the QNNs\nwe studied can not generally offer an advantage over DNNs, because these QNNs\neither have a poor inductive bias or poor expressivity compared to DNNs.\n","authors":["Jessica Pointing"],"pdf_url":"https://arxiv.org/pdf/2407.03266v1.pdf","comment":"9 pages, 42 pages with appendices"},{"id":"http://arxiv.org/abs/2406.08740v2","updated":"2024-07-03T16:54:44Z","published":"2024-06-13T02:00:13Z","title":"An AI Architecture with the Capability to Explain Recognition Results","summary":" Explainability is needed to establish confidence in machine learning results.\nSome explainable methods take a post hoc approach to explain the weights of\nmachine learning models, others highlight areas of the input contributing to\ndecisions. These methods do not adequately explain decisions, in plain terms.\nExplainable property-based systems have been shown to provide explanations in\nplain terms, however, they have not performed as well as leading unexplainable\nmachine learning methods. This research focuses on the importance of metrics to\nexplainability and contributes two methods yielding performance gains. The\nfirst method introduces a combination of explainable and unexplainable flows,\nproposing a metric to characterize explainability of a decision. The second\nmethod compares classic metrics for estimating the effectiveness of neural\nnetworks in the system, posing a new metric as the leading performer. Results\nfrom the new methods and examples from handwritten datasets are presented.\n","authors":["Paul Whitten","Francis Wolff","Chris Papachristou"],"pdf_url":"https://arxiv.org/pdf/2406.08740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03262v1","updated":"2024-07-03T16:49:28Z","published":"2024-07-03T16:49:28Z","title":"Nearly Linear Sparsification of $\\ell_p$ Subspace Approximation","summary":" The $\\ell_p$ subspace approximation problem is an NP-hard low rank\napproximation problem that generalizes the median hyperplane problem ($p = 1$),\nprincipal component analysis ($p = 2$), and the center hyperplane problem ($p =\n\\infty$). A popular approach to cope with the NP-hardness of this problem is to\ncompute a strong coreset, which is a small weighted subset of the input points\nwhich simultaneously approximates the cost of every $k$-dimensional subspace,\ntypically to $(1+\\varepsilon)$ relative error for a small constant\n$\\varepsilon$.\n We obtain the first algorithm for constructing a strong coreset for $\\ell_p$\nsubspace approximation with a nearly optimal dependence on the rank parameter\n$k$, obtaining a nearly linear bound of $\\tilde\nO(k)\\mathrm{poly}(\\varepsilon^{-1})$ for $p<2$ and $\\tilde\nO(k^{p/2})\\mathrm{poly}(\\varepsilon^{-1})$ for $p>2$. Prior constructions\neither achieved a similar size bound but produced a coreset with a modification\nof the original points [SW18, FKW21], or produced a coreset of the original\npoints but lost $\\mathrm{poly}(k)$ factors in the coreset size [HV20, WY23].\n Our techniques also lead to the first nearly optimal online strong coresets\nfor $\\ell_p$ subspace approximation with similar bounds as the offline setting,\nresolving a problem of [WY23]. All prior approaches lose $\\mathrm{poly}(k)$\nfactors in this setting, even when allowed to modify the original points.\n","authors":["David P. 
Woodruff","Taisuke Yasuda"],"pdf_url":"https://arxiv.org/pdf/2407.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03261v1","updated":"2024-07-03T16:45:45Z","published":"2024-07-03T16:45:45Z","title":"Magnetic Hysteresis Modeling with Neural Operators","summary":" Hysteresis modeling is crucial to comprehend the behavior of magnetic\ndevices, facilitating optimal designs. Hitherto, deep learning-based methods\nemployed to model hysteresis, face challenges in generalizing to novel input\nmagnetic fields. This paper addresses the generalization challenge by proposing\nneural operators for modeling constitutive laws that exhibit magnetic\nhysteresis by learning a mapping between magnetic fields. In particular, two\nprominent neural operators -- deep operator network and Fourier neural operator\n-- are employed to predict novel first-order reversal curves and minor loops,\nwhere novel means they are not used to train the model. In addition, a\nrate-independent Fourier neural operator is proposed to predict material\nresponses at sampling rates different from those used during training to\nincorporate the rate-independent characteristics of magnetic hysteresis. The\npresented numerical experiments demonstrate that neural operators efficiently\nmodel magnetic hysteresis, outperforming the traditional neural recurrent\nmethods on various metrics and generalizing to novel magnetic fields. The\nfindings emphasize the advantages of using neural operators for modeling\nhysteresis under varying magnetic conditions, underscoring their importance in\ncharacterizing magnetic material based devices.\n","authors":["Abhishek Chandra","Bram Daniels","Mitrofan Curti","Koen Tiels","Elena A. Lomonova"],"pdf_url":"https://arxiv.org/pdf/2407.03261v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.03257v1","updated":"2024-07-03T16:38:57Z","published":"2024-07-03T16:38:57Z","title":"Modern Neighborhood Components Analysis: A Deep Tabular Baseline Two\n Decades Later","summary":" The growing success of deep learning in various domains has prompted\ninvestigations into its application to tabular data, where deep models have\nshown promising results compared to traditional tree-based methods. In this\npaper, we revisit Neighborhood Component Analysis (NCA), a classic tabular\nprediction method introduced in 2004, designed to learn a linear projection\nthat captures semantic similarities between instances. We find that minor\nmodifications, such as adjustments to the learning objectives and the\nintegration of deep learning architectures, significantly enhance NCA's\nperformance, enabling it to surpass most modern deep tabular models.\nAdditionally, we introduce a stochastic neighbor sampling strategy that\nimproves both the efficiency and predictive accuracy of our proposed ModernNCA\n-- sampling only a subset of neighbors during training, while utilizing the\nentire neighborhood during inference. 
Extensive experiments demonstrate that\nour ModernNCA achieves state-of-the-art results in both classification and\nregression tasks across various tabular datasets, outperforming both tree-based\nand other deep tabular models, while also reducing training time and model\nsize.\n","authors":["Han-Jia Ye","Huai-Hong Yin","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2407.03257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16793v5","updated":"2024-07-03T16:38:17Z","published":"2024-06-24T16:56:41Z","title":"Adam-mini: Use Fewer Learning Rates To Gain More","summary":" We propose Adam-mini, an optimizer that achieves on-par or better performance\nthan AdamW with 45% to 50% less memory footprint. Adam-mini reduces memory by\ncutting down the learning rate resources in Adam (i.e., $1/\\sqrt{v}$). We find\nthat $\\geq$ 90% of these learning rates in $v$ could be harmlessly removed if\nwe (1) carefully partition the parameters into blocks following our proposed\nprinciple on Hessian structure; (2) assign a single but good learning rate to\neach parameter block. We further find that, for each of these parameter blocks,\nthere exists a single high-quality learning rate that can outperform Adam,\nprovided that sufficient resources are available to search it out. We then\nprovide one cost-effective way to find good learning rates and propose\nAdam-mini. Empirically, we verify that Adam-mini performs on par or better than\nAdamW on various language models sized from 125M to 7B for pre-training,\nsupervised fine-tuning, and RLHF. The reduced memory footprint of Adam-mini\nalso alleviates communication overheads among GPUs and CPUs, thereby increasing\nthroughput. For instance, Adam-mini achieves 49.6% higher throughput than AdamW\nwhen pre-training Llama2-7B on $2\\times$ A800-80GB GPUs, which saves 33%\nwall-clock time for pre-training.\n","authors":["Yushun Zhang","Congliang Chen","Ziniu Li","Tian Ding","Chenwei Wu","Yinyu Ye","Zhi-Quan Luo","Ruoyu Sun"],"pdf_url":"https://arxiv.org/pdf/2406.16793v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17761v2","updated":"2024-07-03T16:33:55Z","published":"2024-06-25T17:45:26Z","title":"CaLMQA: Exploring culturally specific long-form question answering\n across 23 languages","summary":" Large language models (LLMs) are used for long-form question answering\n(LFQA), which requires them to generate paragraph-length answers to complex\nquestions. While LFQA has been well-studied in English, this research has not\nbeen extended to other languages. To bridge this gap, we introduce CaLMQA, a\ncollection of 1.5K complex culturally specific questions spanning 23 languages\nand 51 culturally agnostic questions translated from English into 22 other\nlanguages. We define culturally specific questions as those uniquely or more\nlikely to be asked by people from cultures associated with the question's\nlanguage. We collect naturally-occurring questions from community web forums\nand hire native speakers to write questions to cover under-resourced,\nrarely-studied languages such as Fijian and Kirundi. Our dataset contains\ndiverse, complex questions that reflect cultural topics (e.g. traditions, laws,\nnews) and the language usage of native speakers. 
We automatically evaluate a\nsuite of open- and closed-source models on CaLMQA by detecting incorrect\nlanguage and token repetitions in answers, and observe that the quality of\nLLM-generated answers degrades significantly for some low-resource languages.\nLastly, we perform human evaluation on a subset of models and languages. Manual\nevaluation reveals that model performance is significantly worse for culturally\nspecific questions than for culturally agnostic questions. Our findings\nhighlight the need for further research in non-English LFQA and provide an\nevaluation framework.\n","authors":["Shane Arora","Marzena Karpinska","Hung-Ting Chen","Ipsita Bhattacharjee","Mohit Iyyer","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2406.17761v2.pdf","comment":"39 pages, 17 figures. Code and data available at\n https://github.com/2015aroras/CaLMQA. Revised argument in section 4, results\n unchanged"},{"id":"http://arxiv.org/abs/2407.03250v1","updated":"2024-07-03T16:29:47Z","published":"2024-07-03T16:29:47Z","title":"When big data actually are low-rank, or entrywise approximation of\n certain function-generated matrices","summary":" The article concerns low-rank approximation of matrices generated by sampling\na smooth function of two $m$-dimensional variables. We refute an argument made\nin the literature that, for a specific class of analytic functions, such\nmatrices admit accurate entrywise approximation of rank that is independent of\n$m$. We provide a theoretical explanation of the numerical results presented in\nsupport of this argument, describing three narrower classes of functions for\nwhich $n \\times n$ function-generated matrices can be approximated within an\nentrywise error of order $\\varepsilon$ with rank $\\mathcal{O}(\\log(n)\n\\varepsilon^{-2} \\mathrm{polylog}(\\varepsilon^{-1}))$ that is independent of\nthe dimension $m$: (i) functions of the inner product of the two variables,\n(ii) functions of the squared Euclidean distance between the variables, and\n(iii) shift-invariant positive-definite kernels. We extend our argument to\nlow-rank tensor-train approximation of tensors generated with functions of the\nmulti-linear product of their $m$-dimensional variables. We discuss our results\nin the context of low-rank approximation of attention in transformer neural\nnetworks.\n","authors":["Stanislav Budzinskiy"],"pdf_url":"https://arxiv.org/pdf/2407.03250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00741v2","updated":"2024-07-03T16:22:15Z","published":"2024-06-30T16:05:31Z","title":"Diffusion Models for Offline Multi-agent Reinforcement Learning with\n Safety Constraints","summary":" In recent advancements in Multi-agent Reinforcement Learning (MARL), its\napplication has extended to various safety-critical scenarios. However, most\nmethods focus on online learning, which presents substantial risks when\ndeployed in real-world settings. Addressing this challenge, we introduce an\ninnovative framework integrating diffusion models within the MARL paradigm.\nThis approach notably enhances the safety of actions taken by multiple agents\nthrough risk mitigation while modeling coordinated action. Our framework is\ngrounded in the Centralized Training with Decentralized Execution (CTDE)\narchitecture, augmented by a Diffusion Model for prediction trajectory\ngeneration. Additionally, we incorporate a specialized algorithm to further\nensure operational safety. We evaluate our model against baselines on the DSRL\nbenchmark. 
Experimental results demonstrate that our model not only adheres to\nstringent safety constraints but also achieves superior performance compared to\nexisting methodologies. This underscores the potential of our approach in\nadvancing the safety and efficacy of MARL in real-world applications.\n","authors":["Jianuo Huang"],"pdf_url":"https://arxiv.org/pdf/2407.00741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01601v2","updated":"2024-07-03T16:19:59Z","published":"2024-06-26T11:53:35Z","title":"Unveiling and Controlling Anomalous Attention Distribution in\n Transformers","summary":" With the advent of large models based on the Transformer architecture,\nresearchers have observed an anomalous phenomenon in the Attention\nmechanism--there is a very high attention on the first element, which is\nprevalent across Transformer-based models. It is crucial to understand it for\nthe development of techniques focusing on attention distribution, such as\nKey-Value (KV) Cache compression and infinite extrapolation; however, the\nlatent cause remains unknown. In this paper, we analyze such a phenomenon\nfrom the perspective of the waiver phenomenon, which involves reducing the internal\nvalues of certain elements in the sequence, allowing them to absorb excess\nattention without affecting their contribution to information. In specific\nmodels, due to differences in positional encoding and attention patterns, we\nhave found that the selection of waiver elements by the model can be\ncategorized into two methods: positional-encoding-based and\nfeature-distribution-within-elements-based.\n","authors":["Ruiqing Yan","Xingbo Du","Haoyu Deng","Linghan Zheng","Qiuzhuang Sun","Jifang Hu","Yuhang Shao","Penghao Jiang","Jinrong Jiang","Lian Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.01601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19136v2","updated":"2024-07-03T16:12:02Z","published":"2024-06-27T12:40:29Z","title":"YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph\n Convolutional Networks and Transformer-Attention","summary":" The accurate prediction of drug molecule solubility is essential for\ndetermining their therapeutic effectiveness and safety, influencing the drug's\nADME processes. Traditional solubility prediction techniques often fail to\ncapture the complex nature of molecular structures, leading to notable\ndeviations between predictions and actual results. For example, in a discussion\nof advanced drug-like compound structures, Lusci highlighted issues in\ncapturing crucial cyclic structural information in molecules with ring\nstructures. To overcome this issue, our research introduces a novel deep\nlearning framework combining attention-based transformers, Long Short-Term\nMemory (LSTM) networks, and Graph Convolutional Networks (GCN), aimed at\nenhancing the precision of solubility predictions. Utilizing a training set of\n9,943 compounds and testing on an anticancer compound dataset, our method\nachieved a correlation coefficient ($R^2$) of 0.55 and a Root Mean Square Error\n(RMSE) of 0.59, which outperforms the benchmark models' scores of 0.52 ($R^2$)\nand 0.61 (RMSE). Importantly, in an additional independent test, our model\nsignificantly outperformed the baseline with an RMSE of 1.05 compared to 1.28,\na relative accuracy improvement of 45.9%. This research not only demonstrates\nthe vast potential of deep learning for improving solubility prediction\naccuracy but also offers novel insights for drug design and selection in the\nfuture. 
Continued efforts will be directed towards optimizing the model\narchitecture and extending its application to better support the drug\ndevelopment process, underscoring the pivotal role of deep learning in drug\ndiscovery.\n","authors":["Chenxu Wang","Haowei Ming","Jian He","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2406.19136v2.pdf","comment":"18 pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.03241v1","updated":"2024-07-03T16:10:50Z","published":"2024-07-03T16:10:50Z","title":"Terrain Classification Enhanced with Uncertainty for Space Exploration\n Robots from Proprioceptive Data","summary":" Terrain Classification is an essential task in space exploration, where\nunpredictable environments are difficult to observe using only exteroceptive\nsensors such as vision. Implementing Neural Network classifiers can have high\nperformance but can be deemed untrustworthy as they lack transparency, which\nmakes them unreliable for taking high-stakes decisions during mission planning.\nWe address this by proposing Neural Networks with Uncertainty Quantification in\nTerrain Classification. We enable our Neural Networks with Monte Carlo Dropout,\nDropConnect, and Flipout in time series-capable architectures using only\nproprioceptive data as input. We use Bayesian Optimization with Hyperband for\nefficient hyperparameter optimization to find optimal models for trustworthy\nterrain classification.\n","authors":["Mariela De Lucas Álvarez","Jichen Guo","Raul Domínguez","Matias Valdenegro-Toro"],"pdf_url":"https://arxiv.org/pdf/2407.03241v1.pdf","comment":"6 pages, 4 figures. LatinX in AI Workshop @ ICML 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2312.16762v3","updated":"2024-07-03T16:04:07Z","published":"2023-12-28T00:49:41Z","title":"Backstepping Neural Operators for $2\\times 2$ Hyperbolic PDEs","summary":" Deep neural network approximation of nonlinear operators, commonly referred\nto as DeepONet, has proven capable of approximating PDE backstepping designs in\nwhich a single Goursat-form PDE governs a single feedback gain function. In\nboundary control of coupled PDEs, coupled Goursat-form PDEs govern two or more\ngain kernels-a PDE structure unaddressed thus far with DeepONet. In this paper,\nwe explore the subject of approximating systems of gain kernel PDEs for\nhyperbolic PDE plants by considering a simple counter-convecting $2\\times 2$\ncoupled system in whose control a $2\\times 2$ kernel PDE system in Goursat form\narises. Engineering applications include oil drilling, the Saint-Venant model\nof shallow water waves, and the Aw-Rascle-Zhang model of stop-and-go\ninstability in congested traffic flow. We establish the continuity of the\nmapping from a total of five plant PDE functional coefficients to the kernel\nPDE solutions, prove the existence of an arbitrarily close DeepONet\napproximation to the kernel PDEs, and ensure that the DeepONet-approximated\ngains guarantee stabilization when replacing the exact backstepping gain\nkernels. Taking into account anti-collocated boundary actuation and sensing,\nour $L^2$-Globally-exponentially stabilizing (GES) approximate gain\nkernel-based output feedback design implies the deep learning of both the\ncontroller's and the observer's gains. Moreover, the encoding of the\noutput-feedback law into DeepONet ensures semi-global practical exponential\nstability (SG-PES). The DeepONet operator speeds up the computation of the\ncontroller gains by multiple orders of magnitude. 
Its theoretically proven\nstabilizing capability is demonstrated through simulations.\n","authors":["Shanshan Wang","Mamadou Diagne","Miroslav Krstić"],"pdf_url":"https://arxiv.org/pdf/2312.16762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03234v1","updated":"2024-07-03T16:03:42Z","published":"2024-07-03T16:03:42Z","title":"Self-Evaluation as a Defense Against Adversarial Attacks on LLMs","summary":" When LLMs are deployed in sensitive, human-facing settings, it is crucial\nthat they do not output unsafe, biased, or privacy-violating outputs. For this\nreason, models are both trained and instructed to refuse to answer unsafe\nprompts such as \"Tell me how to build a bomb.\" We find that, despite these\nsafeguards, it is possible to break model defenses simply by appending a space\nto the end of a model's input. In a study of eight open-source models, we\ndemonstrate that this acts as a strong enough attack to cause the majority of\nmodels to generate harmful outputs with very high success rates. We examine the\ncauses of this behavior, finding that the contexts in which single spaces occur\nin tokenized training data encourage models to generate lists when prompted,\noverriding training signals to refuse to answer unsafe requests. Our findings\nunderscore the fragile state of current model alignment and promote the\nimportance of developing more robust alignment methods. Code and data will be\nmade available at https://github.com/Linlt-leon/Adversarial-Alignments.\n","authors":["Hannah Brown","Leon Lin","Kenji Kawaguchi","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2407.03234v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.03232v1","updated":"2024-07-03T16:03:10Z","published":"2024-07-03T16:03:10Z","title":"Single Character Perturbations Break LLM Alignment","summary":" When LLMs are deployed in sensitive, human-facing settings, it is crucial\nthat they do not output unsafe, biased, or privacy-violating outputs. For this\nreason, models are both trained and instructed to refuse to answer unsafe\nprompts such as \"Tell me how to build a bomb.\" We find that, despite these\nsafeguards, it is possible to break model defenses simply by appending a space\nto the end of a model's input. In a study of eight open-source models, we\ndemonstrate that this acts as a strong enough attack to cause the majority of\nmodels to generate harmful outputs with very high success rates. We examine the\ncauses of this behavior, finding that the contexts in which single spaces occur\nin tokenized training data encourage models to generate lists when prompted,\noverriding training signals to refuse to answer unsafe requests. Our findings\nunderscore the fragile state of current model alignment and promote the\nimportance of developing more robust alignment methods. Code and data will be\navailable at https://github.com/hannah-aught/space_attack.\n","authors":["Leon Lin","Hannah Brown","Kenji Kawaguchi","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2407.03232v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.10167v3","updated":"2024-07-03T15:53:13Z","published":"2023-07-19T17:53:22Z","title":"VITS : Variational Inference Thompson Sampling for contextual bandits","summary":" In this paper, we introduce and analyze a variant of the Thompson sampling\n(TS) algorithm for contextual bandits. 
At each round, traditional TS requires\nsamples from the current posterior distribution, which is usually intractable.\nTo circumvent this issue, approximate inference techniques can be used and\nprovide samples with distribution close to the posteriors. However, current\napproximate techniques either yield poor estimates (Laplace approximation)\nor are computationally expensive (MCMC methods, Ensemble sampling...). In\nthis paper, we propose a new algorithm, Variational Inference Thompson sampling\nVITS, based on Gaussian Variational Inference. This scheme provides powerful\nposterior approximations which are easy to sample from, and is computationally\nefficient, making it an ideal choice for TS. In addition, we show that VITS\nachieves a sub-linear regret bound of the same order in the dimension and\nnumber of rounds as traditional TS for linear contextual bandits. Finally, we\ndemonstrate experimentally the effectiveness of VITS on both synthetic and real-world\ndatasets.\n","authors":["Pierre Clavier","Tom Huix","Alain Durmus"],"pdf_url":"https://arxiv.org/pdf/2307.10167v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01397v3","updated":"2024-07-03T15:52:58Z","published":"2023-05-02T13:16:04Z","title":"Are demographically invariant models and representations in medical\n imaging fair?","summary":" Medical imaging models have been shown to encode information about patient\ndemographics such as age, race, and sex in their latent representation, raising\nconcerns about their potential for discrimination. Here, we ask whether\nrequiring models not to encode demographic attributes is desirable. We point\nout that marginal and class-conditional representation invariance imply the\nstandard group fairness notions of demographic parity and equalized odds,\nrespectively. In addition, however, they require matching the risk\ndistributions, thus potentially equalizing away important group differences.\nEnforcing the traditional fairness notions directly instead does not entail\nthese strong constraints. Moreover, representationally invariant models may\nstill take demographic attributes into account for deriving predictions,\nimplying unequal treatment - in fact, achieving representation invariance may\nrequire doing so. In theory, this can be prevented using counterfactual notions\nof (individual) fairness or invariance. We caution, however, that properly\ndefining medical image counterfactuals with respect to demographic attributes\nis fraught with challenges. Finally, we posit that encoding demographic\nattributes may even be advantageous if it enables learning a task-specific\nencoding of demographic features that does not rely on social constructs such\nas 'race' and 'gender.' We conclude that demographically invariant\nrepresentations are neither necessary nor sufficient for fairness in medical\nimaging. Models may need to encode demographic attributes, lending further\nurgency to calls for comprehensive model fairness assessments in terms of\npredictive performance across diverse patient groups.\n","authors":["Eike Petersen","Enzo Ferrante","Melanie Ganz","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2305.01397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14380v2","updated":"2024-07-03T15:40:49Z","published":"2024-06-20T14:53:26Z","title":"Estimating Treatment Effects under Recommender Interference: A\n Structured Neural Networks Approach","summary":" Recommender systems are essential for content-sharing platforms by curating\npersonalized content. 
To evaluate updates to recommender systems targeting\ncontent creators, platforms frequently rely on creator-side randomized\nexperiments. The treatment effect measures the change in outcomes when a new\nalgorithm is implemented compared to the status quo. We show that the standard\ndifference-in-means estimator can lead to biased estimates due to recommender\ninterference that arises when treated and control creators compete for\nexposure. We propose a \"recommender choice model\" that describes which item\ngets exposed from a pool containing both treated and control items. By\ncombining a structural choice model with neural networks, this framework\ndirectly models the interference pathway while accounting for rich\nviewer-content heterogeneity. We construct a debiased estimator of the\ntreatment effect and prove it is $\\sqrt n$-consistent and asymptotically normal\nwith potentially correlated samples. We validate our estimator's empirical\nperformance with a field experiment on Weixin short-video platform. In addition\nto the standard creator-side experiment, we conduct a costly double-sided\nrandomization design to obtain a benchmark estimate free from interference\nbias. We show that the proposed estimator yields results comparable to the\nbenchmark, whereas the standard difference-in-means estimator can exhibit\nsignificant bias and even produce reversed signs.\n","authors":["Ruohan Zhan","Shichao Han","Yuchen Hu","Zhenling Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.14380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15724v2","updated":"2024-07-03T15:40:39Z","published":"2022-11-28T19:17:31Z","title":"Malign Overfitting: Interpolation Can Provably Preclude Invariance","summary":" Learned classifiers should often possess certain invariance properties meant\nto encourage fairness, robustness, or out-of-distribution generalization.\nHowever, multiple recent works empirically demonstrate that common\ninvariance-inducing regularizers are ineffective in the over-parameterized\nregime, in which classifiers perfectly fit (i.e. interpolate) the training\ndata. This suggests that the phenomenon of \"benign overfitting\", in which\nmodels generalize well despite interpolating, might not favorably extend to\nsettings in which robustness or fairness are desirable.\n In this work we provide a theoretical justification for these observations.\nWe prove that -- even in the simplest of settings -- any interpolating learning\nrule (with arbitrarily small margin) will not satisfy these invariance\nproperties. We then propose and analyze an algorithm that -- in the same\nsetting -- successfully learns a non-interpolating classifier that is provably\ninvariant. We validate our theoretical observations on simulated data and the\nWaterbirds dataset.\n","authors":["Yoav Wald","Gal Yona","Uri Shalit","Yair Carmon"],"pdf_url":"https://arxiv.org/pdf/2211.15724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03211v1","updated":"2024-07-03T15:39:40Z","published":"2024-07-03T15:39:40Z","title":"How Does Quantization Affect Multilingual LLMs?","summary":" Quantization techniques are widely used to improve inference speed and\ndeployment of large language models. While a wide body of work examines the\nimpact of quantized LLMs on English tasks, none have examined the effect of\nquantization across languages. We conduct a thorough analysis of quantized\nmultilingual LLMs, focusing on their performance across languages and at\nvarying scales. 
We use automatic benchmarks, LLM-as-a-Judge methods, and human\nevaluation, finding that (1) harmful effects of quantization are apparent in\nhuman evaluation, and automatic metrics severely underestimate the detriment: a\n1.7% average drop in Japanese across automatic tasks corresponds to a 16.0%\ndrop reported by human evaluators on realistic prompts; (2) languages are\ndisparately affected by quantization, with non-Latin script languages impacted\nworst; and (3) challenging tasks such as mathematical reasoning degrade\nfastest. As the ability to serve low-compute models is critical for wide global\nadoption of NLP technologies, our results urge consideration of multilingual\nperformance as a key evaluation criterion for efficient models.\n","authors":["Kelly Marchisio","Saurabh Dash","Hongyu Chen","Dennis Aumiller","Ahmet Üstün","Sara Hooker","Sebastian Ruder"],"pdf_url":"https://arxiv.org/pdf/2407.03211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03210v1","updated":"2024-07-03T15:38:57Z","published":"2024-07-03T15:38:57Z","title":"Combining AI Control Systems and Human Decision Support via Robustness\n and Criticality","summary":" AI-enabled capabilities are reaching the requisite level of maturity to be\ndeployed in the real world, yet do not always make correct or safe decisions.\nOne way of addressing these concerns is to leverage AI control systems\nalongside and in support of human decisions, relying on the AI control system\nin safe situations while calling on a human co-decider for critical situations.\nWe extend a methodology for adversarial explanations (AE) to state-of-the-art\nreinforcement learning frameworks, including MuZero. Multiple improvements to\nthe base agent architecture are proposed. We demonstrate how this technology\nhas two applications: for intelligent decision tools and to enhance training /\nlearning frameworks. In a decision support context, adversarial explanations\nhelp a user make the correct decision by highlighting those contextual factors\nthat would need to change for a different AI-recommended decision. As another\nbenefit of adversarial explanations, we show that the learned AI control system\ndemonstrates robustness against adversarial tampering. Additionally, we\nsupplement AE by introducing strategically similar autoencoders (SSAs) to help\nusers identify and understand all salient factors being considered by the AI\nsystem. In a training / learning framework, this technology can improve both\nthe AI's decisions and explanations through human interaction. Finally, to\nidentify when AI decisions would most benefit from human oversight, we tie this\ncombined system to our prior art on statistically verified analyses of the\ncriticality of decisions at any point in time.\n","authors":["Walt Woods","Alexander Grushin","Simon Khan","Alvaro Velasquez"],"pdf_url":"https://arxiv.org/pdf/2407.03210v1.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.00201v2","updated":"2024-07-03T15:37:54Z","published":"2024-06-28T19:13:48Z","title":"Deconvolving Complex Neuronal Networks into Interpretable Task-Specific\n Connectomes","summary":" Task-specific functional MRI (fMRI) images provide excellent modalities for\nstudying the neuronal basis of cognitive processes. 
We use fMRI data to\nformulate and solve the problem of deconvolving task-specific aggregate\nneuronal networks into a set of basic building blocks called canonical\nnetworks, to use these networks for functional characterization, and to\ncharacterize the physiological basis of these responses by mapping them to\nregions of the brain. Our results show excellent task-specificity of canonical\nnetworks, i.e., the expression of a small number of canonical networks can be\nused to accurately predict tasks; generalizability across cohorts, i.e.,\ncanonical networks are conserved across diverse populations, studies, and\nacquisition protocols; and that canonical networks have strong anatomical and\nphysiological basis. From a methods perspective, the problem of identifying\nthese canonical networks poses challenges rooted in the high dimensionality,\nsmall sample size, acquisition variability, and noise. Our deconvolution\ntechnique is based on non-negative matrix factorization (NMF) that identifies\ncanonical networks as factors of a suitably constructed matrix. We demonstrate\nthat our method scales to large datasets, yields stable and accurate factors,\nand is robust to noise.\n","authors":["Yifan Wang","Vikram Ravindra","Ananth Grama"],"pdf_url":"https://arxiv.org/pdf/2407.00201v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.03195v1","updated":"2024-07-03T15:26:34Z","published":"2024-07-03T15:26:34Z","title":"Incremental Gauss--Newton Methods with Superlinear Convergence Rates","summary":" This paper addresses the challenge of solving large-scale nonlinear equations\nwith H\\\"older continuous Jacobians. We introduce a novel Incremental\nGauss--Newton (IGN) method within explicit superlinear convergence rate, which\noutperforms existing methods that only achieve linear convergence rate. In\nparticular, we formulate our problem by the nonlinear least squares with\nfinite-sum structure, and our method incrementally iterates with the\ninformation of one component in each round. We also provide a mini-batch\nextension to our IGN method that obtains an even faster superlinear convergence\nrate. Furthermore, we conduct numerical experiments to show the advantages of\nthe proposed methods.\n","authors":["Zhiling Zhou","Zhuanghua Liu","Chengchang Liu","Luo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.03195v1.pdf","comment":"37 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.03194v1","updated":"2024-07-03T15:26:02Z","published":"2024-07-03T15:26:02Z","title":"Prediction Instability in Machine Learning Ensembles","summary":" In machine learning ensembles predictions from multiple models are\naggregated. Despite widespread use and strong performance of ensembles in\napplied problems little is known about the mathematical properties of\naggregating models and associated consequences for safe, explainable use of\nsuch models. In this paper we prove a theorem that shows that any ensemble will\nexhibit at least one of the following forms of prediction instability. It will\neither ignore agreement among all underlying models, change its mind when none\nof the underlying models have done so, or be manipulable through inclusion or\nexclusion of options it would never actually predict. As a consequence,\nensemble aggregation procedures will always need to balance the benefits of\ninformation use against the risk of these prediction instabilities. 
This\nanalysis also sheds light on what specific forms of prediction instability to\nexpect from particular ensemble algorithms; for example popular tree ensembles\nlike random forest, or xgboost will violate basic, intuitive monotonicity and\nfairness properties.\n","authors":["Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2407.03194v1.pdf","comment":"15 pages, uses a modified version of ICML2024.sty"},{"id":"http://arxiv.org/abs/2310.10649v3","updated":"2024-07-03T15:23:42Z","published":"2023-10-16T17:59:54Z","title":"A Computational Framework for Solving Wasserstein Lagrangian Flows","summary":" The dynamical formulation of the optimal transport can be extended through\nvarious choices of the underlying geometry (kinetic energy), and the\nregularization of density paths (potential energy). These combinations yield\ndifferent variational problems (Lagrangians), encompassing many variations of\nthe optimal transport problem such as the Schr\\\"odinger bridge, unbalanced\noptimal transport, and optimal transport with physical constraints, among\nothers. In general, the optimal density path is unknown, and solving these\nvariational problems can be computationally challenging. We propose a novel\ndeep learning based framework approaching all of these problems from a unified\nperspective. Leveraging the dual formulation of the Lagrangians, our method\ndoes not require simulating or backpropagating through the trajectories of the\nlearned dynamics, and does not need access to optimal couplings. We showcase\nthe versatility of the proposed framework by outperforming previous approaches\nfor the single-cell trajectory inference, where incorporating prior knowledge\ninto the dynamics is crucial for correct predictions.\n","authors":["Kirill Neklyudov","Rob Brekelmans","Alexander Tong","Lazar Atanackovic","Qiang Liu","Alireza Makhzani"],"pdf_url":"https://arxiv.org/pdf/2310.10649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06737v2","updated":"2024-07-03T15:15:46Z","published":"2023-10-10T16:07:23Z","title":"Multi-domain improves out-of-distribution and data-limited scenarios for\n medical image analysis","summary":" Current machine learning methods for medical image analysis primarily focus\non developing models tailored for their specific tasks, utilizing data within\ntheir target domain. These specialized models tend to be data-hungry and often\nexhibit limitations in generalizing to out-of-distribution samples. In this\nwork, we show that employing models that incorporate multiple domains instead\nof specialized ones significantly alleviates the limitations observed in\nspecialized models. We refer to this approach as multi-domain model and compare\nits performance to that of specialized models. For this, we introduce the\nincorporation of diverse medical image domains, including different imaging\nmodalities like X-ray, MRI, CT, and ultrasound images, as well as various\nviewpoints such as axial, coronal, and sagittal views. Our findings underscore\nthe superior generalization capabilities of multi-domain models, particularly\nin scenarios characterized by limited data availability and\nout-of-distribution, frequently encountered in healthcare applications. The\nintegration of diverse data allows multi-domain models to utilize information\nacross domains, enhancing the overall outcomes substantially. 
To illustrate,\nfor organ recognition, multi-domain model can enhance accuracy by up to 8%\ncompared to conventional specialized models.\n","authors":["Ece Ozkan","Xavier Boix"],"pdf_url":"https://arxiv.org/pdf/2310.06737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01910v2","updated":"2024-07-03T15:15:20Z","published":"2024-07-02T03:21:24Z","title":"MG-Verilog: Multi-grained Dataset Towards Enhanced LLM-assisted Verilog\n Generation","summary":" Large Language Models (LLMs) have recently shown promise in streamlining\nhardware design processes by encapsulating vast amounts of domain-specific\ndata. In addition, they allow users to interact with the design processes\nthrough natural language instructions, thus making hardware design more\naccessible to developers. However, effectively leveraging LLMs in hardware\ndesign necessitates providing domain-specific data during inference (e.g.,\nthrough in-context learning), fine-tuning, or pre-training. Unfortunately,\nexisting publicly available hardware datasets are often limited in size,\ncomplexity, or detail, which hinders the effectiveness of LLMs in hardware\ndesign tasks. To address this issue, we first propose a set of criteria for\ncreating high-quality hardware datasets that can effectively enhance\nLLM-assisted hardware design. Based on these criteria, we propose a\nMulti-Grained-Verilog (MG-Verilog) dataset, which encompasses descriptions at\nvarious levels of detail and corresponding code samples. To benefit the broader\nhardware design community, we have developed an open-source infrastructure that\nfacilitates easy access, integration, and extension of the dataset to meet\nspecific project needs. Furthermore, to fully exploit the potential of the\nMG-Verilog dataset, which varies in complexity and detail, we introduce a\nbalanced fine-tuning scheme. This scheme serves as a unique use case to\nleverage the diverse levels of detail provided by the dataset. Extensive\nexperiments demonstrate that the proposed dataset and fine-tuning scheme\nconsistently improve the performance of LLMs in hardware design tasks.\n","authors":["Yongan Zhang","Zhongzhi Yu","Yonggan Fu","Cheng Wan","Yingyan Celine Lin"],"pdf_url":"https://arxiv.org/pdf/2407.01910v2.pdf","comment":"Accepted in ISLAD 2024"},{"id":"http://arxiv.org/abs/2407.03185v1","updated":"2024-07-03T15:07:16Z","published":"2024-07-03T15:07:16Z","title":"Multiple-Resolution Tokenization for Time Series Forecasting with an\n Application to Pricing","summary":" We propose a transformer architecture for time series forecasting with a\nfocus on time series tokenisation and apply it to a real-world prediction\nproblem from the pricing domain. Our architecture aims to learn effective\nrepresentations at many scales across all available data simultaneously. The\nmodel contains a number of novel modules: a differentiated form of time series\npatching which employs multiple resolutions, a multiple-resolution module for\ntime-varying known variables, a mixer-based module for capturing cross-series\ninformation, and a novel output head with favourable scaling to account for the\nincreased number of tokens. We present an application of this model to a real\nworld prediction problem faced by the markdown team at a very large retailer.\nOn the experiments conducted our model outperforms in-house models and the\nselected existing deep learning architectures.\n","authors":["Egon Peršak","Miguel F. 
Anjos","Sebastian Lautz","Aleksandar Kolev"],"pdf_url":"https://arxiv.org/pdf/2407.03185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03179v1","updated":"2024-07-03T14:59:46Z","published":"2024-07-03T14:59:46Z","title":"Motion meets Attention: Video Motion Prompts","summary":" Videos contain rich spatio-temporal information. Traditional methods for\nextracting motion, used in tasks such as action recognition, often rely on\nvisual contents rather than precise motion features. This phenomenon is\nreferred to as 'blind motion extraction' behavior, which proves inefficient in\ncapturing motions of interest due to a lack of motion-guided cues. Recently,\nattention mechanisms have enhanced many computer vision tasks by effectively\nhighlighting salient visual areas. Inspired by this, we propose using a\nmodified Sigmoid function with learnable slope and shift parameters as an\nattention mechanism to activate and modulate motion signals derived from frame\ndifferencing maps. This approach generates a sequence of attention maps that\nenhance the processing of motion-related video content. To ensure temporally\ncontinuity and smoothness of the attention maps, we apply pair-wise temporal\nattention variation regularization to remove unwanted motions (e.g., noise)\nwhile preserving important ones. We then perform Hadamard product between each\npair of attention maps and the original video frames to highlight the evolving\nmotions of interest over time. These highlighted motions, termed video motion\nprompts, are subsequently used as inputs to the model instead of the original\nvideo frames. We formalize this process as a motion prompt layer and\nincorporate the regularization term into the loss function to learn better\nmotion prompts. This layer serves as an adapter between the model and the video\ndata, bridging the gap between traditional 'blind motion extraction' and the\nextraction of relevant motions of interest.\n","authors":["Qixiang Chen","Lei Wang","Piotr Koniusz","Tom Gedeon"],"pdf_url":"https://arxiv.org/pdf/2407.03179v1.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2407.03178v1","updated":"2024-07-03T14:58:40Z","published":"2024-07-03T14:58:40Z","title":"Relating CNN-Transformer Fusion Network for Change Detection","summary":" While deep learning, particularly convolutional neural networks (CNNs), has\nrevolutionized remote sensing (RS) change detection (CD), existing approaches\noften miss crucial features due to neglecting global context and incomplete\nchange learning. Additionally, transformer networks struggle with low-level\ndetails. 
RCTNet addresses these limitations by introducing \\textbf{(1)} an\nearly fusion backbone to exploit both spatial and temporal features early on,\n\\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal\nrepresentation, \\textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for\nenriched feature extraction in the decoder, and \\textbf{(4)} an Efficient\nSelf-deciphering Attention (ESA) module utilizing transformers to capture\nglobal information and fine-grained details for accurate change detection.\nExtensive experiments demonstrate RCTNet's clear superiority over traditional\nRS image CD methods, showing significant improvement and an optimal balance\nbetween accuracy and computational cost.\n","authors":["Yuhao Gao","Gensheng Pei","Mengmeng Sheng","Zeren Sun","Tao Chen","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.03178v1.pdf","comment":"accepted by IEEE Conference on Multimedia Expo"},{"id":"http://arxiv.org/abs/2404.11477v3","updated":"2024-07-03T14:47:09Z","published":"2024-04-17T15:32:58Z","title":"Discovering Nuclear Models from Symbolic Machine Learning","summary":" Numerous phenomenological nuclear models have been proposed to describe\nspecific observables within different regions of the nuclear chart. However,\ndeveloping a unified model that describes the complex behavior of all nuclei\nremains an open challenge. Here, we explore whether novel symbolic Machine\nLearning (ML) can rediscover traditional nuclear physics models or identify\nalternatives with improved simplicity, fidelity, and predictive power. To\naddress this challenge, we developed a Multi-objective Iterated Symbolic\nRegression approach that handles symbolic regressions over multiple target\nobservables, accounts for experimental uncertainties and is robust against\nhigh-dimensional problems. As a proof of principle, we applied this method to\ndescribe the nuclear binding energies and charge radii of light and medium mass\nnuclei. Our approach identified simple analytical relationships based on the\nnumber of protons and neutrons, providing interpretable models with precision\ncomparable to state-of-the-art nuclear models. Additionally, we integrated this\nML-discovered model with an existing complementary model to estimate the limits\nof nuclear stability. These results highlight the potential of symbolic ML to\ndevelop accurate nuclear models and guide our description of complex many-body\nproblems.\n","authors":["Jose M. Munoz","Silviu M. Udrescu","Ronald F. Garcia Ruiz"],"pdf_url":"https://arxiv.org/pdf/2404.11477v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13022v2","updated":"2024-07-03T14:46:52Z","published":"2024-05-15T13:35:43Z","title":"LLMs can learn self-restraint through iterative self-reflection","summary":" In order to be deployed safely, Large Language Models (LLMs) must be capable\nof dynamically adapting their behavior based on their level of knowledge and\nuncertainty associated with specific topics. This adaptive behavior, which we\nrefer to as self-restraint, is non-trivial to teach since it depends on the\ninternal knowledge of an LLM. By default, LLMs are trained to maximize the next\ntoken likelihood, which does not teach the model to modulate its answer based\non its level of uncertainty. In order to learn self-restraint, we devise a\nutility function that can encourage the model to produce responses only when it\nis confident in them. This utility function can be used to score generation of\ndifferent length and abstention. 
To optimize this function, we introduce\nReSearch, a process of \"self-reflection\" consisting of iterative self-prompting\nand self-evaluation. We use the ReSearch algorithm to generate synthetic data\non which we finetune our models. Compared to their original versions, our\nresulting models generate fewer \\emph{hallucinations} overall at no additional\ninference cost, for both known and unknown topics, as the model learns to\nselectively restrain itself. In addition, our method elegantly incorporates the\nability to abstain by augmenting the samples generated by the model during the\nsearch procedure with an answer expressing abstention.\n","authors":["Alexandre Piché","Aristides Milios","Dzmitry Bahdanau","Chris Pal"],"pdf_url":"https://arxiv.org/pdf/2405.13022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03162v1","updated":"2024-07-03T14:35:35Z","published":"2024-07-03T14:35:35Z","title":"Bunny-VisionPro: Real-Time Bimanual Dexterous Teleoperation for\n Imitation Learning","summary":" Teleoperation is a crucial tool for collecting human demonstrations, but\ncontrolling robots with bimanual dexterous hands remains a challenge. Existing\nteleoperation systems struggle to handle the complexity of coordinating two\nhands for intricate manipulations. We introduce Bunny-VisionPro, a real-time\nbimanual dexterous teleoperation system that leverages a VR headset. Unlike\nprevious vision-based teleoperation systems, we design novel low-cost devices\nto provide haptic feedback to the operator, enhancing immersion. Our system\nprioritizes safety by incorporating collision and singularity avoidance while\nmaintaining real-time performance through innovative designs. Bunny-VisionPro\noutperforms prior systems on a standard task suite, achieving higher success\nrates and reduced task completion times. Moreover, the high-quality\nteleoperation demonstrations improve downstream imitation learning performance,\nleading to better generalizability. Notably, Bunny-VisionPro enables imitation\nlearning with challenging multi-stage, long-horizon dexterous manipulation\ntasks, which have rarely been addressed in previous work. Our system's ability\nto handle bimanual manipulations while prioritizing safety and real-time\nperformance makes it a powerful tool for advancing dexterous manipulation and\nimitation learning.\n","authors":["Runyu Ding","Yuzhe Qin","Jiyue Zhu","Chengzhe Jia","Shiqi Yang","Ruihan Yang","Xiaojuan Qi","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03162v1.pdf","comment":"project page: https://dingry.github.io/projects/bunny_visionpro.html"},{"id":"http://arxiv.org/abs/2407.03160v1","updated":"2024-07-03T14:35:16Z","published":"2024-07-03T14:35:16Z","title":"SOS! Soft Prompt Attack Against Open-Source Large Language Models","summary":" Open-source large language models (LLMs) have become increasingly popular\namong both the general public and industry, as they can be customized,\nfine-tuned, and freely used. However, some open-source LLMs require approval\nbefore usage, which has led to third parties publishing their own easily\naccessible versions. Similarly, third parties have been publishing fine-tuned\nor quantized variants of these LLMs. These versions are particularly appealing\nto users because of their ease of access and reduced computational resource\ndemands. This trend has increased the risk of training time attacks,\ncompromising the integrity and security of LLMs. 
In this work, we present a new\ntraining time attack, SOS, which is designed to be low in computational demand\nand does not require clean data or modification of the model weights, thereby\nmaintaining the model's utility intact. The attack addresses security issues in\nvarious scenarios, including the backdoor attack, jailbreak attack, and prompt\nstealing attack. Our experimental findings demonstrate that the proposed attack\nis effective across all evaluated targets. Furthermore, we present the other\nside of our SOS technique, namely the copyright token -- a novel technique that\nenables users to mark their copyrighted content and prevent models from using\nit.\n","authors":["Ziqing Yang","Michael Backes","Yang Zhang","Ahmed Salem"],"pdf_url":"https://arxiv.org/pdf/2407.03160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03157v1","updated":"2024-07-03T14:34:03Z","published":"2024-07-03T14:34:03Z","title":"Let the Code LLM Edit Itself When You Edit the Code","summary":" In this work, we investigate a typical scenario in code generation where a\ndeveloper edits existing code in real time and requests a code assistant, e.g.,\na large language model, to re-predict the next token or next line on the fly.\nNaively, the LLM needs to re-encode the entire KV cache to provide an accurate\nprediction. However, this process is computationally expensive, especially when\nthe sequence length is long. Simply encoding the edited subsequence and\nintegrating it to the original KV cache meets the temporal confusion problem,\nleading to significantly worse performance. We address this efficiency and\naccuracy trade-off by introducing \\underline{\\textbf{Positional\n\\textbf{I}ntegrity \\textbf{E}ncoding} (PIE). Building upon the rotary\npositional encoding, PIE first removes the rotary matrices in the Key cache\nthat introduce temporal confusion and then reapplies the correct rotary\nmatrices. This process ensures that positional relationships between tokens are\ncorrect and requires only a single round of matrix multiplication. We validate\nthe effectiveness of PIE through extensive experiments on the RepoBench-C-8k\ndataset, utilizing DeepSeek-Coder models with 1.3B, 6.7B, and 33B parameters.\nOur evaluation includes three real-world coding tasks: code insertion, code\ndeletion, and multi-place code editing. Results demonstrate that PIE reduces\ncomputational overhead by over 85% compared to the standard full recomputation\napproach across all model sizes and tasks while well approximating the model\nperformance.\n","authors":["Zhenyu He","Jun Zhang","Shengjie Luo","Jingjing Xu","Zhi Zhang","Di He"],"pdf_url":"https://arxiv.org/pdf/2407.03157v1.pdf","comment":"Preprint. Work in Progress"},{"id":"http://arxiv.org/abs/2407.03154v1","updated":"2024-07-03T14:31:36Z","published":"2024-07-03T14:31:36Z","title":"Reinforcement Learning for Sequence Design Leveraging Protein Language\n Models","summary":" Protein sequence design, determined by amino acid sequences, are essential to\nprotein engineering problems in drug discovery. Prior approaches have resorted\nto evolutionary strategies or Monte-Carlo methods for protein design, but often\nfail to exploit the structure of the combinatorial search space, to generalize\nto unseen sequences. In the context of discrete black box optimization over\nlarge search spaces, learning a mutation policy to generate novel sequences\nwith reinforcement learning is appealing. 
Recent advances in protein language\nmodels (PLMs) trained on large corpora of protein sequences offer a potential\nsolution to this problem by scoring proteins according to their biological\nplausibility (such as the TM-score). In this work, we propose to use PLMs as a\nreward function to generate new sequences. Yet the PLM can be computationally\nexpensive to query due to its large size. To this end, we propose an\nalternative paradigm where optimization can be performed on scores from a\nsmaller proxy model that is periodically finetuned, jointly while learning the\nmutation policy. We perform extensive experiments on various sequence lengths\nto benchmark RL-based approaches, and provide comprehensive evaluations along\nbiological plausibility and diversity of the protein. Our experimental results\ninclude favorable evaluations of the proposed sequences, along with high\ndiversity scores, demonstrating that RL is a strong candidate for biological\nsequence design. Finally, we provide a modular open source implementation can\nbe easily integrated in most RL training loops, with support for replacing the\nreward model with other PLMs, to spur further research in this domain. The code\nfor all experiments is provided in the supplementary material.\n","authors":["Jithendaraa Subramanian","Shivakanth Sujit","Niloy Irtisam","Umong Sain","Derek Nowrouzezahrai","Samira Ebrahimi Kahou","Riashat Islam"],"pdf_url":"https://arxiv.org/pdf/2407.03154v1.pdf","comment":"22 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.03152v1","updated":"2024-07-03T14:30:47Z","published":"2024-07-03T14:30:47Z","title":"Stereo Risk: A Continuous Modeling Approach to Stereo Matching","summary":" We introduce Stereo Risk, a new deep-learning approach to solve the classical\nstereo-matching problem in computer vision. As it is well-known that stereo\nmatching boils down to a per-pixel disparity estimation problem, the popular\nstate-of-the-art stereo-matching approaches widely rely on regressing the scene\ndisparity values, yet via discretization of scene disparity values. Such\ndiscretization often fails to capture the nuanced, continuous nature of scene\ndepth. Stereo Risk departs from the conventional discretization approach by\nformulating the scene disparity as an optimal solution to a continuous risk\nminimization problem, hence the name \"stereo risk\". We demonstrate that $L^1$\nminimization of the proposed continuous risk function enhances stereo-matching\nperformance for deep networks, particularly for disparities with multi-modal\nprobability distributions. Furthermore, to enable the end-to-end network\ntraining of the non-differentiable $L^1$ risk optimization, we exploited the\nimplicit function theorem, ensuring a fully differentiable network. A\ncomprehensive analysis demonstrates our method's theoretical soundness and\nsuperior performance over the state-of-the-art methods across various benchmark\ndatasets, including KITTI 2012, KITTI 2015, ETH3D, SceneFlow, and Middlebury\n2014.\n","authors":["Ce Liu","Suryansh Kumar","Shuhang Gu","Radu Timofte","Yao Yao","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2407.03152v1.pdf","comment":"Accepted as an Oral Paper at ICML 2024. 
Draft info: 18 pages, 6\n Figure, 16 Tables"},{"id":"http://arxiv.org/abs/2403.19887v2","updated":"2024-07-03T14:30:33Z","published":"2024-03-28T23:55:06Z","title":"Jamba: A Hybrid Transformer-Mamba Language Model","summary":" We present Jamba, a new base large language model based on a novel hybrid\nTransformer-Mamba mixture-of-experts (MoE) architecture. Specifically, Jamba\ninterleaves blocks of Transformer and Mamba layers, enjoying the benefits of\nboth model families. MoE is added in some of these layers to increase model\ncapacity while keeping active parameter usage manageable. This flexible\narchitecture allows resource- and objective-specific configurations. In the\nparticular configuration we have implemented, we end up with a powerful model\nthat fits in a single 80GB GPU. Built at large scale, Jamba provides high\nthroughput and small memory footprint compared to vanilla Transformers, and at\nthe same time state-of-the-art performance on standard language model\nbenchmarks and long-context evaluations. Remarkably, the model presents strong\nresults for up to 256K tokens context length. We study various architectural\ndecisions, such as how to combine Transformer and Mamba layers, and how to mix\nexperts, and show that some of them are crucial in large scale modeling. We\nalso describe several interesting properties of these architectures which the\ntraining and evaluation of Jamba have revealed, and plan to release checkpoints\nfrom various ablation runs, to encourage further exploration of this novel\narchitecture. We make the weights of our implementation of Jamba publicly\navailable under a permissive license.\n","authors":["Opher Lieber","Barak Lenz","Hofit Bata","Gal Cohen","Jhonathan Osin","Itay Dalmedigos","Erez Safahi","Shaked Meirom","Yonatan Belinkov","Shai Shalev-Shwartz","Omri Abend","Raz Alon","Tomer Asida","Amir Bergman","Roman Glozman","Michael Gokhman","Avashalom Manevich","Nir Ratner","Noam Rozen","Erez Shwartz","Mor Zusman","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2403.19887v2.pdf","comment":"Webpage: https://www.ai21.com/jamba"},{"id":"http://arxiv.org/abs/2402.15171v2","updated":"2024-07-03T14:29:43Z","published":"2024-02-23T08:07:54Z","title":"Towards Efficient and Optimal Covariance-Adaptive Algorithms for\n Combinatorial Semi-Bandits","summary":" We address the problem of stochastic combinatorial semi-bandits, where a\nplayer selects among $P$ actions from the power set of a set containing $d$\nbase items. Adaptivity to the problem's structure is essential in order to\nobtain optimal regret upper bounds. As estimating the coefficients of a\ncovariance matrix can be manageable in practice, leveraging them should improve\nthe regret. We design ``optimistic'' covariance-adaptive algorithms relying on\nonline estimations of the covariance structure, called OLSUCBC and COSV (only\nthe variances for the latter). They both yields improved gap-free regret.\nAlthough COSV can be slightly suboptimal, it improves on computational\ncomplexity by taking inspiration from Thompson Sampling approaches. It is the\nfirst sampling-based algorithm satisfying a $\\sqrt{T}$ gap-free regret (up to\npoly-logs). 
We also show that in some cases, our approach efficiently leverages\nthe semi-bandit feedback and outperforms bandit feedback approaches, not only\nin exponential regimes where $P\\gg d$ but also when $P\\leq d$, which is not\ncovered by existing analyses.\n","authors":["Julien Zhou","Pierre Gaillard","Thibaud Rahier","Houssam Zenati","Julyan Arbel"],"pdf_url":"https://arxiv.org/pdf/2402.15171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09009v4","updated":"2024-07-03T14:24:39Z","published":"2024-06-13T11:29:21Z","title":"Fredformer: Frequency Debiased Transformer for Time Series Forecasting","summary":" The Transformer model has shown leading performance in time series\nforecasting. Nevertheless, in some complex scenarios, it tends to learn\nlow-frequency features in the data and overlook high-frequency features,\nshowing a frequency bias. This bias prevents the model from accurately\ncapturing important high-frequency data features. In this paper, we undertook\nempirical analyses to understand this bias and discovered that frequency bias\nresults from the model disproportionately focusing on frequency features with\nhigher energy. Based on our analysis, we formulate this bias and propose\nFredformer, a Transformer-based framework designed to mitigate frequency bias\nby learning features equally across different frequency bands. This approach\nprevents the model from overlooking lower amplitude features important for\naccurate forecasting. Extensive experiments show the effectiveness of our\nproposed approach, which can outperform other baselines in different real-world\ntime-series datasets. Furthermore, we introduce a lightweight variant of the\nFredformer with an attention matrix approximation, which achieves comparable\nperformance but with much fewer parameters and lower computation costs. The\ncode is available at: https://github.com/chenzRG/Fredformer\n","authors":["Xihao Piao","Zheng Chen","Taichi Murayama","Yasuko Matsubara","Yasushi Sakurai"],"pdf_url":"https://arxiv.org/pdf/2406.09009v4.pdf","comment":"This paper has been accepted by SIGKDD2024"},{"id":"http://arxiv.org/abs/2407.03132v1","updated":"2024-07-03T14:13:04Z","published":"2024-07-03T14:13:04Z","title":"Speaker- and Text-Independent Estimation of Articulatory Movements and\n Phoneme Alignments from Speech","summary":" This paper introduces a novel combination of two tasks, previously treated\nseparately: acoustic-to-articulatory speech inversion (AAI) and\nphoneme-to-articulatory (PTA) motion estimation. We refer to this joint task as\nacoustic phoneme-to-articulatory speech inversion (APTAI) and explore two\ndifferent approaches, both working speaker- and text-independently during\ninference. We use a multi-task learning setup, with the end-to-end goal of\ntaking raw speech as input and estimating the corresponding articulatory\nmovements, phoneme sequence, and phoneme alignment. While both proposed\napproaches share these same requirements, they differ in their way of achieving\nphoneme-related predictions: one is based on frame classification, the other on\na two-staged training procedure and forced alignment. 
We reach competitive\nperformance of 0.73 mean correlation for the AAI task and achieve up to\napproximately 87% frame overlap compared to a state-of-the-art text-dependent\nphoneme force aligner.\n","authors":["Tobias Weise","Philipp Klumpp","Kubilay Can Demir","Paula Andrea Pérez-Toro","Maria Schuster","Elmar Noeth","Bjoern Heismann","Andreas Maier","Seung Hee Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03132v1.pdf","comment":"to be published in Interspeech 2024 proceedings"},{"id":"http://arxiv.org/abs/2407.03125v1","updated":"2024-07-03T14:07:41Z","published":"2024-07-03T14:07:41Z","title":"Foundations and Frontiers of Graph Learning Theory","summary":" Recent advancements in graph learning have revolutionized the way to\nunderstand and analyze data with complex structures. Notably, Graph Neural\nNetworks (GNNs), i.e. neural network architectures designed for learning graph\nrepresentations, have become a popular paradigm. With these models being\nusually characterized by intuition-driven design or highly intricate\ncomponents, placing them within the theoretical analysis framework to distill\nthe core concepts, helps understand the key principles that drive the\nfunctionality better and guide further development. Given this surge in\ninterest, this article provides a comprehensive summary of the theoretical\nfoundations and breakthroughs concerning the approximation and learning\nbehaviors intrinsic to prevalent graph learning models. Encompassing\ndiscussions on fundamental aspects such as expressiveness power,\ngeneralization, optimization, and unique phenomena such as over-smoothing and\nover-squashing, this piece delves into the theoretical foundations and frontier\ndriving the evolution of graph learning. In addition, this article also\npresents several challenges and further initiates discussions on possible\nsolutions.\n","authors":["Yu Huang","Min Zhou","Menglin Yang","Zhen Wang","Muhan Zhang","Jie Wang","Hong Xie","Hao Wang","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2407.03125v1.pdf","comment":"36pages,273references"},{"id":"http://arxiv.org/abs/2406.14322v2","updated":"2024-07-03T14:05:20Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP motivated by applications where it\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03118v1","updated":"2024-07-03T14:04:05Z","published":"2024-07-03T14:04:05Z","title":"Can machine learning solve the challenge of adaptive learning and the\n individualization of learning paths? A field experiment in an online learning\n platform","summary":" The individualization of learning contents based on digital technologies\npromises large individual and social benefits. However, it remains an open\nquestion how this individualization can be implemented. To tackle this question\nwe conduct a randomized controlled trial on a large digital self-learning\nplatform. We develop an algorithm based on two convolutional neural networks\nthat assigns tasks to $4,365$ learners according to their learning paths.\nLearners are randomized into three groups: two treatment groups -- a\ngroup-based adaptive treatment group and an individual adaptive treatment group\n-- and one control group. We analyze the difference between the three groups\nwith respect to effort learners provide and their performance on the platform.\nOur null results shed light on the multiple challenges associated with the\nindividualization of learning paths.\n","authors":["Tim Klausmann","Marius Köppel","Daniel Schunk","Isabell Zipperle"],"pdf_url":"https://arxiv.org/pdf/2407.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05369v2","updated":"2024-07-03T13:53:06Z","published":"2024-02-08T02:58:47Z","title":"Noise Contrastive Alignment of Language Models with Explicit Rewards","summary":" User intentions are typically formalized as evaluation rewards to be\nmaximized when fine-tuning language models (LMs). Existing alignment methods,\nsuch as Direct Preference Optimization (DPO), are mainly tailored for pairwise\npreference data where rewards are implicitly defined rather than explicitly\ngiven. In this paper, we introduce a general framework for LM alignment,\nleveraging Noise Contrastive Estimation (NCE) to bridge the gap in handling\nreward datasets explicitly annotated with scalar evaluations. Our framework\ncomprises two parallel algorithms, NCA and InfoNCA, both enabling the direct\nextraction of an LM policy from reward data as well as preference data.\nNotably, we show that the DPO loss is a special case of our proposed InfoNCA\nobjective under pairwise preference settings, thereby integrating and extending\ncurrent alignment theories. By comparing NCA and InfoNCA, we demonstrate that\nthe well-observed decreasing-likelihood trend of DPO/InfoNCA is caused by their\nfocus on adjusting relative likelihood across different responses. In contrast,\nNCA optimizes the absolute likelihood for each response, thereby effectively\npreventing the chosen likelihood from decreasing. We evaluate our methods in\nboth reward and preference settings with Mistral-8*7B and 7B models.\nExperiments suggest that InfoNCA/NCA surpasses various preference baselines\nwhen reward datasets are available. 
We also find NCA significantly outperforms\nDPO in complex reasoning tasks like math and coding.\n","authors":["Huayu Chen","Guande He","Lifan Yuan","Ganqu Cui","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.05369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03108v1","updated":"2024-07-03T13:47:41Z","published":"2024-07-03T13:47:41Z","title":"How Reliable and Stable are Explanations of XAI Methods?","summary":" Black box models are increasingly being used in the daily lives of human\nbeings living in society. Along with this increase, there has been the\nemergence of Explainable Artificial Intelligence (XAI) methods aimed at\ngenerating additional explanations regarding how the model makes certain\npredictions. In this sense, methods such as Dalex, Eli5, eXirt, Lofo and Shap\nemerged as different proposals and methodologies for generating explanations of\nblack box models in an agnostic way. Along with the emergence of these methods,\nquestions arise such as \"How Reliable and Stable are XAI Methods?\". With the\naim of shedding light on this main question, this research creates a pipeline\nthat performs experiments using the diabetes dataset and four different machine\nlearning models (LGBM, MLP, DT and KNN), creating different levels of\nperturbations of the test data and finally generates explanations from the\neXirt method regarding the confidence of the models and also feature relevances\nranks from all XAI methods mentioned, in order to measure their stability in\nthe face of perturbations. As a result, it was found that eXirt was able to\nidentify the most reliable models among all those used. It was also found that\ncurrent XAI methods are sensitive to perturbations, with the exception of one\nspecific method.\n","authors":["José Ribeiro","Lucas Cardoso","Vitor Santos","Eduardo Carvalho","Níkolas Carneiro","Ronnie Alves"],"pdf_url":"https://arxiv.org/pdf/2407.03108v1.pdf","comment":"15 pages, 6 figures, submitted to BRACIS 2024"},{"id":"http://arxiv.org/abs/2402.13228v2","updated":"2024-07-03T13:46:33Z","published":"2024-02-20T18:42:34Z","title":"Smaug: Fixing Failure Modes of Preference Optimisation with DPO-Positive","summary":" Direct Preference Optimisation (DPO) is effective at significantly improving\nthe performance of large language models (LLMs) on downstream tasks such as\nreasoning, summarisation, and alignment. Using pairs of preferred and\ndispreferred data, DPO models the relative probability of picking one response\nover another. In this work, first we show theoretically that the standard DPO\nloss can lead to a reduction of the model's likelihood of the preferred\nexamples, as long as the relative probability between the preferred and\ndispreferred classes increases. We then show empirically that this phenomenon\noccurs when fine-tuning LLMs on common datasets, especially datasets in which\nthe edit distance between pairs of completions is low. Using these insights, we\ndesign DPO-Positive (DPOP), a new loss function and training procedure which\navoids this failure mode. Surprisingly, we find that DPOP outperforms DPO and\nother fine-tuning procedures across a wide variety of datasets and downstream\ntasks, including datasets with high edit distances between completions.\nFurthermore, we find that the DPOP-tuned model outperforms the DPO-tuned model\n(all else equal) on benchmarks independent of the fine-tuning data, such as\nMT-Bench. 
Finally, using DPOP, we create and open-source Smaug-34B and\nSmaug-72B, with the latter becoming the first open-source LLM to surpass an\naverage accuracy of 80% on the HuggingFace Open LLM Leaderboard.\n","authors":["Arka Pal","Deep Karkhanis","Samuel Dooley","Manley Roberts","Siddartha Naidu","Colin White"],"pdf_url":"https://arxiv.org/pdf/2402.13228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00849v2","updated":"2024-07-03T13:43:56Z","published":"2024-02-29T16:10:21Z","title":"NeuraLUT: Hiding Neural Network Density in Boolean Synthesizable\n Functions","summary":" Field-Programmable Gate Array (FPGA) accelerators have proven successful in\nhandling latency- and resource-critical deep neural network (DNN) inference\ntasks. Among the most computationally intensive operations in a neural network\n(NN) is the dot product between the feature and weight vectors. Thus, some\nprevious FPGA acceleration works have proposed mapping neurons with quantized\ninputs and outputs directly to lookup tables (LUTs) for hardware\nimplementation. In these works, the boundaries of the neurons coincide with the\nboundaries of the LUTs. We propose relaxing these boundaries and mapping entire\nsub-networks to a single LUT. As the sub-networks are absorbed within the LUT,\nthe NN topology and precision within a partition do not affect the size of the\nlookup tables generated. Therefore, we utilize fully connected layers with\nfloating-point precision inside each partition, which benefit from being\nuniversal function approximators, but with rigid sparsity and quantization\nenforced between partitions, where the NN topology becomes exposed to the\ncircuit topology. Although cheap to implement, this approach can lead to very\ndeep NNs, and so to tackle challenges like vanishing gradients, we also\nintroduce skip connections inside the partitions. The resulting methodology can\nbe seen as training DNNs with a specific FPGA hardware-inspired sparsity\npattern that allows them to be mapped to much shallower circuit-level networks,\nthereby significantly improving latency. We validate our proposed method on a\nknown latency-critical task, jet substructure tagging, and on the classical\ncomputer vision task, digit classification using MNIST. Our approach allows for\ngreater function expressivity within the LUTs compared to existing work,\nleading to up to $4.3\\times$ lower latency NNs for the same accuracy.\n","authors":["Marta Andronic","George A. Constantinides"],"pdf_url":"https://arxiv.org/pdf/2403.00849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03105v1","updated":"2024-07-03T13:42:21Z","published":"2024-07-03T13:42:21Z","title":"On Generalization for Generative Flow Networks","summary":" Generative Flow Networks (GFlowNets) have emerged as an innovative learning\nparadigm designed to address the challenge of sampling from an unnormalized\nprobability distribution, called the reward function. This framework learns a\npolicy on a constructed graph, which enables sampling from an approximation of\nthe target probability distribution through successive steps of sampling from\nthe learned policy. To achieve this, GFlowNets can be trained with various\nobjectives, each of which can lead to the model s ultimate goal. The\naspirational strength of GFlowNets lies in their potential to discern intricate\npatterns within the reward function and their capacity to generalize\neffectively to novel, unseen parts of the reward function. 
This paper attempts\nto formalize generalization in the context of GFlowNets, to link generalization\nwith stability, and also to design experiments that assess the capacity of\nthese models to uncover unseen parts of the reward function. The experiments\nwill focus on length generalization meaning generalization to states that can\nbe constructed only by longer trajectories than those seen in training.\n","authors":["Anas Krichel","Nikolay Malkin","Salem Lahlou","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2407.03105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03094v1","updated":"2024-07-03T13:34:33Z","published":"2024-07-03T13:34:33Z","title":"Conformal Prediction for Causal Effects of Continuous Treatments","summary":" Uncertainty quantification of causal effects is crucial for safety-critical\napplications such as personalized medicine. A powerful approach for this is\nconformal prediction, which has several practical benefits due to\nmodel-agnostic finite-sample guarantees. Yet, existing methods for conformal\nprediction of causal effects are limited to binary/discrete treatments and make\nhighly restrictive assumptions such as known propensity scores. In this work,\nwe provide a novel conformal prediction method for potential outcomes of\ncontinuous treatments. We account for the additional uncertainty introduced\nthrough propensity estimation so that our conformal prediction intervals are\nvalid even if the propensity score is unknown. Our contributions are\nthree-fold: (1) We derive finite-sample prediction intervals for potential\noutcomes of continuous treatments. (2) We provide an algorithm for calculating\nthe derived intervals. (3) We demonstrate the effectiveness of the conformal\nprediction intervals in experiments on synthetic and real-world datasets. To\nthe best of our knowledge, we are the first to propose conformal prediction for\ncontinuous treatments when the propensity score is unknown and must be\nestimated from data.\n","authors":["Maresa Schröder","Dennis Frauen","Jonas Schweisthal","Konstantin Heß","Valentyn Melnychuk","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2407.03094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03093v1","updated":"2024-07-03T13:34:30Z","published":"2024-07-03T13:34:30Z","title":"Revisiting the Performance of Deep Learning-Based Vulnerability\n Detection on Realistic Datasets","summary":" The impact of software vulnerabilities on everyday software systems is\nsignificant. Despite deep learning models being proposed for vulnerability\ndetection, their reliability is questionable. Prior evaluations show high\nrecall/F1 scores of up to 99%, but these models underperform in practical\nscenarios, particularly when assessed on entire codebases rather than just the\nfixing commit. This paper introduces Real-Vul, a comprehensive dataset\nrepresenting real-world scenarios for evaluating vulnerability detection\nmodels. Evaluating DeepWukong, LineVul, ReVeal, and IVDetect shows a\nsignificant drop in performance, with precision decreasing by up to 95\npercentage points and F1 scores by up to 91 points. Furthermore, Model\nperformance fluctuates based on vulnerability characteristics, with better F1\nscores for information leaks or code injection than for path resolution or\npredictable return values. The results highlight a significant performance gap\nthat needs addressing before deploying deep learning-based vulnerability\ndetection in practical settings. 
Overfitting is identified as a key issue, and\nan augmentation technique is proposed, potentially improving performance by up\nto 30%. Contributions include a dataset creation approach for better model\nevaluation, Real-Vul dataset, and empirical evidence of deep learning models\nstruggling in real-world settings.\n","authors":["Partha Chakraborty","Krishna Kanth Arumugam","Mahmoud Alfadel","Meiyappan Nagappan","Shane McIntosh"],"pdf_url":"https://arxiv.org/pdf/2407.03093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18470v2","updated":"2024-07-03T13:32:34Z","published":"2024-06-26T16:28:24Z","title":"UFRec: Integrating Uniformity and Frequency to Enhance Sequential\n Recommendations","summary":" Effective representation learning in sequential recommendation systems is\npivotal for precisely capturing user interaction patterns and enhancing\nrecommendation accuracy. Nonetheless, current methodologies largely focus on\nitem-to-item transitions, frequently overlooking the time intervals between\ninteractions, which are integral to understanding behavior pattern shifts.\nMoreover, critical interaction attributes like item frequency are often\nneglected. Our research indicates that sequences with more consistent time\nintervals and items with higher interaction frequency result in superior\npredictive performance. In contrast, sequences with non-uniform intervals\ncontribute to user interest drift, and infrequently interacted items are\nchallenging to model due to sparse data, posing unique challenges that existing\nmethods fail to adequately address. In this study, we introduce UFRec, an\ninnovative bidirectional enhancement method for sequential recommendations.\nUFRec harnesses sequence uniformity and item frequency to boost performance,\nparticularly improving the representation of non-uniform sequences and\nless-frequent items. These two components synergistically enhance each other,\ndriving holistic performance optimization in intricate sequential\nrecommendation scenarios. Additionally, we introduce a multidimensional time\nmodule to further augment adaptability. To the best of our knowledge, UFRec is\nthe pioneering method to exploit the properties of uniformity and frequency for\nfeature augmentation. Through comparisons with eleven state-of-the-art models\nacross four datasets, we demonstrate that UFRec significantly surpasses current\nleading models.\n","authors":["Yang Liu","Yitong Wang","Chenyue Feng"],"pdf_url":"https://arxiv.org/pdf/2406.18470v2.pdf","comment":"15 pages, 8 figures, for source code, see\n https://github.com/Linxi000/UniRec"},{"id":"http://arxiv.org/abs/2407.03089v1","updated":"2024-07-03T13:26:31Z","published":"2024-07-03T13:26:31Z","title":"Spatio-Temporal Adaptive Diffusion Models for EEG Super-Resolution in\n Epilepsy Diagnosis","summary":" Electroencephalogram (EEG) technology, particularly high-density EEG (HD EEG)\ndevices, is widely used in fields such as neuroscience. HD EEG devices improve\nthe spatial resolution of EEG by placing more electrodes on the scalp, meeting\nthe requirements of clinical diagnostic applications such as epilepsy focus\nlocalization. However, this technique faces challenges such as high acquisition\ncosts and limited usage scenarios. In this paper, spatio-temporal adaptive\ndiffusion models (STADMs) are proposed to pioneer the use of diffusion models\nfor achieving spatial SR reconstruction from low-resolution (LR, 64 channels or\nfewer) EEG to high-resolution (HR, 256 channels) EEG. 
Specifically, a\nspatio-temporal condition module is designed to extract the spatio-temporal\nfeatures of LR EEG, which then serve as conditional inputs to guide the reverse\ndenoising process of diffusion models. Additionally, a multi-scale Transformer\ndenoising module is constructed to leverage multi-scale convolution blocks and\ncross-attention-based diffusion Transformer blocks for conditional guidance to\ngenerate subject-adaptive SR EEG. Experimental results demonstrate that the\nproposed method effectively enhances the spatial resolution of LR EEG and\nquantitatively outperforms existing methods. Furthermore, STADMs demonstrate\ntheir value by applying synthetic SR EEG to classification and source\nlocalization tasks of epilepsy patients, indicating their potential to\nsignificantly improve the spatial resolution of LR EEG.\n","authors":["Tong Zhou","Shuqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02466v2","updated":"2024-07-03T13:24:02Z","published":"2024-07-02T17:47:03Z","title":"PWM: Policy Learning with Large World Models","summary":" Reinforcement Learning (RL) has achieved impressive results on complex tasks\nbut struggles in multi-task settings with different embodiments. World models\noffer scalability by learning a simulation of the environment, yet they often\nrely on inefficient gradient-free optimization methods. We introduce Policy\nlearning with large World Models (PWM), a novel model-based RL algorithm that\nlearns continuous control policies from large multi-task world models. By\npre-training the world model on offline data and using it for first-order\ngradient policy learning, PWM effectively solves tasks with up to 152 action\ndimensions and outperforms methods using ground-truth dynamics. Additionally,\nPWM scales to an 80-task setting, achieving up to 27% higher rewards than\nexisting baselines without the need for expensive online planning.\nVisualizations and code available at https://www.imgeorgiev.com/pwm\n","authors":["Ignat Georgiev","Varun Giridhar","Nicklas Hansen","Animesh Garg"],"pdf_url":"https://arxiv.org/pdf/2407.02466v2.pdf","comment":"Visualizations and code available at https://www.imgeorgiev.com/pwm"},{"id":"http://arxiv.org/abs/2407.03086v1","updated":"2024-07-03T13:15:12Z","published":"2024-07-03T13:15:12Z","title":"Effective Heterogeneous Federated Learning via Efficient\n Hypernetwork-based Weight Generation","summary":" While federated learning leverages distributed client resources, it faces\nchallenges due to heterogeneous client capabilities. This necessitates\nallocating models suited to clients' resources and careful parameter\naggregation to accommodate this heterogeneity. We propose HypeMeFed, a novel\nfederated learning framework for supporting client heterogeneity by combining a\nmulti-exit network architecture with hypernetwork-based model weight\ngeneration. This approach aligns the feature spaces of heterogeneous model\nlayers and resolves per-layer information disparity during weight aggregation.\nTo practically realize HypeMeFed, we also propose a low-rank factorization\napproach to minimize computation and memory overhead associated with\nhypernetworks. Our evaluations on a real-world heterogeneous device testbed\nindicate that HypeMeFed enhances accuracy by 5.12% over FedAvg, reduces the\nhypernetwork memory requirements by 98.22%, and accelerates its operations by\n1.86 times compared to a naive hypernetwork approach. 
These results demonstrate\nHypeMeFed's effectiveness in leveraging and engaging heterogeneous clients for\nfederated learning.\n","authors":["Yujin Shin","Kichang Lee","Sungmin Lee","You Rim Choi","Hyung-Sin Kim","JeongGil Ko"],"pdf_url":"https://arxiv.org/pdf/2407.03086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09933v3","updated":"2024-07-03T13:08:04Z","published":"2022-10-18T15:30:14Z","title":"Explanations Based on Item Response Theory (eXirt): A Model-Specific\n Method to Explain Tree-Ensemble Model in Trust Perspective","summary":" In recent years, XAI researchers have been formalizing proposals and\ndeveloping new methods to explain black box models, with no general consensus\nin the community on which method to use to explain these models, with this\nchoice being almost directly linked to the popularity of a specific method.\nMethods such as Ciu, Dalex, Eli5, Lofo, Shap and Skater emerged with the\nproposal to explain black box models through global rankings of feature\nrelevance, which based on different methodologies, generate global explanations\nthat indicate how the model's inputs explain its predictions. In this context,\n41 datasets, 4 tree-ensemble algorithms (Light Gradient Boosting, CatBoost,\nRandom Forest, and Gradient Boosting), and 6 XAI methods were used to support\nthe launch of a new XAI method, called eXirt, based on Item Response Theory -\nIRT and aimed at tree-ensemble black box models that use tabular data referring\nto binary classification problems. In the first set of analyses, the 164 global\nfeature relevance ranks of the eXirt were compared with 984 ranks of the other\nXAI methods present in the literature, seeking to highlight their similarities\nand differences. In a second analysis, exclusive explanations of the eXirt\nbased on Explanation-by-example were presented that help in understanding the\nmodel trust. Thus, it was verified that eXirt is able to generate global\nexplanations of tree-ensemble models and also local explanations of instances\nof models through IRT, showing how this consolidated theory can be used in\nmachine learning in order to obtain explainable and reliable models.\n","authors":["José Ribeiro","Lucas Cardoso","Raíssa Silva","Vitor Cirilo","Níkolas Carneiro","Ronnie Alves"],"pdf_url":"https://arxiv.org/pdf/2210.09933v3.pdf","comment":"59 pages, 16 Figures, 3 Equations, 6 table"},{"id":"http://arxiv.org/abs/2407.03082v1","updated":"2024-07-03T13:03:51Z","published":"2024-07-03T13:03:51Z","title":"Stable Heterogeneous Treatment Effect Estimation across\n Out-of-Distribution Populations","summary":" Heterogeneous treatment effect (HTE) estimation is vital for understanding\nthe change of treatment effect across individuals or subgroups. Most existing\nHTE estimation methods focus on addressing selection bias induced by imbalanced\ndistributions of confounders between treated and control units, but ignore\ndistribution shifts across populations. Thereby, their applicability has been\nlimited to the in-distribution (ID) population, which shares a similar\ndistribution with the training dataset. In real-world applications, where\npopulation distributions are subject to continuous changes, there is an urgent\nneed for stable HTE estimation across out-of-distribution (OOD) populations,\nwhich, however, remains an open problem. 
As pioneers in resolving this problem,\nwe propose a novel Stable Balanced Representation Learning with\nHierarchical-Attention Paradigm (SBRL-HAP) framework, which consists of 1)\nBalancing Regularizer for eliminating selection bias, 2) Independence\nRegularizer for addressing the distribution shift issue, 3)\nHierarchical-Attention Paradigm for coordination between balance and\nindependence. In this way, SBRL-HAP regresses counterfactual outcomes using ID\ndata, while ensuring the resulting HTE estimation can be successfully\ngeneralized to out-of-distribution scenarios, thereby enhancing the model's\napplicability in real-world settings. Extensive experiments conducted on\nsynthetic and real-world datasets demonstrate the effectiveness of our SBRL-HAP\nin achieving stable HTE estimation across OOD populations, with an average 10%\nreduction in the error metric PEHE and 11% decrease in the ATE bias, compared\nto the SOTA methods.\n","authors":["Yuling Zhang","Anpeng Wu","Kun Kuang","Liang Du","Zixun Sun","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03082v1.pdf","comment":"Accepted by ICDE'2024"},{"id":"http://arxiv.org/abs/2407.03080v1","updated":"2024-07-03T12:53:42Z","published":"2024-07-03T12:53:42Z","title":"Artificial Inductive Bias for Synthetic Tabular Data Generation in\n Data-Scarce Scenarios","summary":" While synthetic tabular data generation using Deep Generative Models (DGMs)\noffers a compelling solution to data scarcity and privacy concerns, their\neffectiveness relies on substantial training data, often unavailable in\nreal-world applications. This paper addresses this challenge by proposing a\nnovel methodology for generating realistic and reliable synthetic tabular data\nwith DGMs in limited real-data environments. Our approach proposes several ways\nto generate an artificial inductive bias in a DGM through transfer learning and\nmeta-learning techniques. We explore and compare four different methods within\nthis framework, demonstrating that transfer learning strategies like\npre-training and model averaging outperform meta-learning approaches, like\nModel-Agnostic Meta-Learning, and Domain Randomized Search. We validate our\napproach using two state-of-the-art DGMs, namely, a Variational Autoencoder and\na Generative Adversarial Network, to show that our artificial inductive bias\nfuels superior synthetic data quality, as measured by Jensen-Shannon\ndivergence, achieving relative gains of up to 50\\% when using our proposed\napproach. This methodology has broad applicability in various DGMs and machine\nlearning tasks, particularly in areas like healthcare and finance, where data\nscarcity is often a critical issue.\n","authors":["Patricia A. Apellániz","Ana Jiménez","Borja Arroyo Galende","Juan Parras","Santiago Zazo"],"pdf_url":"https://arxiv.org/pdf/2407.03080v1.pdf","comment":"19 pages, 6 Figures"},{"id":"http://arxiv.org/abs/2405.14527v2","updated":"2024-07-03T12:39:58Z","published":"2024-05-23T13:11:49Z","title":"ArchesWeather: An efficient AI weather forecasting model at 1.5°\n resolution","summary":" One of the guiding principles for designing AI-based weather forecasting\nsystems is to embed physical constraints as inductive priors in the neural\nnetwork architecture. A popular prior is locality, where the atmospheric data\nis processed with local neural interactions, like 3D convolutions or 3D local\nattention windows as in Pangu-Weather. 
On the other hand, some works have shown\ngreat success in weather forecasting without this locality principle, at the\ncost of a much higher parameter count. In this paper, we show that the 3D local\nprocessing in Pangu-Weather is computationally sub-optimal. We design\nArchesWeather, a transformer model that combines 2D attention with a\ncolumn-wise attention-based feature interaction module, and demonstrate that\nthis design improves forecasting skill.\n ArchesWeather is trained at 1.5{\\deg} resolution and 24h lead time, with a\ntraining budget of a few GPU-days and a lower inference cost than competing\nmethods. An ensemble of four of our models shows better RMSE scores than the\nIFS HRES and is competitive with the 1.4{\\deg} 50-members NeuralGCM ensemble\nfor one to three days ahead forecasting. Our code and models are publicly\navailable at https://github.com/gcouairon/ArchesWeather.\n","authors":["Guillaume Couairon","Christian Lessig","Anastase Charantonis","Claire Monteleoni"],"pdf_url":"https://arxiv.org/pdf/2405.14527v2.pdf","comment":"Accepted at the Machine Learning for Earth System Modeling Workshop\n at ICML 2024"},{"id":"http://arxiv.org/abs/2407.03065v1","updated":"2024-07-03T12:36:24Z","published":"2024-07-03T12:36:24Z","title":"Warm-up Free Policy Optimization: Improved Regret in Linear Markov\n Decision Processes","summary":" Policy Optimization (PO) methods are among the most popular Reinforcement\nLearning (RL) algorithms in practice. Recently, Sherman et al. [2023a] proposed\na PO-based algorithm with rate-optimal regret guarantees under the linear\nMarkov Decision Process (MDP) model. However, their algorithm relies on a\ncostly pure exploration warm-up phase that is hard to implement in practice.\nThis paper eliminates this undesired warm-up phase, replacing it with a simple\nand efficient contraction mechanism. Our PO algorithm achieves rate-optimal\nregret with improved dependence on the other parameters of the problem (horizon\nand function approximation dimension) in two fundamental settings: adversarial\nlosses with full-information feedback and stochastic losses with bandit\nfeedback.\n","authors":["Asaf Cassel","Aviv Rosenberg"],"pdf_url":"https://arxiv.org/pdf/2407.03065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03059v1","updated":"2024-07-03T12:30:39Z","published":"2024-07-03T12:30:39Z","title":"FairJob: A Real-World Dataset for Fairness in Online Systems","summary":" We introduce a fairness-aware dataset for job recommendation in advertising,\ndesigned to foster research in algorithmic fairness within real-world\nscenarios. It was collected and prepared to comply with privacy standards and\nbusiness confidentiality. An additional challenge is the lack of access to\nprotected user attributes such as gender, for which we propose a solution to\nobtain a proxy estimate. Despite being anonymized and including a proxy for a\nsensitive attribute, our dataset preserves predictive power and maintains a\nrealistic and challenging benchmark. This dataset addresses a significant gap\nin the availability of fairness-focused resources for high-impact domains like\nadvertising -- the actual impact being having access or not to precious\nemployment opportunities, where balancing fairness and utility is a common\nindustrial challenge. We also explore various stages in the advertising process\nwhere unfairness can occur and introduce a method to compute a fair utility\nmetric for the job recommendations in online systems case from a biased\ndataset. 
Experimental evaluations of bias mitigation techniques on the released\ndataset demonstrate potential improvements in fairness and the associated\ntrade-offs with utility.\n","authors":["Mariia Vladimirova","Federico Pavone","Eustache Diemert"],"pdf_url":"https://arxiv.org/pdf/2407.03059v1.pdf","comment":"24 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.03056v1","updated":"2024-07-03T12:24:40Z","published":"2024-07-03T12:24:40Z","title":"Improving Zero-shot Generalization of Learned Prompts via Unsupervised\n Knowledge Distillation","summary":" Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization\nto unseen tasks, but fall short of the performance of supervised methods in\ngeneralizing to downstream tasks with limited data. Prompt learning is emerging\nas a parameter-efficient method for adapting VLMs, but state-of-the-art\napproaches require annotated samples. In this paper we propose a novel approach\nto prompt learning based on unsupervised knowledge distillation from more\npowerful models. Our approach, which we call Knowledge Distillation Prompt\nLearning (KDPL), can be integrated into existing prompt learning techniques and\neliminates the need for labeled examples during adaptation. Our experiments on\nmore than ten standard benchmark datasets demonstrate that KDPL is very\neffective at improving generalization of learned prompts for zero-shot domain\ngeneralization, zero-shot cross-dataset generalization, and zero-shot\nbase-to-novel class generalization problems. KDPL requires no ground-truth\nlabels for adaptation, and moreover we show that even in the absence of any\nknowledge of training class names it can be used to effectively transfer\nknowledge. The code is publicly available at https://github.com/miccunifi/KDPL.\n","authors":["Marco Mistretta","Alberto Baldrati","Marco Bertini","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2407.03056v1.pdf","comment":"Accepted for publication at ECCV24"},{"id":"http://arxiv.org/abs/2402.17506v2","updated":"2024-07-03T12:13:21Z","published":"2024-02-27T13:46:45Z","title":"Thermodynamics-informed super-resolution of scarce temporal dynamics\n data","summary":" We present a method to increase the resolution of measurements of a physical\nsystem and subsequently predict its time evolution using thermodynamics-aware\nneural networks. Our method uses adversarial autoencoders, which reduce the\ndimensionality of the full order model to a set of latent variables that are\nenforced to match a prior, for example a normal distribution. Adversarial\nautoencoders are seen as generative models, and they can be trained to generate\nhigh-resolution samples from low-resolution inputs, meaning they can address the\nso-called super-resolution problem. Then, a second neural network is trained to\nlearn the physical structure of the latent variables and predict their temporal\nevolution. This neural network is known as a structure-preserving neural\nnetwork. It learns the metriplectic structure of the system and applies a\nphysical bias to ensure that the first and second principles of thermodynamics\nare fulfilled. The integrated trajectories are decoded to their original\ndimensionality, as well as to the higher dimensionality space produced by the\nadversarial autoencoder and they are compared to the ground truth solution. 
The\nmethod is tested with two examples of flow over a cylinder, where the fluid\nproperties are varied between both examples.\n","authors":["Carlos Bermejo-Barbanoj","Beatriz Moya","Alberto Badías","Francisco Chinesta","Elías Cueto"],"pdf_url":"https://arxiv.org/pdf/2402.17506v2.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2407.03045v1","updated":"2024-07-03T12:10:41Z","published":"2024-07-03T12:10:41Z","title":"JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts\n Discovery from Large-Scale Human-LLM Conversational Datasets","summary":" Large Language Models (LLMs) have gained significant attention but also\nraised concerns due to the risk of misuse. Jailbreak prompts, a popular type of\nadversarial attack towards LLMs, have appeared and constantly evolved to breach\nthe safety protocols of LLMs. To address this issue, LLMs are regularly updated\nwith safety patches based on reported jailbreak prompts. However, malicious\nusers often keep their successful jailbreak prompts private to exploit LLMs. To\nuncover these private jailbreak prompts, extensive analysis of large-scale\nconversational datasets is necessary to identify prompts that still manage to\nbypass the system's defenses. This task is highly challenging due to the\nimmense volume of conversation data, diverse characteristics of jailbreak\nprompts, and their presence in complex multi-turn conversations. To tackle\nthese challenges, we introduce JailbreakHunter, a visual analytics approach for\nidentifying jailbreak prompts in large-scale human-LLM conversational datasets.\nWe have designed a workflow with three analysis levels: group-level,\nconversation-level, and turn-level. Group-level analysis enables users to grasp\nthe distribution of conversations and identify suspicious conversations using\nmultiple criteria, such as similarity with reported jailbreak prompts in\nprevious research and attack success rates. Conversation-level analysis\nfacilitates the understanding of the progress of conversations and helps\ndiscover jailbreak prompts within their conversation contexts. Turn-level\nanalysis allows users to explore the semantic similarity and token overlap\nbetween a singleturn prompt and the reported jailbreak prompts, aiding in the\nidentification of new jailbreak strategies. The effectiveness and usability of\nthe system were verified through multiple case studies and expert interviews.\n","authors":["Zhihua Jin","Shiyi Liu","Haotian Li","Xun Zhao","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2407.03045v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.03038v1","updated":"2024-07-03T12:02:24Z","published":"2024-07-03T12:02:24Z","title":"On the Client Preference of LLM Fine-tuning in Federated Learning","summary":" Reinforcement learning with human feedback (RLHF) fine-tunes a pretrained\nlarge language model (LLM) using preference datasets, enabling the LLM to\ngenerate outputs that align with human preferences. Given the sensitive nature\nof these preference datasets held by various clients, there is a need to\nimplement RLHF within a federated learning (FL) framework, where clients are\nreluctant to share their data due to privacy concerns. To address this, we\nintroduce a feasible framework in which clients collaboratively train a binary\nselector with their preference datasets using our proposed FedBis. With a\nwell-trained selector, we can further enhance the LLM that generates\nhuman-preferred completions. 
Meanwhile, we propose a novel algorithm,\nFedBiscuit, that trains multiple selectors by organizing clients into balanced\nand disjoint clusters based on their preferences. Compared to the FedBis,\nFedBiscuit demonstrates superior performance in simulating human preferences\nfor pairwise completions. Our extensive experiments on federated human\npreference datasets -- marking the first benchmark to address heterogeneous\ndata partitioning among clients -- demonstrate that FedBiscuit outperforms\nFedBis and even surpasses traditional centralized training.\n","authors":["Feijie Wu","Xiaoze Liu","Haoyu Wang","Xingchen Wang","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2407.03038v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.13900v2","updated":"2024-07-03T11:58:22Z","published":"2023-08-26T15:02:00Z","title":"Semi-Supervised Semantic Segmentation via Marginal Contextual\n Information","summary":" We present a novel confidence refinement scheme that enhances pseudo labels\nin semi-supervised semantic segmentation. Unlike existing methods, which filter\npixels with low-confidence predictions in isolation, our approach leverages the\nspatial correlation of labels in segmentation maps by grouping neighboring\npixels and considering their pseudo labels collectively. With this contextual\ninformation, our method, named S4MC, increases the amount of unlabeled data\nused during training while maintaining the quality of the pseudo labels, all\nwith negligible computational overhead. Through extensive experiments on\nstandard benchmarks, we demonstrate that S4MC outperforms existing\nstate-of-the-art semi-supervised learning approaches, offering a promising\nsolution for reducing the cost of acquiring dense annotations. For example,\nS4MC achieves a 1.39 mIoU improvement over the prior art on PASCAL VOC 12 with\n366 annotated images. The code to reproduce our experiments is available at\nhttps://s4mcontext.github.io/\n","authors":["Moshe Kimhi","Shai Kimhi","Evgenii Zheltonozhskii","Or Litany","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2308.13900v2.pdf","comment":"Published at TMLR"},{"id":"http://arxiv.org/abs/2312.12616v3","updated":"2024-07-03T11:47:36Z","published":"2023-12-19T21:45:38Z","title":"Online Variational Sequential Monte Carlo","summary":" Being the most classical generative model for serial data, state-space models\n(SSM) are fundamental in AI and statistical machine learning. In SSM, any form\nof parameter learning or latent state inference typically involves the\ncomputation of complex latent-state posteriors. In this work, we build upon the\nvariational sequential Monte Carlo (VSMC) method, which provides\ncomputationally efficient and accurate model parameter estimation and Bayesian\nlatent-state inference by combining particle methods and variational inference.\nWhile standard VSMC operates in the offline mode, by re-processing repeatedly a\ngiven batch of data, we distribute the approximation of the gradient of the\nVSMC surrogate ELBO in time using stochastic approximation, allowing for online\nlearning in the presence of streams of data. This results in an algorithm,\nonline VSMC, that is capable of performing efficiently, entirely on-the-fly,\nboth parameter estimation and particle proposal adaptation. 
In addition, we\nprovide rigorous theoretical results describing the algorithm's convergence\nproperties as the number of data tends to infinity as well as numerical\nillustrations of its excellent convergence properties and usefulness also in\nbatch-processing settings.\n","authors":["Alessandro Mastrototaro","Jimmy Olsson"],"pdf_url":"https://arxiv.org/pdf/2312.12616v3.pdf","comment":"In this version there are better explanatory figures for the\n simulations in Section 5, and some text improvements/typos fixed"},{"id":"http://arxiv.org/abs/2308.14507v3","updated":"2024-07-03T11:43:58Z","published":"2023-08-28T11:49:23Z","title":"Spectral Estimators for Structured Generalized Linear Models via\n Approximate Message Passing","summary":" We consider the problem of parameter estimation in a high-dimensional\ngeneralized linear model. Spectral methods obtained via the principal\neigenvector of a suitable data-dependent matrix provide a simple yet\nsurprisingly effective solution. However, despite their wide use, a rigorous\nperformance characterization, as well as a principled way to preprocess the\ndata, are available only for unstructured (i.i.d.\\ Gaussian and Haar\northogonal) designs. In contrast, real-world data matrices are highly\nstructured and exhibit non-trivial correlations. To address the problem, we\nconsider correlated Gaussian designs capturing the anisotropic nature of the\nfeatures via a covariance matrix $\\Sigma$. Our main result is a precise\nasymptotic characterization of the performance of spectral estimators. This\nallows us to identify the optimal preprocessing that minimizes the number of\nsamples needed for parameter estimation. Surprisingly, such preprocessing is\nuniversal across a broad set of designs, which partly addresses a conjecture on\noptimal spectral estimators for rotationally invariant models. Our principled\napproach vastly improves upon previous heuristic methods, including for designs\ncommon in computational imaging and genetics. The proposed methodology, based\non approximate message passing, is broadly applicable and opens the way to the\nprecise characterization of spiked matrices and of the corresponding spectral\nmethods in a variety of settings.\n","authors":["Yihan Zhang","Hong Chang Ji","Ramji Venkataramanan","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2308.14507v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01781v2","updated":"2024-07-03T11:20:43Z","published":"2024-02-01T19:12:25Z","title":"When Benchmarks are Targets: Revealing the Sensitivity of Large Language\n Model Leaderboards","summary":" Large Language Model (LLM) leaderboards based on benchmark rankings are\nregularly used to guide practitioners in model selection. Often, the published\nleaderboard rankings are taken at face value - we show this is a (potentially\ncostly) mistake. Under existing leaderboards, the relative performance of LLMs\nis highly sensitive to (often minute) details. We show that for popular\nmultiple-choice question benchmarks (e.g., MMLU), minor perturbations to the\nbenchmark, such as changing the order of choices or the method of answer\nselection, result in changes in rankings up to 8 positions. We explain this\nphenomenon by conducting systematic experiments over three broad categories of\nbenchmark perturbations and identifying the sources of this behavior. Our\nanalysis results in several best-practice recommendations, including the\nadvantage of a hybrid scoring method for answer selection. 
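The spectral-estimator entry above builds a data-dependent matrix from preprocessed observations and takes its principal eigenvector. The following is only a generic sketch of that recipe on an i.i.d. Gaussian design with a naive preprocessing; the paper's contribution (correlated designs and the optimal preprocessing) is not reproduced here:

```python
# Illustrative sketch of a generic spectral estimator for a GLM (here a noisy
# phase-retrieval-style model). Build D = (1/n) * sum_i T(y_i) x_i x_i^T and
# take its principal eigenvector as the parameter estimate.
import numpy as np

rng = np.random.default_rng(1)
n, d = 5000, 50
beta = rng.normal(size=d)
beta /= np.linalg.norm(beta)
X = rng.normal(size=(n, d))
y = (X @ beta) ** 2 + 0.1 * rng.normal(size=n)    # observations depend on x only through x.beta

T = np.clip(y, 0, 5)                               # naive preprocessing of the observations
D = (X * T[:, None]).T @ X / n                     # data-dependent matrix
eigvals, eigvecs = np.linalg.eigh(D)
beta_hat = eigvecs[:, -1]                          # principal eigenvector

print("alignment |<beta_hat, beta>| =", abs(beta_hat @ beta))
```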
Our study highlights\nthe dangers of relying on simple benchmark evaluations and charts the path for\nmore robust evaluation schemes on the existing benchmarks. The code for this\npaper is available at\nhttps://github.com/National-Center-for-AI-Saudi-Arabia/lm-evaluation-harness.\n","authors":["Norah Alzahrani","Hisham Abdullah Alyahya","Yazeed Alnumay","Sultan Alrashed","Shaykhah Alsubaie","Yusef Almushaykeh","Faisal Mirza","Nouf Alotaibi","Nora Altwairesh","Areeb Alowisheq","M Saiful Bari","Haidar Khan"],"pdf_url":"https://arxiv.org/pdf/2402.01781v2.pdf","comment":"updated with ACL 2024 camera ready version"},{"id":"http://arxiv.org/abs/2407.01823v2","updated":"2024-07-03T11:09:00Z","published":"2024-07-01T21:45:27Z","title":"Meta-Learning Based Optimization for Large Scale Wireless Systems","summary":" Optimization algorithms for wireless systems play a fundamental role in\nimproving their performance and efficiency. However, it is known that the\ncomplexity of conventional optimization algorithms in the literature often\nexponentially increases with the number of transmit antennas and communication\nusers in the wireless system. Therefore, in the large scale regime, the\nastronomically large complexity of these optimization algorithms prohibits\ntheir use and prevents assessing large scale wireless systems performance under\noptimized conditions. To overcome this limitation, this work proposes instead\nthe use of an unsupervised meta-learning based approach to directly perform\nnon-convex optimization at significantly reduced complexity. To demonstrate the\neffectiveness of the proposed meta-learning based solution, the sum-rate (SR)\nmaximization problem for the following three emerging 6G technologies is\ncontemplated: hierarchical rate-splitting multiple access (H-RSMA), integrated\nsensing and communication (ISAC), and beyond-diagonal reconfigurable\nintelligent surfaces (BD-RIS). Through numerical results, it is demonstrated\nthat the proposed meta-learning based optimization framework is able to\nsuccessfully optimize the performance and also reveal unknown aspects of the\noperation in the large scale regime for the considered three 6G technologies.\n","authors":["Rafael Cerna Loli","Bruno Clerckx"],"pdf_url":"https://arxiv.org/pdf/2407.01823v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00285v2","updated":"2024-07-03T10:51:56Z","published":"2023-11-01T04:36:18Z","title":"Mixture-of-Experts for Open Set Domain Adaptation: A Dual-Space\n Detection Approach","summary":" Open Set Domain Adaptation (OSDA) aims to cope with the distribution and\nlabel shifts between the source and target domains simultaneously, performing\naccurate classification for known classes while identifying unknown class\nsamples in the target domain. Most existing OSDA approaches, depending on the\nfinal image feature space of deep models, require manually-tuned thresholds,\nand may easily misclassify unknown samples as known classes. Mixture-of-Experts\n(MoE) could be a remedy. Within a MoE, different experts handle distinct input\nfeatures, producing unique expert routing patterns for various classes in a\nrouting feature space. As a result, unknown class samples may display different\nexpert routing patterns to known classes. In this paper, we propose Dual-Space\nDetection, which exploits the inconsistencies between the image feature space\nand the routing feature space to detect unknown class samples without any\nthreshold. 
Graph Router is further introduced to better make use of the spatial\ninformation among image patches. Experiments on three different datasets\nvalidated the effectiveness and superiority of our approach.\n","authors":["Zhenbang Du","Jiayu An","Yunlu Tu","Jiahao Hong","Dongrui Wu"],"pdf_url":"https://arxiv.org/pdf/2311.00285v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.02987v1","updated":"2024-07-03T10:38:40Z","published":"2024-07-03T10:38:40Z","title":"LoRA-Guard: Parameter-Efficient Guardrail Adaptation for Content\n Moderation of Large Language Models","summary":" Guardrails have emerged as an alternative to safety alignment for content\nmoderation of large language models (LLMs). Existing model-based guardrails\nhave not been designed for resource-constrained computational portable devices,\nsuch as mobile phones, more and more of which are running LLM-based\napplications locally. We introduce LoRA-Guard, a parameter-efficient guardrail\nadaptation method that relies on knowledge sharing between LLMs and guardrail\nmodels. LoRA-Guard extracts language features from the LLMs and adapts them for\nthe content moderation task using low-rank adapters, while a dual-path design\nprevents any performance degradation on the generative task. We show that\nLoRA-Guard outperforms existing approaches with 100-1000x lower parameter\noverhead while maintaining accuracy, enabling on-device content moderation.\n","authors":["Hayder Elesedy","Pedro M. Esperança","Silviu Vlad Oprea","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2407.02987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09684v2","updated":"2024-07-03T10:32:51Z","published":"2024-06-14T03:11:01Z","title":"Explainable AI for Comparative Analysis of Intrusion Detection Models","summary":" Explainable Artificial Intelligence (XAI) has become a widely discussed\ntopic, the related technologies facilitate better understanding of conventional\nblack-box models like Random Forest, Neural Networks and etc. However,\ndomain-specific applications of XAI are still insufficient. To fill this gap,\nthis research analyzes various machine learning models to the tasks of binary\nand multi-class classification for intrusion detection from network traffic on\nthe same dataset using occlusion sensitivity. The models evaluated include\nLinear Regression, Logistic Regression, Linear Support Vector Machine (SVM),\nK-Nearest Neighbors (KNN), Random Forest, Decision Trees, and Multi-Layer\nPerceptrons (MLP). We trained all models to the accuracy of 90\\% on the\nUNSW-NB15 Dataset. We found that most classifiers leverage only less than three\ncritical features to achieve such accuracies, indicating that effective feature\nengineering could actually be far more important for intrusion detection than\napplying complicated models. We also discover that Random Forest provides the\nbest performance in terms of accuracy, time efficiency and robustness. Data and\ncode available at https://github.com/pcwhy/XML-IntrusionDetection.git\n","authors":["Pap M. 
Corea","Yongxin Liu","Jian Wang","Shuteng Niu","Houbing Song"],"pdf_url":"https://arxiv.org/pdf/2406.09684v2.pdf","comment":"Submitted to IEEE MeditCom 2024 - WS-05"},{"id":"http://arxiv.org/abs/2407.02984v1","updated":"2024-07-03T10:31:30Z","published":"2024-07-03T10:31:30Z","title":"Semantically Rich Local Dataset Generation for Explainable AI in\n Genomics","summary":" Black box deep learning models trained on genomic sequences excel at\npredicting the outcomes of different gene regulatory mechanisms. Therefore,\ninterpreting these models may provide novel insights into the underlying\nbiology, supporting downstream biomedical applications. Due to their\ncomplexity, interpretable surrogate models can only be built for local\nexplanations (e.g., a single instance). However, accomplishing this requires\ngenerating a dataset in the neighborhood of the input, which must maintain\nsyntactic similarity to the original data while introducing semantic\nvariability in the model's predictions. This task is challenging due to the\ncomplex sequence-to-function relationship of DNA.\n We propose using Genetic Programming to generate datasets by evolving\nperturbations in sequences that contribute to their semantic diversity. Our\ncustom, domain-guided individual representation effectively constrains\nsyntactic similarity, and we provide two alternative fitness functions that\npromote diversity with no computational effort. Applied to the RNA splicing\ndomain, our approach quickly achieves good diversity and significantly\noutperforms a random baseline in exploring the search space, as shown by our\nproof-of-concept, short RNA sequence. Furthermore, we assess its\ngeneralizability and demonstrate scalability to larger sequences, resulting in\na $\\approx$30\\% improvement over the baseline.\n","authors":["Pedro Barbosa","Rosina Savisaar","Alcides Fonseca"],"pdf_url":"https://arxiv.org/pdf/2407.02984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15512v2","updated":"2024-07-03T10:23:01Z","published":"2024-05-24T12:56:18Z","title":"ChatGPT Code Detection: Techniques for Uncovering the Source of Code","summary":" In recent times, large language models (LLMs) have made significant strides\nin generating computer code, blurring the lines between code created by humans\nand code produced by artificial intelligence (AI). As these technologies evolve\nrapidly, it is crucial to explore how they influence code generation,\nespecially given the risk of misuse in areas like higher education. This paper\nexplores this issue by using advanced classification techniques to\ndifferentiate between code written by humans and that generated by ChatGPT, a\ntype of LLM. We employ a new approach that combines powerful embedding features\n(black-box) with supervised learning algorithms - including Deep Neural\nNetworks, Random Forests, and Extreme Gradient Boosting - to achieve this\ndifferentiation with an impressive accuracy of 98%. For the successful\ncombinations, we also examine their model calibration, showing that some of the\nmodels are extremely well calibrated. Additionally, we present white-box\nfeatures and an interpretable Bayes classifier to elucidate critical\ndifferences between the code sources, enhancing the explainability and\ntransparency of our approach. Both approaches work well but provide at most\n85-88% accuracy. We also show that untrained humans solve the same task not\nbetter than random guessing. 
This study is crucial in understanding and\nmitigating the potential risks associated with using AI in code generation,\nparticularly in the context of higher education, software development, and\ncompetitive programming.\n","authors":["Marc Oedingen","Raphael C. Engelhardt","Robin Denz","Maximilian Hammer","Wolfgang Konen"],"pdf_url":"https://arxiv.org/pdf/2405.15512v2.pdf","comment":"Accepted for publication in MDPI AI Journal"},{"id":"http://arxiv.org/abs/2405.18925v2","updated":"2024-07-03T10:22:04Z","published":"2024-05-29T09:29:39Z","title":"Federated Continual Learning Goes Online: Leveraging Uncertainty for\n Modality-Agnostic Class-Incremental Learning","summary":" Given the ability to model more realistic and dynamic problems, Federated\nContinual Learning (FCL) has been increasingly investigated recently. A\nwell-known problem encountered in this setting is the so-called catastrophic\nforgetting, for which the learning model is inclined to focus on more recent\ntasks while forgetting the previously learned knowledge. The majority of the\ncurrent approaches in FCL propose generative-based solutions to solve said\nproblem. However, this setting requires multiple training epochs over the data,\nimplying an offline setting where datasets are stored locally and remain\nunchanged over time. Furthermore, the proposed solutions are tailored for\nvision tasks solely. To overcome these limitations, we propose a new\nmodality-agnostic approach to deal with the online scenario where new data\narrive in streams of mini-batches that can only be processed once. To solve\ncatastrophic forgetting, we propose an uncertainty-aware memory-based approach.\nIn particular, we suggest using an estimator based on the Bregman Information\n(BI) to compute the model's variance at the sample level. Through measures of\npredictive uncertainty, we retrieve samples with specific characteristics, and\n- by retraining the model on such samples - we demonstrate the potential of\nthis approach to reduce the forgetting effect in realistic settings.\n","authors":["Giuseppe Serra","Florian Buettner"],"pdf_url":"https://arxiv.org/pdf/2405.18925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02974v1","updated":"2024-07-03T10:14:33Z","published":"2024-07-03T10:14:33Z","title":"IM-MoCo: Self-supervised MRI Motion Correction using Motion-Guided\n Implicit Neural Representations","summary":" Motion artifacts in Magnetic Resonance Imaging (MRI) arise due to relatively\nlong acquisition times and can compromise the clinical utility of acquired\nimages. Traditional motion correction methods often fail to address severe\nmotion, leading to distorted and unreliable results. Deep Learning (DL)\nalleviated such pitfalls through generalization with the cost of vanishing\nstructures and hallucinations, making it challenging to apply in the medical\nfield where hallucinated structures can tremendously impact the diagnostic\noutcome. In this work, we present an instance-wise motion correction pipeline\nthat leverages motion-guided Implicit Neural Representations (INRs) to mitigate\nthe impact of motion artifacts while retaining anatomical structure. Our method\nis evaluated using the NYU fastMRI dataset with different degrees of simulated\nmotion severity. For the correction alone, we can improve over state-of-the-art\nimage reconstruction methods by $+5\\%$ SSIM, $+5\\:db$ PSNR, and $+14\\%$\nHaarPSI. 
Clinical relevance is demonstrated by a subsequent experiment, where\nour method improves classification outcomes by at least $+1.5$ accuracy\npercentage points compared to motion-corrupted images.\n","authors":["Ziad Al-Haj Hemidi","Christian Weihsbach","Mattias P. Heinrich"],"pdf_url":"https://arxiv.org/pdf/2407.02974v1.pdf","comment":"Submitted to MICCAI 2024 (Before peer review version)"},{"id":"http://arxiv.org/abs/2403.03777v3","updated":"2024-07-03T10:02:39Z","published":"2024-03-06T15:15:42Z","title":"ENOT: Expectile Regularization for Fast and Accurate Training of Neural\n Optimal Transport","summary":" We present a new approach for Neural Optimal Transport (NOT) training\nprocedure, capable of accurately and efficiently estimating optimal\ntransportation plan via specific regularization on dual Kantorovich potentials.\nThe main bottleneck of existing NOT solvers is associated with the procedure of\nfinding a near-exact approximation of the conjugate operator (i.e., the\nc-transform), which is done either by optimizing over non-convex max-min\nobjectives or by the computationally intensive fine-tuning of the initial\napproximated prediction. We resolve both issues by proposing a new,\ntheoretically justified loss in the form of expectile regularisation which\nenforces binding conditions on the learning process of dual potentials. Such a\nregularization provides the upper bound estimation over the distribution of\npossible conjugate potentials and makes the learning stable, completely\neliminating the need for additional extensive fine-tuning. Proposed method,\ncalled Expectile-Regularised Neural Optimal Transport (ENOT), outperforms\nprevious state-of-the-art approaches on the established Wasserstein-2 benchmark\ntasks by a large margin (up to a 3-fold improvement in quality and up to a\n10-fold improvement in runtime). Moreover, we showcase performance of ENOT for\nvarying cost functions on different tasks such as image generation, showing\nrobustness of proposed algorithm. OTT-JAX library includes our implementation\nof ENOT algorithm https://ott-jax.readthedocs.io/en/latest/tutorials/ENOT.html\n","authors":["Nazar Buzun","Maksim Bobrin","Dmitry V. Dylov"],"pdf_url":"https://arxiv.org/pdf/2403.03777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02961v1","updated":"2024-07-03T09:54:58Z","published":"2024-07-03T09:54:58Z","title":"Towards a Scalable Reference-Free Evaluation of Generative Models","summary":" While standard evaluation scores for generative models are mostly\nreference-based, a reference-dependent assessment of generative models could be\ngenerally difficult due to the unavailability of applicable reference datasets.\nRecently, the reference-free entropy scores, VENDI and RKE, have been proposed\nto evaluate the diversity of generated data. However, estimating these scores\nfrom data leads to significant computational costs for large-scale generative\nmodels. In this work, we leverage the random Fourier features framework to\nreduce the computational price and propose the Fourier-based Kernel Entropy\nApproximation (FKEA) method. We utilize FKEA's approximated eigenspectrum of\nthe kernel matrix to efficiently estimate the mentioned entropy scores.\nFurthermore, we show the application of FKEA's proxy eigenvectors to reveal the\nmethod's identified modes in evaluating the diversity of produced samples. We\nprovide a stochastic implementation of the FKEA assessment algorithm with a\ncomplexity $O(n)$ linearly growing with sample size $n$. 
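The FKEA entry above approximates a kernel matrix's eigenspectrum with random Fourier features to compute entropy-based diversity scores in O(n). A rough sketch of that idea follows; parameter choices are illustrative and this is not the released FKEA implementation:

```python
# Approximate a Gaussian-kernel eigenspectrum with random Fourier features and
# compute a VENDI-style diversity score from the normalized eigenvalues.
import numpy as np

def rff_features(X, r=512, sigma=1.0, seed=0):
    """Random Fourier features approximating exp(-||x-y||^2 / (2 sigma^2))."""
    rng = np.random.default_rng(seed)
    W = rng.normal(scale=1.0 / sigma, size=(X.shape[1], r))
    b = rng.uniform(0.0, 2.0 * np.pi, size=r)
    return np.sqrt(2.0 / r) * np.cos(X @ W + b)

def vendi_like_score(X, r=512, sigma=1.0):
    phi = rff_features(X, r, sigma)
    C = phi.T @ phi / X.shape[0]        # r x r; shares nonzero eigenvalues with K/n
    lam = np.clip(np.linalg.eigvalsh(C), 0.0, None)
    lam /= lam.sum()                    # normalize to a probability vector
    entropy = -(lam[lam > 0] * np.log(lam[lam > 0])).sum()
    return np.exp(entropy)              # exponential of the spectral entropy

X = np.random.default_rng(2).normal(size=(2000, 32))   # stand-in for sample embeddings
print("diversity score:", vendi_like_score(X))
```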
We extensively\nevaluate FKEA's numerical performance in application to standard image, text,\nand video datasets. Our empirical results indicate the method's scalability and\ninterpretability applied to large-scale generative models. The codebase is\navailable at https://github.com/aziksh-ospanov/FKEA.\n","authors":["Azim Ospanov","Jingwei Zhang","Mohammad Jalali","Xuenan Cao","Andrej Bogdanov","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2407.02961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02960v1","updated":"2024-07-03T09:54:08Z","published":"2024-07-03T09:54:08Z","title":"ObfuscaTune: Obfuscated Offsite Fine-tuning and Inference of Proprietary\n LLMs on Private Datasets","summary":" This work addresses the timely yet underexplored problem of performing\ninference and finetuning of a proprietary LLM owned by a model provider entity\non the confidential/private data of another data owner entity, in a way that\nensures the confidentiality of both the model and the data. Hereby, the\nfinetuning is conducted offsite, i.e., on the computation infrastructure of a\nthird-party cloud provider. We tackle this problem by proposing ObfuscaTune, a\nnovel, efficient and fully utility-preserving approach that combines a simple\nyet effective obfuscation technique with an efficient usage of confidential\ncomputing (only 5% of the model parameters are placed on TEE). We empirically\ndemonstrate the effectiveness of ObfuscaTune by validating it on GPT-2 models\nwith different sizes on four NLP benchmark datasets. Finally, we compare to a\nna\\\"ive version of our approach to highlight the necessity of using random\nmatrices with low condition numbers in our approach to reduce errors induced by\nthe obfuscation.\n","authors":["Ahmed Frikha","Nassim Walha","Ricardo Mendes","Krishna Kanth Nakka","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02960v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.02956v1","updated":"2024-07-03T09:49:03Z","published":"2024-07-03T09:49:03Z","title":"IncogniText: Privacy-enhancing Conditional Text Anonymization via\n LLM-based Private Attribute Randomization","summary":" In this work, we address the problem of text anonymization where the goal is\nto prevent adversaries from correctly inferring private attributes of the\nauthor, while keeping the text utility, i.e., meaning and semantics. We propose\nIncogniText, a technique that anonymizes the text to mislead a potential\nadversary into predicting a wrong private attribute value. Our empirical\nevaluation shows a reduction of private attribute leakage by more than 90%.\nFinally, we demonstrate the maturity of IncogniText for real-world applications\nby distilling its anonymization capability into a set of LoRA parameters\nassociated with an on-device model.\n","authors":["Ahmed Frikha","Nassim Walha","Krishna Kanth Nakka","Ricardo Mendes","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02956v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.15935v4","updated":"2024-07-03T09:28:50Z","published":"2024-01-29T07:50:28Z","title":"MLEM: Generative and Contrastive Learning as Distinct Modalities for\n Event Sequences","summary":" This study explores the application of self-supervised learning techniques\nfor event sequences. It is a key modality in various applications such as\nbanking, e-commerce, and healthcare. 
However, there is limited research on\nself-supervised learning for event sequences, and methods from other domains\nlike images, texts, and speech may not easily transfer. To determine the most\nsuitable approach, we conduct a detailed comparative analysis of previously\nidentified best-performing methods. We find that neither the contrastive nor\ngenerative method is superior. Our assessment includes classifying event\nsequences, predicting the next event, and evaluating embedding quality. These\nresults further highlight the potential benefits of combining both methods.\nGiven the lack of research on hybrid models in this domain, we initially adapt\nthe baseline model from another domain. However, upon observing its\nunderperformance, we develop a novel method called the Multimodal-Learning\nEvent Model (MLEM). MLEM treats contrastive learning and generative modeling as\ndistinct yet complementary modalities, aligning their embeddings. The results\nof our study demonstrate that combining contrastive and generative approaches\ninto one procedure with MLEM achieves superior performance across multiple\nmetrics.\n","authors":["Viktor Moskvoretskii","Dmitry Osin","Egor Shvetsov","Igor Udovichenko","Maxim Zhelnin","Andrey Dukhovny","Anna Zhimerikina","Evgeny Burnaev"],"pdf_url":"https://arxiv.org/pdf/2401.15935v4.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.02943v1","updated":"2024-07-03T09:20:04Z","published":"2024-07-03T09:20:04Z","title":"PII-Compass: Guiding LLM training data extraction prompts towards the\n target PII via grounding","summary":" The latest and most impactful advances in large models stem from their\nincreased size. Unfortunately, this translates into an improved memorization\ncapacity, raising data privacy concerns. Specifically, it has been shown that\nmodels can output personal identifiable information (PII) contained in their\ntraining data. However, reported PII extraction performance varies widely, and\nthere is no consensus on the optimal methodology to evaluate this risk,\nresulting in underestimating realistic adversaries. In this work, we\nempirically demonstrate that it is possible to improve the extractability of\nPII by over ten-fold by grounding the prefix of the manually constructed\nextraction prompt with in-domain data. Our approach, PII-Compass, achieves\nphone number extraction rates of 0.92%, 3.9%, and 6.86% with 1, 128, and 2308\nqueries, respectively, i.e., the phone number of 1 person in 15 is extractable.\n","authors":["Krishna Kanth Nakka","Ahmed Frikha","Ricardo Mendes","Xue Jiang","Xuebing Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.02943v1.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2312.13910v2","updated":"2024-07-03T08:54:58Z","published":"2023-12-21T14:55:21Z","title":"Multi-Agent Probabilistic Ensembles with Trajectory Sampling for\n Connected Autonomous Vehicles","summary":" Autonomous Vehicles (AVs) have attracted significant attention in recent\nyears and Reinforcement Learning (RL) has shown remarkable performance in\nimproving the autonomy of vehicles. In that regard, the widely adopted\nModel-Free RL (MFRL) promises to solve decision-making tasks in connected AVs\n(CAVs), contingent on the readiness of a significant amount of data samples for\ntraining. Nevertheless, it might be infeasible in practice and possibly lead to\nlearning instability. 
In contrast, Model-Based RL (MBRL) manifests itself in\nsample-efficient learning, but the asymptotic performance of MBRL might lag\nbehind the state-of-the-art MFRL algorithms. Furthermore, most studies for CAVs\nare limited to the decision-making of a single AV only, thus underscoring the\nperformance due to the absence of communications. In this study, we try to\naddress the decision-making problem of multiple CAVs with limited\ncommunications and propose a decentralized Multi-Agent Probabilistic Ensembles\nwith Trajectory Sampling algorithm MA-PETS. In particular, in order to better\ncapture the uncertainty of the unknown environment, MA-PETS leverages\nProbabilistic Ensemble (PE) neural networks to learn from communicated samples\namong neighboring CAVs. Afterwards, MA-PETS capably develops Trajectory\nSampling (TS)-based model-predictive control for decision-making. On this\nbasis, we derive the multi-agent group regret bound affected by the number of\nagents within the communication range and mathematically validate that\nincorporating effective information exchange among agents into the multi-agent\nlearning scheme contributes to reducing the group regret bound in the worst\ncase. Finally, we empirically demonstrate the superiority of MA-PETS in terms\nof the sample efficiency comparable to MFBL.\n","authors":["Ruoqi Wen","Jiahao Huang","Rongpeng Li","Guoru Ding","Zhifeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.13910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02914v1","updated":"2024-07-03T08:45:17Z","published":"2024-07-03T08:45:17Z","title":"The More the Merrier? Navigating Accuracy vs. Energy Efficiency Design\n Trade-Offs in Ensemble Learning Systems","summary":" Background: Machine learning (ML) model composition is a popular technique to\nmitigate shortcomings of a single ML model and to design more effective\nML-enabled systems. While ensemble learning, i.e., forwarding the same request\nto several models and fusing their predictions, has been studied extensively\nfor accuracy, we have insufficient knowledge about how to design\nenergy-efficient ensembles. Objective: We therefore analyzed three types of\ndesign decisions for ensemble learning regarding a potential trade-off between\naccuracy and energy consumption: a) ensemble size, i.e., the number of models\nin the ensemble, b) fusion methods (majority voting vs. a meta-model), and c)\npartitioning methods (whole-dataset vs. subset-based training). Methods: By\ncombining four popular ML algorithms for classification in different ensembles,\nwe conducted a full factorial experiment with 11 ensembles x 4 datasets x 2\nfusion methods x 2 partitioning methods (176 combinations). For each\ncombination, we measured accuracy (F1-score) and energy consumption in J (for\nboth training and inference). Results: While a larger ensemble size\nsignificantly increased energy consumption (size 2 ensembles consumed 37.49%\nless energy than size 3 ensembles, which in turn consumed 26.96% less energy\nthan the size 4 ensembles), it did not significantly increase accuracy.\nFurthermore, majority voting outperformed meta-model fusion both in terms of\naccuracy (Cohen's d of 0.38) and energy consumption (Cohen's d of 0.92).\nLastly, subset-based training led to significantly lower energy consumption\n(Cohen's d of 0.91), while training on the whole dataset did not increase\naccuracy significantly. 
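The ensemble-learning entry above compares two fusion designs: majority voting and a meta-model. A minimal sketch of those two designs over the same base learners follows; energy measurement is out of scope here, and the dataset and hyperparameters are illustrative:

```python
# Majority voting vs. meta-model (stacking) fusion over identical base learners.
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=3000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

base = [("dt", DecisionTreeClassifier(random_state=0)),
        ("nb", GaussianNB()),
        ("knn", KNeighborsClassifier())]

voting = VotingClassifier(estimators=base, voting="hard").fit(X_tr, y_tr)
stacking = StackingClassifier(estimators=base,
                              final_estimator=LogisticRegression()).fit(X_tr, y_tr)

print("majority voting:", voting.score(X_te, y_te))
print("meta-model fusion:", stacking.score(X_te, y_te))
```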
Conclusions: From a Green AI perspective, we recommend\ndesigning ensembles of small size (2 or maximum 3 models), using subset-based\ntraining, majority voting, and energy-efficient ML algorithms like decision\ntrees, Naive Bayes, or KNN.\n","authors":["Rafiullah Omar","Justus Bogner","Henry Muccini","Patricia Lago","Silverio Martínez-Fernández","Xavier Franch"],"pdf_url":"https://arxiv.org/pdf/2407.02914v1.pdf","comment":"Currently under review at a journal"},{"id":"http://arxiv.org/abs/2407.02913v1","updated":"2024-07-03T08:38:14Z","published":"2024-07-03T08:38:14Z","title":"SFC: Achieve Accurate Fast Convolution under Low-precision Arithmetic","summary":" Fast convolution algorithms, including Winograd and FFT, can efficiently\naccelerate convolution operations in deep models. However, these algorithms\ndepend on high-precision arithmetic to maintain inference accuracy, which\nconflicts with the model quantization. To resolve this conflict and further\nimprove the efficiency of quantized convolution, we propose SFC, a new algebra\ntransform for fast convolution by extending the Discrete Fourier Transform\n(DFT) with symbolic computing, in which only additions are required to perform\nthe transformation at specific transform points, avoiding the calculation of\nirrational numbers and reducing the requirement for precision. Additionally, we\nenhance convolution efficiency by introducing correction terms to convert\ninvalid circular convolution outputs of the Fourier method into effective ones.\nThe numerical error analysis is presented for the first time in this type of\nwork and proves that our algorithms can provide a 3.68x multiplication\nreduction for 3x3 convolution, while the Winograd algorithm only achieves a\n2.25x reduction with similarly low numerical errors. Experiments carried out on\nbenchmarks and FPGA show that our new algorithms can further improve the\ncomputation efficiency of quantized models while maintaining accuracy,\nsurpassing both the quantization-alone method and existing works on fast\nconvolution quantization.\n","authors":["Liulu He","Yufei Zhao","Rui Gao","Yuan Du","Li Du"],"pdf_url":"https://arxiv.org/pdf/2407.02913v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.10086v2","updated":"2024-07-03T08:31:45Z","published":"2024-02-08T09:08:44Z","title":"Explainable AI for Safe and Trustworthy Autonomous Driving: A Systematic\n Review","summary":" Artificial Intelligence (AI) shows promising applications for the perception\nand planning tasks in autonomous driving (AD) due to its superior performance\ncompared to conventional methods. However, inscrutable AI systems exacerbate\nthe existing challenge of safety assurance of AD. One way to mitigate this\nchallenge is to utilize explainable AI (XAI) techniques. To this end, we\npresent the first comprehensive systematic literature review of explainable\nmethods for safe and trustworthy AD. We begin by analyzing the requirements for\nAI in the context of AD, focusing on three key aspects: data, model, and\nagency. We find that XAI is fundamental to meeting these requirements. Based on\nthis, we explain the sources of explanations in AI and describe a taxonomy of\nXAI. 
We then identify five key contributions of XAI for safe and trustworthy AI\nin AD, which are interpretable design, interpretable surrogate models,\ninterpretable monitoring, auxiliary explanations, and interpretable validation.\nFinally, we propose a modular framework called SafeX to integrate these\ncontributions, enabling explanation delivery to users while simultaneously\nensuring the safety of AI models.\n","authors":["Anton Kuznietsov","Balint Gyevnar","Cheng Wang","Steven Peters","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2402.10086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02904v1","updated":"2024-07-03T08:23:02Z","published":"2024-07-03T08:23:02Z","title":"The Shortcomings of Force-from-Motion in Robot Learning","summary":" Robotic manipulation requires accurate motion and physical interaction\ncontrol. However, current robot learning approaches focus on motion-centric\naction spaces that do not explicitly give the policy control over the\ninteraction. In this paper, we discuss the repercussions of this choice and\nargue for more interaction-explicit action spaces in robot learning.\n","authors":["Elie Aljalbout","Felix Frank","Patrick van der Smagt","Alexandros Paraschos"],"pdf_url":"https://arxiv.org/pdf/2407.02904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08173v2","updated":"2024-07-03T08:21:19Z","published":"2023-08-16T07:05:41Z","title":"Expressivity of Graph Neural Networks Through the Lens of Adversarial\n Robustness","summary":" We perform the first adversarial robustness study into Graph Neural Networks\n(GNNs) that are provably more powerful than traditional Message Passing Neural\nNetworks (MPNNs). In particular, we use adversarial robustness as a tool to\nuncover a significant gap between their theoretically possible and empirically\nachieved expressive power. To do so, we focus on the ability of GNNs to count\nspecific subgraph patterns, which is an established measure of expressivity,\nand extend the concept of adversarial robustness to this task. Based on this,\nwe develop efficient adversarial attacks for subgraph counting and show that\nmore powerful GNNs fail to generalize even to small perturbations to the\ngraph's structure. Expanding on this, we show that such architectures also fail\nto count substructures on out-of-distribution graphs.\n","authors":["Francesco Campi","Lukas Gosch","Tom Wollschläger","Yan Scholten","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2308.08173v2.pdf","comment":"Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$\n International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2407.02900v1","updated":"2024-07-03T08:20:27Z","published":"2024-07-03T08:20:27Z","title":"Self-supervised Vision Transformer are Scalable Generative Models for\n Domain Generalization","summary":" Despite notable advancements, the integration of deep learning (DL)\ntechniques into impactful clinical applications, particularly in the realm of\ndigital histopathology, has been hindered by challenges associated with\nachieving robust generalization across diverse imaging domains and\ncharacteristics. Traditional mitigation strategies in this field such as data\naugmentation and stain color normalization have proven insufficient in\naddressing this limitation, necessitating the exploration of alternative\nmethodologies. To this end, we propose a novel generative method for domain\ngeneralization in histopathology images. 
Our method employs a generative,\nself-supervised Vision Transformer to dynamically extract characteristics of\nimage patches and seamlessly infuse them into the original images, thereby\ncreating novel, synthetic images with diverse attributes. By enriching the\ndataset with such synthesized images, we aim to enhance its holistic nature,\nfacilitating improved generalization of DL models to unseen domains. Extensive\nexperiments conducted on two distinct histopathology datasets demonstrate the\neffectiveness of our proposed approach, outperforming the state of the art\nsubstantially, on the Camelyon17-wilds challenge dataset (+2%) and on a second\nepithelium-stroma dataset (+26%). Furthermore, we emphasize our method's\nability to readily scale with increasingly available unlabeled data samples and\nmore complex, higher parametric architectures. Source code is available at\nhttps://github.com/sdoerrich97/vits-are-generative-models .\n","authors":["Sebastian Doerrich","Francesco Di Salvo","Christian Ledig"],"pdf_url":"https://arxiv.org/pdf/2407.02900v1.pdf","comment":"Accepted at MICCAI 2024. This is the submitted manuscript with added\n link to github repo and funding acknowledgements. No further post submission\n improvements or corrections were integrated. Final version not published yet"},{"id":"http://arxiv.org/abs/2407.02891v1","updated":"2024-07-03T08:08:01Z","published":"2024-07-03T08:08:01Z","title":"GPTQT: Quantize Large Language Models Twice to Push the Efficiency","summary":" Due to their large size, generative Large Language Models (LLMs) require\nsignificant computing and storage resources. This paper introduces a new\npost-training quantization method, GPTQT, to reduce memory usage and enhance\nprocessing speed by expressing the weight of LLM in 3bit/2bit. Practice has\nshown that minimizing the quantization error of weights is ineffective, leading\nto overfitting. Therefore, GPTQT employs a progressive two-step approach:\ninitially quantizing weights using Linear quantization to a relatively high\nbit, followed by converting obtained int weight to lower bit binary coding. A\nre-explore strategy is proposed to optimize initial scaling factor. During\ninference, these steps are merged into pure binary coding, enabling efficient\ncomputation. Testing across various models and datasets confirms GPTQT's\neffectiveness. Compared to the strong 3-bit quantization baseline, GPTQT\nfurther reduces perplexity by 4.01 on opt-66B and increases speed by 1.24 times\non opt-30b. The results on Llama2 show that GPTQT is currently the best binary\ncoding quantization method for such kind of LLMs.\n","authors":["Yipin Guo","Yilin Lang","Qinyuan Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02891v1.pdf","comment":"Accepted by 11th IEEE International Conference on Cybernetics and\n Intelligent Systems"},{"id":"http://arxiv.org/abs/2210.01302v3","updated":"2024-07-03T08:06:56Z","published":"2022-10-04T01:40:31Z","title":"Nuisances via Negativa: Adjusting for Spurious Correlations via Data\n Augmentation","summary":" In prediction tasks, there exist features that are related to the label in\nthe same way across different settings for that task; these are semantic\nfeatures or semantics. Features with varying relationships to the label are\nnuisances. For example, in detecting cows from natural images, the shape of the\nhead is semantic but because images of cows often have grass backgrounds but\nnot always, the background is a nuisance. 
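The GPTQT entry above describes a progressive two-step quantization: linear quantization to a relatively high bit width, followed by re-encoding into lower-bit binary coding. The sketch below covers only a generic version of the first stage; the binary re-encoding and the re-explored scaling factor are not reproduced:

```python
# Plain asymmetric uniform (linear) quantization of a weight tensor, as a generic
# illustration of the first stage of a two-step scheme. Not the GPTQT implementation.
import numpy as np

def linear_quantize(w, bits=4):
    """Map floats to integers in [0, 2^bits - 1] with a shared scale and zero point."""
    qmax = 2 ** bits - 1
    w_min, w_max = w.min(), w.max()
    scale = (w_max - w_min) / qmax
    q = np.round((w - w_min) / scale).astype(np.int32)
    return q, scale, w_min

def dequantize(q, scale, zero_point):
    return q * scale + zero_point

w = np.random.default_rng(3).normal(size=(256, 256)).astype(np.float32)
q, scale, zp = linear_quantize(w, bits=4)
w_hat = dequantize(q, scale, zp)
print("max abs quantization error:", np.abs(w - w_hat).max())
```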
Models that exploit nuisance-label\nrelationships face performance degradation when these relationships change.\nBuilding models robust to such changes requires additional knowledge beyond\nsamples of the features and labels. For example, existing work uses annotations\nof nuisances or assumes ERM-trained models depend on nuisances. Approaches to\nintegrate new kinds of additional knowledge enlarge the settings where robust\nmodels can be built. We develop an approach to use knowledge about the\nsemantics by corrupting them in data, and then using the corrupted data to\nproduce models which identify correlations between nuisances and the label.\nOnce these correlations are identified, they can be used to adjust for where\nnuisances drive predictions. We study semantic corruptions in powering\ndifferent spurious-correlation avoiding methods on multiple out-of-distribution\n(OOD) tasks like classifying waterbirds, natural language inference (NLI), and\ndetecting cardiomegaly in chest X-rays.\n","authors":["Aahlad Puli","Nitish Joshi","Yoav Wald","He He","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2210.01302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02888v1","updated":"2024-07-03T08:03:59Z","published":"2024-07-03T08:03:59Z","title":"Joint Optimization of Resource Allocation and Data Selection for Fast\n and Cost-Efficient Federated Edge Learning","summary":" Deploying federated learning at the wireless edge introduces federated edge\nlearning (FEEL). Given FEEL's limited communication resources and potential\nmislabeled data on devices, improper resource allocation or data selection can\nhurt convergence speed and increase training costs. Thus, to realize an\nefficient FEEL system, this paper emphasizes jointly optimizing resource\nallocation and data selection. Specifically, in this work, through rigorously\nmodeling the training process and deriving an upper bound on FEEL's one-round\nconvergence rate, we establish a problem of joint resource allocation and data\nselection, which, unfortunately, cannot be solved directly. Toward this end, we\nequivalently transform the original problem into a solvable form via a variable\nsubstitution and then break it into two subproblems, that is, the resource\nallocation problem and the data selection problem. The two subproblems are\nmixed-integer non-convex and integer non-convex problems, respectively, and\nachieving their optimal solutions is a challenging task. Based on the matching\ntheory and applying the convex-concave procedure and gradient projection\nmethods, we devise a low-complexity suboptimal algorithm for the two\nsubproblems, respectively. Finally, the superiority of our proposed scheme of\njoint resource allocation and data selection is validated by numerical results.\n","authors":["Yunjian Jia","Zhen Huang","Jiping Yan","Yulu Zhang","Kun Luo","Wanli Wen"],"pdf_url":"https://arxiv.org/pdf/2407.02888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08847v2","updated":"2024-07-03T07:57:53Z","published":"2024-01-16T21:45:08Z","title":"RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and\n Efficiency Assessment of Medical Image Segmentation Models","summary":" Deep learning techniques hold immense promise for advancing medical image\nanalysis, particularly in tasks like image segmentation, where precise\nannotation of regions or volumes of interest within medical images is crucial\nbut manually laborious and prone to interobserver and intraobserver biases. 
As\nsuch, deep learning approaches could provide automated solutions for such\napplications. However, the potential of these techniques is often undermined by\nchallenges in reproducibility and generalizability, which are key barriers to\ntheir clinical adoption. This paper introduces the RIDGE checklist, a\ncomprehensive framework designed to assess the Reproducibility, Integrity,\nDependability, Generalizability, and Efficiency of deep learning-based medical\nimage segmentation models. The RIDGE checklist is not just a tool for\nevaluation but also a guideline for researchers striving to improve the quality\nand transparency of their work. By adhering to the principles outlined in the\nRIDGE checklist, researchers can ensure that their developed segmentation\nmodels are robust, scientifically valid, and applicable in a clinical setting.\n","authors":["Farhad Maleki","Linda Moy","Reza Forghani","Tapotosh Ghosh","Katie Ovens","Steve Langer","Pouria Rouzrokh","Bardia Khosravi","Ali Ganjizadeh","Daniel Warren","Roxana Daneshjou","Mana Moassefi","Atlas Haddadi Avval","Susan Sotardi","Neil Tenenholtz","Felipe Kitamura","Timothy Kline"],"pdf_url":"https://arxiv.org/pdf/2401.08847v2.pdf","comment":"24 pages, 1 Figure, 2 Table"},{"id":"http://arxiv.org/abs/2307.02129v5","updated":"2024-07-03T07:57:00Z","published":"2023-07-05T09:11:09Z","title":"How Deep Neural Networks Learn Compositional Data: The Random Hierarchy\n Model","summary":" Deep learning algorithms demonstrate a surprising ability to learn\nhigh-dimensional tasks from limited examples. This is commonly attributed to\nthe depth of neural networks, enabling them to build a hierarchy of abstract,\nlow-dimensional data representations. However, how many training examples are\nrequired to learn such representations remains unknown. To quantitatively study\nthis question, we introduce the Random Hierarchy Model: a family of synthetic\ntasks inspired by the hierarchical structure of language and images. The model\nis a classification task where each class corresponds to a group of high-level\nfeatures, chosen among several equivalent groups associated with the same\nclass. In turn, each feature corresponds to a group of sub-features chosen\namong several equivalent ones and so on, following a hierarchy of composition\nrules. We find that deep networks learn the task by developing internal\nrepresentations invariant to exchanging equivalent groups. Moreover, the number\nof data required corresponds to the point where correlations between low-level\nfeatures and classes become detectable. Overall, our results indicate how deep\nnetworks overcome the curse of dimensionality by building invariant\nrepresentations, and provide an estimate of the number of data required to\nlearn a hierarchical task.\n","authors":["Francesco Cagnetta","Leonardo Petrini","Umberto M. Tomasini","Alessandro Favero","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2307.02129v5.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.02881v1","updated":"2024-07-03T07:56:51Z","published":"2024-07-03T07:56:51Z","title":"ShiftAddAug: Augment Multiplication-Free Tiny Neural Network with Hybrid\n Computation","summary":" Operators devoid of multiplication, such as Shift and Add, have gained\nprominence for their compatibility with hardware. However, neural networks\n(NNs) employing these operators typically exhibit lower accuracy compared to\nconventional NNs with identical structures. 
ShiftAddAug uses costly\nmultiplication to augment efficient but less powerful multiplication-free\noperators, improving performance without any inference overhead. It puts a\nShiftAdd tiny NN into a large multiplicative model and encourages it to be\ntrained as a sub-model to obtain additional supervision. In order to solve the\nweight discrepancy problem between hybrid operators, a new weight sharing\nmethod is proposed. Additionally, a novel two-stage neural architecture search\nis used to obtain better augmentation effects for smaller but stronger\nmultiplication-free tiny neural networks. The superiority of ShiftAddAug is\nvalidated through experiments in image classification and semantic\nsegmentation, consistently delivering noteworthy enhancements. Remarkably, it\nsecures up to a 4.95% increase in accuracy on the CIFAR100 compared to its\ndirectly trained counterparts, even surpassing the performance of\nmultiplicative NNs.\n","authors":["Yipin Guo","Zihao Li","Yilin Lang","Qinyuan Ren"],"pdf_url":"https://arxiv.org/pdf/2407.02881v1.pdf","comment":"Accepted by 2024 CVPR Workshop : Efficient Deep Learning for Computer\n Vision"},{"id":"http://arxiv.org/abs/2407.02880v1","updated":"2024-07-03T07:54:08Z","published":"2024-07-03T07:54:08Z","title":"Knowledge Composition using Task Vectors with Learned Anisotropic\n Scaling","summary":" Pre-trained models produce strong generic representations that can be adapted\nvia fine-tuning. The learned weight difference relative to the pre-trained\nmodel, known as a task vector, characterises the direction and stride of\nfine-tuning. The significance of task vectors is such that simple arithmetic\noperations on them can be used to combine diverse representations from\ndifferent domains. This paper builds on these properties of task vectors and\naims to answer (1) whether components of task vectors, particularly parameter\nblocks, exhibit similar characteristics, and (2) how such blocks can be used to\nenhance knowledge composition and transfer. To this end, we introduce aTLAS, an\nalgorithm that linearly combines parameter blocks with different learned\ncoefficients, resulting in anisotropic scaling at the task vector level. We\nshow that such linear combinations explicitly exploit the low intrinsic\ndimensionality of pre-trained models, with only a few coefficients being the\nlearnable parameters. Furthermore, composition of parameter blocks leverages\nthe already learned representations, thereby reducing the dependency on large\namounts of data. We demonstrate the effectiveness of our method in task\narithmetic, few-shot recognition and test-time adaptation, with supervised or\nunsupervised objectives. In particular, we show that (1) learned anisotropic\nscaling allows task vectors to be more disentangled, causing less interference\nin composition; (2) task vector composition excels with scarce or no labeled\ndata and is less prone to domain shift, thus leading to better\ngeneralisability; (3) mixing the most informative parameter blocks across\ndifferent task vectors prior to training can reduce the memory footprint and\nimprove the flexibility of knowledge transfer. Moreover, we show the potential\nof aTLAS as a PEFT method, particularly with less data, and demonstrate\nits scalability.\n","authors":["Frederic Z. 
Zhang","Paul Albert","Cristian Rodriguez-Opazo","Anton van den Hengel","Ehsan Abbasnejad"],"pdf_url":"https://arxiv.org/pdf/2407.02880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04032v3","updated":"2024-07-03T07:36:43Z","published":"2023-02-08T13:08:51Z","title":"A Systematic Performance Analysis of Deep Perceptual Loss Networks:\n Breaking Transfer Learning Conventions","summary":" In recent years, deep perceptual loss has been widely and successfully used\nto train machine learning models for many computer vision tasks, including\nimage synthesis, segmentation, and autoencoding. Deep perceptual loss is a type\nof loss function for images that computes the error between two images as the\ndistance between deep features extracted from a neural network. Most\napplications of the loss use pretrained networks called loss networks for deep\nfeature extraction. However, despite increasingly widespread use, the effects\nof loss network implementation on the trained models have not been studied.\n This work rectifies this through a systematic evaluation of the effect of\ndifferent pretrained loss networks on four different application areas.\nSpecifically, the work evaluates 14 different pretrained architectures with\nfour different feature extraction layers. The evaluation reveals that VGG\nnetworks without batch normalization have the best performance and that the\nchoice of feature extraction layer is at least as important as the choice of\narchitecture. The analysis also reveals that deep perceptual loss does not\nadhere to the transfer learning conventions that better ImageNet accuracy\nimplies better downstream performance and that feature extraction from the\nlater layers provides better performance.\n","authors":["Gustav Grund Pihlgren","Konstantina Nikolaidou","Prakash Chandra Chhipa","Nosheen Abid","Rajkumar Saini","Fredrik Sandin","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2302.04032v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02870v1","updated":"2024-07-03T07:34:49Z","published":"2024-07-03T07:34:49Z","title":"Membership Inference Attacks Against Time-Series Models","summary":" Analyzing time-series data that may contain personal information,\nparticularly in the medical field, presents serious privacy concerns. Sensitive\nhealth data from patients is often used to train machine-learning models for\ndiagnostics and ongoing care. Assessing the privacy risk of such models is\ncrucial to making knowledgeable decisions on whether to use a model in\nproduction, share it with third parties, or deploy it in patients homes.\nMembership Inference Attacks (MIA) are a key method for this kind of\nevaluation, however time-series prediction models have not been thoroughly\nstudied in this context. We explore existing MIA techniques on time-series\nmodels, and introduce new features, focusing on the seasonality and trend\ncomponents of the data. Seasonality is estimated using a multivariate Fourier\ntransform, and a low-degree polynomial is used to approximate trends. We\napplied these techniques to various types of time-series models, using datasets\nfrom the health domain. 
Our results demonstrate that these new features enhance\nthe effectiveness of MIAs in identifying membership, improving the\nunderstanding of privacy risks in medical data applications.\n","authors":["Noam Koren","Abigail Goldsteen","Ariel Farkash","Guy Amit"],"pdf_url":"https://arxiv.org/pdf/2407.02870v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.17653v2","updated":"2024-07-03T07:28:13Z","published":"2024-01-31T08:13:35Z","title":"A primer on synthetic health data","summary":" Recent advances in deep generative models have greatly expanded the potential\nto create realistic synthetic health datasets. These synthetic datasets aim to\npreserve the characteristics, patterns, and overall scientific conclusions\nderived from sensitive health datasets without disclosing patient identity or\nsensitive information. Thus, synthetic data can facilitate safe data sharing\nthat supports a range of initiatives including the development of new\npredictive models, advanced health IT platforms, and general project ideation\nand hypothesis development. However, many questions and challenges remain,\nincluding how to consistently evaluate a synthetic dataset's similarity and\npredictive utility in comparison to the original real dataset and risk to\nprivacy when shared. Additional regulatory and governance issues have not been\nwidely addressed. In this primer, we map the state of synthetic health data,\nincluding generation and evaluation methods and tools, existing examples of\ndeployment, the regulatory and ethical landscape, access and governance\noptions, and opportunities for further development.\n","authors":["Jennifer A Bartell","Sander Boisen Valentin","Anders Krogh","Henning Langberg","Martin Bøgsted"],"pdf_url":"https://arxiv.org/pdf/2401.17653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02861v1","updated":"2024-07-03T07:19:41Z","published":"2024-07-03T07:19:41Z","title":"A Self-Supervised Task for Fault Detection in Satellite Multivariate\n Time Series","summary":" In the space sector, due to environmental conditions and restricted\naccessibility, robust fault detection methods are imperative for ensuring\nmission success and safeguarding valuable assets. This work proposes a novel\napproach leveraging Physics-Informed Real NVP neural networks, renowned for\ntheir ability to model complex and high-dimensional distributions, augmented\nwith a self-supervised task based on sensors' data permutation. It focuses on\nenhancing fault detection within the satellite multivariate time series. The\nexperiments involve various configurations, including pre-training with\nself-supervision, multi-task learning, and standalone self-supervised training.\nResults indicate significant performance improvements across all settings. In\nparticular, employing only the self-supervised loss yields the best overall\nresults, suggesting its efficacy in guiding the network to extract relevant\nfeatures for fault detection. 
This study presents a promising direction for\nimproving fault detection in space systems and warrants further exploration in\nother datasets and applications.\n","authors":["Carlo Cena","Silvia Bucci","Alessandro Balossino","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2407.02861v1.pdf","comment":"SPAICE: AI in and for Space, 2024"},{"id":"http://arxiv.org/abs/2312.12223v4","updated":"2024-07-03T07:15:51Z","published":"2023-12-19T15:11:46Z","title":"Self-Supervised Detection of Perfect and Partial Input-Dependent\n Symmetries","summary":" Group equivariance can overly constrain models if the symmetries in the group\ndiffer from those observed in data. While common methods address this by\ndetermining the appropriate level of symmetry at the dataset level, they are\nlimited to supervised settings and ignore scenarios in which multiple levels of\nsymmetry co-exist in the same dataset. In this paper, we propose a method able\nto detect the level of symmetry of each input without the need for labels. Our\nframework is general enough to accommodate different families of both\ncontinuous and discrete symmetry distributions, such as arbitrary unimodal,\nsymmetric distributions and discrete groups. We validate the effectiveness of\nour approach on synthetic datasets with different per-class levels of\nsymmetries, and demonstrate practical applications such as the detection of\nout-of-distribution symmetries. Our code is publicly available at\nhttps://github.com/aurban0/ssl-sym.\n","authors":["Alonso Urbano","David W. Romero"],"pdf_url":"https://arxiv.org/pdf/2312.12223v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02856v1","updated":"2024-07-03T07:14:25Z","published":"2024-07-03T07:14:25Z","title":"Early-Stage Anomaly Detection: A Study of Model Performance on Complete\n vs. Partial Flows","summary":" This study investigates the efficacy of machine learning models, specifically\nRandom Forest, in anomaly detection systems when trained on complete flow\nrecords and tested on partial flow data. We explore the performance disparity\nthat arises when models are applied to incomplete data typical in real-world,\nreal-time network environments. Our findings demonstrate a significant decline\nin model performance, with precision and recall dropping by up to 30\\% under\ncertain conditions when models trained on complete flows are tested against\npartial flows. Conversely, models trained and tested on consistently complete\nor partial datasets maintain robustness, highlighting the importance of dataset\nconsistency in training. The study reveals that a minimum of 7 packets in the\ntest set is required for maintaining reliable detection rates. These results\nunderscore the need for tailored training strategies that can effectively adapt\nto the dynamics of partial data, enhancing the practical applicability of\nanomaly detection systems in operational settings.\n","authors":["Adrian Pekar","Richard Jozsa"],"pdf_url":"https://arxiv.org/pdf/2407.02856v1.pdf","comment":"9 pages, 5 tables, 2 figures"},{"id":"http://arxiv.org/abs/2407.02855v1","updated":"2024-07-03T07:14:05Z","published":"2024-07-03T07:14:05Z","title":"Safe Unlearning: A Surprisingly Effective and Generalizable Solution to\n Defend Against Jailbreak Attacks","summary":" LLMs are known to be vulnerable to jailbreak attacks, even after safety\nalignment. 
An important observation is that, while different types of jailbreak\nattacks can generate significantly different queries, they mostly result in\nsimilar responses that are rooted in the same harmful knowledge (e.g., detailed\nsteps to make a bomb). Therefore, we conjecture that directly unlearning the\nharmful knowledge in the LLM can be a more effective way to defend against\njailbreak attacks than the mainstream supervised fine-tuning (SFT) based\napproaches. Our extensive experiments confirmed our insight and suggested\nsurprising generalizability of our unlearning-based approach: using only 20 raw\nharmful questions \\emph{without} any jailbreak prompt during training, our\nsolution reduced the Attack Success Rate (ASR) in Vicuna-7B on\n\\emph{out-of-distribution} (OOD) harmful questions wrapped with various complex\njailbreak prompts from 82.6\\% to 7.7\\%. This significantly outperforms\nLlama2-7B-Chat, which is fine-tuned on about 0.1M safety alignment samples but\nstill has an ASR of 21.9\\% even with the help of an additional safety system\nprompt. Further analysis reveals that the generalization ability of our\nsolution stems from the intrinsic relatedness among harmful responses across\nharmful questions (e.g., response patterns, shared steps and actions, and\nsimilarity among their learned representations in the LLM). Our code is\navailable at \\url{https://github.com/thu-coai/SafeUnlearning}.\n","authors":["Zhexin Zhang","Junxiao Yang","Pei Ke","Shiyao Cui","Chujie Zheng","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2407.02855v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.04417v4","updated":"2024-07-03T07:02:07Z","published":"2023-07-10T08:45:58Z","title":"Fairness-aware Federated Minimax Optimization with Convergence Guarantee","summary":" Federated learning (FL) has garnered considerable attention due to its\nprivacy-preserving feature. Nonetheless, the lack of freedom in managing user\ndata can lead to group fairness issues, where models are biased towards\nsensitive factors such as race or gender. To tackle this issue, this paper\nproposes a novel algorithm, fair federated averaging with augmented Lagrangian\nmethod (FFALM), designed explicitly to address group fairness issues in FL.\nSpecifically, we impose a fairness constraint on the training objective and\nsolve the minimax reformulation of the constrained optimization problem. Then,\nwe derive the theoretical upper bound for the convergence rate of FFALM. The\neffectiveness of FFALM in improving fairness is shown empirically on CelebA and\nUTKFace datasets in the presence of severe statistical heterogeneity.\n","authors":["Gerry Windiarto Mohamad Dunda","Shenghui Song"],"pdf_url":"https://arxiv.org/pdf/2307.04417v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01851v2","updated":"2024-07-03T07:01:30Z","published":"2024-07-01T23:32:25Z","title":"Meerkat: Audio-Visual Large Language Model for Grounding in Space and\n Time","summary":" Leveraging Large Language Models' remarkable proficiency in text-based tasks,\nrecent works on Multi-modal LLMs (MLLMs) extend them to other modalities like\nvision and audio. However, the progress in these directions has been mostly\nfocused on tasks that only require a coarse-grained understanding of the\naudio-visual semantics. 
We present Meerkat, an audio-visual LLM equipped with a\nfine-grained understanding of image and audio both spatially and temporally.\nWith a new modality alignment module based on optimal transport and a\ncross-attention module that enforces audio-visual consistency, Meerkat can\ntackle challenging tasks such as audio referred image grounding, image guided\naudio temporal localization, and audio-visual fact-checking. Moreover, we\ncarefully curate a large dataset AVFIT that comprises 3M instruction tuning\nsamples collected from open-source datasets, and introduce MeerkatBench that\nunifies five challenging audio-visual tasks. We achieve state-of-the-art\nperformance on all these downstream tasks with a relative improvement of up to\n37.12%.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Subhrajyoti Dasgupta","Jun Chen","Mohamed Elhoseiny","Ruohan Gao","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2407.01851v2.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02844v1","updated":"2024-07-03T06:40:26Z","published":"2024-07-03T06:40:26Z","title":"Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast\n Cancer Segmentation and Identification","summary":" Breast cancer poses a profound threat to lives globally, claiming numerous\nlives each year. Therefore, timely detection is crucial for early intervention\nand improved chances of survival. Accurately diagnosing and classifying breast\ntumors using ultrasound images is a persistent challenge in medicine, demanding\ncutting-edge solutions for improved treatment strategies. This research\nintroduces multiattention-enhanced deep learning (DL) frameworks designed for\nthe classification and segmentation of breast cancer tumors from ultrasound\nimages. A spatial channel attention mechanism is proposed for segmenting tumors\nfrom ultrasound images, utilizing a novel LinkNet DL framework with an\nInceptionResNet backbone. Following this, the paper proposes a deep\nconvolutional neural network with an integrated multi-attention framework\n(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal.\nFrom experimental results, it is observed that the segmentation model has\nrecorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also\nachieved high Intersection over Union (IoU) and Dice Coefficient scores of\n96.9% and 97.2%, respectively. Similarly, the classification model has attained\nan accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification\nframework has achieved outstanding F1-Score, precision, and recall values of\n99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early\ndetection and accurate classification of breast cancer, this proposed work\nsignificantly advances the field of medical image analysis, potentially\nimproving diagnostic precision and patient outcomes.\n","authors":["Pandiyaraju V","Shravan Venkatraman","Pavan Kumar S","Santhosh Malarvannan","Kannan A"],"pdf_url":"https://arxiv.org/pdf/2407.02844v1.pdf","comment":"32 pages, 18 figures, 6 tables"},{"id":"http://arxiv.org/abs/2405.00532v3","updated":"2024-07-03T06:34:31Z","published":"2024-05-01T14:05:52Z","title":"ULLER: A Unified Language for Learning and Reasoning","summary":" The field of neuro-symbolic artificial intelligence (NeSy), which combines\nlearning and reasoning, has recently experienced significant growth. 
There now\nare a wide variety of NeSy frameworks, each with its own specific language for\nexpressing background knowledge and how to relate it to neural networks. This\nheterogeneity hinders accessibility for newcomers and makes comparing different\nNeSy frameworks challenging. We propose a unified language for NeSy, which we\ncall ULLER, a Unified Language for LEarning and Reasoning. ULLER encompasses a\nwide variety of settings, while ensuring that knowledge described in it can be\nused in existing NeSy systems. ULLER has a neuro-symbolic first-order syntax\nfor which we provide example semantics including classical, fuzzy, and\nprobabilistic logics. We believe ULLER is a first step towards making NeSy\nresearch more accessible and comparable, paving the way for libraries that\nstreamline training and evaluation across a multitude of semantics, knowledge\nbases, and NeSy systems.\n","authors":["Emile van Krieken","Samy Badreddine","Robin Manhaeve","Eleonora Giunchiglia"],"pdf_url":"https://arxiv.org/pdf/2405.00532v3.pdf","comment":"Pre-review version. Final version accepted at NeSy 2024"},{"id":"http://arxiv.org/abs/2407.02833v1","updated":"2024-07-03T06:20:31Z","published":"2024-07-03T06:20:31Z","title":"LANE: Logic Alignment of Non-tuning Large Language Models and Online\n Recommendation Systems for Explainable Reason Generation","summary":" The explainability of recommendation systems is crucial for enhancing user\ntrust and satisfaction. Leveraging large language models (LLMs) offers new\nopportunities for comprehensive recommendation logic generation. However, in\nexisting related studies, fine-tuning LLM models for recommendation tasks\nincurs high computational costs and alignment issues with existing systems,\nlimiting the application potential of proven proprietary/closed-source LLM\nmodels, such as GPT-4. In this work, our proposed effective strategy LANE\naligns LLMs with online recommendation systems without additional LLMs tuning,\nreducing costs and improving explainability. This innovative approach addresses\nkey challenges in integrating language models with recommendation systems while\nfully utilizing the capabilities of powerful proprietary models. Specifically,\nour strategy operates through several key components: semantic embedding, user\nmulti-preference extraction using zero-shot prompting, semantic alignment, and\nexplainable recommendation generation using Chain of Thought (CoT) prompting.\nBy embedding item titles instead of IDs and utilizing multi-head attention\nmechanisms, our approach aligns the semantic features of user preferences with\nthose of candidate items, ensuring coherent and user-aligned recommendations.\nSufficient experimental results including performance comparison, questionnaire\nvoting, and visualization cases prove that our method can not only ensure\nrecommendation performance, but also provide easy-to-understand and reasonable\nrecommendation logic.\n","authors":["Hongke Zhao","Songming Zheng","Likang Wu","Bowen Yu","Jing Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17376v3","updated":"2024-07-03T06:16:31Z","published":"2024-02-27T10:13:30Z","title":"Accelerating Diffusion Sampling with Optimized Time Steps","summary":" Diffusion probabilistic models (DPMs) have shown remarkable performance in\nhigh-resolution image synthesis, but their sampling efficiency is still to be\ndesired due to the typically large number of sampling steps. 
Recent\nadvancements in high-order numerical ODE solvers for DPMs have enabled the\ngeneration of high-quality images with much fewer sampling steps. While this is\na significant development, most sampling methods still employ uniform time\nsteps, which is not optimal when using a small number of steps. To address this\nissue, we propose a general framework for designing an optimization problem\nthat seeks more appropriate time steps for a specific numerical ODE solver for\nDPMs. This optimization problem aims to minimize the distance between the\nground-truth solution to the ODE and an approximate solution corresponding to\nthe numerical solver. It can be efficiently solved using the constrained trust\nregion method, taking less than $15$ seconds. Our extensive experiments on both\nunconditional and conditional sampling using pixel- and latent-space DPMs\ndemonstrate that, when combined with the state-of-the-art sampling method\nUniPC, our optimized time steps significantly improve image generation\nperformance in terms of FID scores for datasets such as CIFAR-10 and ImageNet,\ncompared to using uniform time steps.\n","authors":["Shuchen Xue","Zhaoqiang Liu","Fei Chen","Shifeng Zhang","Tianyang Hu","Enze Xie","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2402.17376v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2312.12736v2","updated":"2024-07-03T06:13:31Z","published":"2023-12-20T03:18:50Z","title":"Learning and Forgetting Unsafe Examples in Large Language Models","summary":" As the number of large language models (LLMs) released to the public grows,\nthere is a pressing need to understand the safety implications associated with\nthese models learning from third-party custom finetuning data. We explore the\nbehavior of LLMs finetuned on noisy custom data containing unsafe content,\nrepresented by datasets that contain biases, toxicity, and harmfulness, finding\nthat while aligned LLMs can readily learn this unsafe content, they also tend\nto forget it more significantly than other examples when subsequently finetuned\non safer content. Drawing inspiration from the discrepancies in forgetting, we\nintroduce the \"ForgetFilter\" algorithm, which filters unsafe data based on how\nstrong the model's forgetting signal is for that data. We demonstrate that the\nForgetFilter algorithm ensures safety in customized finetuning without\ncompromising downstream task performance, unlike sequential safety finetuning.\nForgetFilter outperforms alternative strategies like replay and moral\nself-correction in curbing LLMs' ability to assimilate unsafe content during\ncustom finetuning, e.g. 75% lower than not applying any safety measures and 62%\nlower than using self-correction in toxicity score.\n","authors":["Jiachen Zhao","Zhun Deng","David Madras","James Zou","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2312.12736v2.pdf","comment":"accepted by ICML 24"},{"id":"http://arxiv.org/abs/2407.02827v1","updated":"2024-07-03T06:10:41Z","published":"2024-07-03T06:10:41Z","title":"Convergence of Implicit Gradient Descent for Training Two-Layer\n Physics-Informed Neural Networks","summary":" Optimization algorithms are crucial in training physics-informed neural\nnetworks (PINNs); unsuitable methods may lead to poor solutions. Compared to\nthe common gradient descent algorithm, implicit gradient descent (IGD)\noutperforms it in handling some multi-scale problems. In this paper, we provide\nconvergence analysis for the implicit gradient descent for training\nover-parametrized two-layer PINNs. 
We first demonstrate the positive\ndefiniteness of Gram matrices for general smooth activation functions, like\nsigmoidal function, softplus function, tanh function and so on. Then the\nover-parameterization allows us to show that the randomly initialized IGD\nconverges to a globally optimal solution at a linear convergence rate. Moreover,\ndue to the different training dynamics, the learning rate of IGD can be chosen\nindependent of the sample size and the least eigenvalue of the Gram matrix.\n","authors":["Xianliang Xu","Zhongyi Huang","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2407.02827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03034v2","updated":"2024-07-03T06:09:14Z","published":"2023-07-06T14:56:13Z","title":"PCL-Indexability and Whittle Index for Restless Bandits with General\n Observation Models","summary":" In this paper, we consider a general observation model for restless\nmulti-armed bandit problems. The operation of the player needs to be based on\na certain feedback mechanism that is error-prone due to resource constraints or\nenvironmental or intrinsic noises. By establishing a general probabilistic\nmodel for dynamics of feedback/observation, we formulate the problem as a\nrestless bandit with a countable belief state space starting from an arbitrary\ninitial belief (a priori information). We apply the achievable region method\nwith partial conservation law (PCL) to the infinite-state problem and analyze\nits indexability and priority index (Whittle index). Finally, we propose an\napproximation process to transform the problem into one to which the AG algorithm of\nNi\\~no-Mora and Bertsimas for finite-state problems can be applied.\nNumerical experiments show that our algorithm has an excellent performance.\n","authors":["Keqin Liu","Chengzhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.03034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13516v4","updated":"2024-07-03T05:56:49Z","published":"2024-02-21T03:58:49Z","title":"ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity\n within Large Language Models","summary":" Activation sparsity refers to the existence of considerable\nweakly-contributed elements among activation outputs. As a prevalent property\nof the models using the ReLU activation function, activation sparsity has been\nproven a promising paradigm to boost model inference efficiency. Nevertheless,\nmost large language models (LLMs) adopt activation functions without intrinsic\nactivation sparsity (e.g., GELU and Swish). Some recent efforts have explored\nintroducing ReLU or its variants as the substitutive activation function to\nhelp LLMs achieve activation sparsity and inference acceleration, but few can\nsimultaneously obtain high sparsity and comparable model performance. This\npaper introduces a simple and effective sparsification method named \"ProSparse\"\nto push LLMs for higher activation sparsity while maintaining comparable\nperformance. Specifically, after substituting the activation function of LLMs\nwith ReLU, ProSparse adopts progressive sparsity regularization with a factor\nsmoothly increasing along the multi-stage sine curves. This can enhance\nactivation sparsity and mitigate performance degradation by avoiding radical\nshifts in activation distributions. With ProSparse, we obtain high sparsity of\n89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size\nMiniCPM-1B, respectively, achieving comparable performance to their original\nSwish-activated versions. 
These present the most sparsely activated models\namong open-source LLaMA versions and competitive end-size models, considerably\nsurpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). Our inference\nacceleration experiments further demonstrate the significant practical\nacceleration potential of LLMs with higher activation sparsity, obtaining up to\n4.52$\\times$ inference speedup.\n","authors":["Chenyang Song","Xu Han","Zhengyan Zhang","Shengding Hu","Xiyu Shi","Kuai Li","Chen Chen","Zhiyuan Liu","Guangli Li","Tao Yang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.13516v4.pdf","comment":"19 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.02825v1","updated":"2024-07-03T05:51:57Z","published":"2024-07-03T05:51:57Z","title":"Representation learning with CGAN for casual inference","summary":" Conditional Generative Adversarial Nets (CGAN) is often used to improve\nconditional image generation performance. However, there is little research on\nRepresentation learning with CGAN for causal inference. This paper proposes a\nnew method for finding representation learning functions by adopting the\nadversarial idea. We apply the pattern of CGAN and theoretically demonstrate the\nfeasibility of finding a suitable representation function in the context of two\ndistributions being balanced. The theoretical result shows that when two\ndistributions are balanced, the ideal representation function can be found and\nthus can be used in further research.\n","authors":["Zhaotian Weng","Jianbo Hong","Lan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.02825v1.pdf","comment":"Proceedings of the 3rd International Conference on Signal Processing\n and Machine Learning"},{"id":"http://arxiv.org/abs/2405.18881v2","updated":"2024-07-03T05:45:45Z","published":"2024-05-29T08:39:39Z","title":"Tuning-Free Alignment of Diffusion Models with Direct Noise Optimization","summary":" In this work, we focus on the alignment problem of diffusion models with a\ncontinuous reward function, which represents specific objectives for downstream\ntasks, such as improving human preference. The central goal of the alignment\nproblem is to adjust the distribution learned by diffusion models such that the\ngenerated samples maximize the target reward function. We propose a novel\nalignment approach, named Direct Noise Optimization (DNO), that optimizes the\ninjected noise during the sampling process of diffusion models. By design, DNO\nis tuning-free and prompt-agnostic, as the alignment occurs in an online\nfashion during generation. We rigorously study the theoretical properties of\nDNO and also propose variants to deal with non-differentiable reward functions.\nFurthermore, we identify that naive implementation of DNO occasionally suffers\nfrom the out-of-distribution reward hacking problem, where optimized samples\nhave high rewards but are no longer in the support of the pretrained\ndistribution. To remedy this issue, we leverage classical high-dimensional\nstatistics theory and propose to augment the DNO loss with certain probability\nregularization. 
We conduct extensive experiments on several popular reward\nfunctions trained on human feedback data and demonstrate that the proposed DNO\napproach achieves state-of-the-art reward scores as well as high image quality,\nall within a reasonable time budget for generation.\n","authors":["Zhiwei Tang","Jiangweizhi Peng","Jiasheng Tang","Mingyi Hong","Fan Wang","Tsung-Hui Chang"],"pdf_url":"https://arxiv.org/pdf/2405.18881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02821v1","updated":"2024-07-03T05:45:09Z","published":"2024-07-03T05:45:09Z","title":"Effect of a Process Mining based Pre-processing Step in Prediction of\n the Critical Health Outcomes","summary":" Predicting critical health outcomes such as patient mortality and hospital\nreadmission is essential for improving survivability. However, healthcare\ndatasets have many concurrences that create complexities, leading to poor\npredictions. Consequently, pre-processing the data is crucial to improve its\nquality. In this study, we use an existing pre-processing algorithm,\nconcatenation, to improve data quality by decreasing the complexity of\ndatasets. Sixteen healthcare datasets were extracted from two databases - MIMIC\nIII and University of Illinois Hospital - converted to the event logs, they\nwere then fed into the concatenation algorithm. The pre-processed event logs\nwere then fed to the Split Miner (SM) algorithm to produce a process model.\nProcess model quality was evaluated before and after concatenation using the\nfollowing metrics: fitness, precision, F-Measure, and complexity. The\npre-processed event logs were also used as inputs to the Decay Replay Mining\n(DREAM) algorithm to predict critical outcomes. We compared predicted results\nbefore and after applying the concatenation algorithm using Area Under the\nCurve (AUC) and Confidence Intervals (CI). Results indicated that the\nconcatenation algorithm improved the quality of the process models and\npredictions of the critical health outcomes.\n","authors":["Negin Ashrafi","Armin Abdollahi","Greg Placencia","Maryam Pishgar"],"pdf_url":"https://arxiv.org/pdf/2407.02821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02819v1","updated":"2024-07-03T05:40:41Z","published":"2024-07-03T05:40:41Z","title":"Efficient Training of Language Models with Compact and Consistent Next\n Token Distributions","summary":" Maximizing the likelihood of the next token is an established, statistically\nsound objective for pre-training language models. In this paper we show that we\ncan train better models faster by pre-aggregating the corpus with a collapsed\n$n$-gram distribution. Previous studies have proposed corpus-level $n$-gram\nstatistics as a regularizer; however, the construction and querying of such\n$n$-grams, if done naively, prove to be costly and significantly impede\ntraining speed, thereby limiting their application in modern large language\nmodel pre-training.\n We introduce an alternative compact representation of the next token\ndistribution that, in expectation, aligns with the complete $n$-gram\ndistribution while markedly reducing variance across mini-batches compared to\nthe standard next-token loss. 
Empirically, we demonstrate that both the\n$n$-gram regularized model and our approximation yield substantial improvements\nin model quality and convergence rate compared to existing methods.\nFurthermore, our approximation facilitates scalability of gains to larger\ndatasets and models compared to the straightforward $n$-gram regularization\nmethod.\n","authors":["Ashutosh Sathe","Sunita Sarawagi"],"pdf_url":"https://arxiv.org/pdf/2407.02819v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.01012v3","updated":"2024-07-03T05:36:00Z","published":"2024-07-01T06:52:34Z","title":"Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural\n Network Performance","summary":" We propose the Swish-T family, an enhancement of the existing non-monotonic\nactivation function Swish. Swish-T is defined by adding a Tanh bias to the\noriginal Swish function. This modification creates a family of Swish-T\nvariants, each designed to excel in different tasks, showcasing specific\nadvantages depending on the application context. The Tanh bias allows for\nbroader acceptance of negative values during initial training stages, offering\na smoother non-monotonic curve than the original Swish. We ultimately propose\nthe Swish-T$_{\\textbf{C}}$ function, while Swish-T and Swish-T$_{\\textbf{B}}$,\nbyproducts of Swish-T$_{\\textbf{C}}$, also demonstrate satisfactory\nperformance. Furthermore, our ablation study shows that using\nSwish-T$_{\\textbf{C}}$ as a non-parametric function can still achieve high\nperformance. The superiority of the Swish-T family has been empirically\ndemonstrated across various models and benchmark datasets, including MNIST,\nFashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at\nhttps://github.com/ictseoyoungmin/Swish-T-pytorch.\n","authors":["Youngmin Seo","Jinha Kim","Unsang Park"],"pdf_url":"https://arxiv.org/pdf/2407.01012v3.pdf","comment":"11 pages, 6 figures Revised the derivative of the sigmoid function\n from 1-sigmoid to sigmoid(1-sigmoid) for correctness.Updated related\n equations in Section 3.2. Conclusions to Conclusion in Section 6"},{"id":"http://arxiv.org/abs/2405.13383v2","updated":"2024-07-03T05:27:45Z","published":"2024-05-22T06:33:48Z","title":"Gradient Projection For Continual Parameter-Efficient Tuning","summary":" Parameter-efficient tunings (PETs) have demonstrated impressive performance\nand promising perspectives in training large models, while they are still\nconfronted with a common problem: the trade-off between learning new content\nand protecting old knowledge, e.g., zero-shot generalization ability, and\ncross-modal hallucination. In this paper, we reformulate Adapter, LoRA,\nPrefix-tuning, and Prompt-tuning from the perspective of gradient projection,\nand firstly propose a unified framework called Parameter Efficient Gradient\nProjection (PEGP). We introduce orthogonal gradient projection into different\nPET paradigms and theoretically demonstrate that the orthogonal condition for\nthe gradient can effectively resist forgetting even for large-scale models. It\ntherefore modifies the gradient towards the direction that has less impact on\nthe old feature space, with less extra memory space and training time. We\nextensively evaluate our method with different backbones, including ViT and\nCLIP, on diverse datasets, and experiments comprehensively demonstrate its\nefficiency in reducing forgetting in class, online class, domain, task, and\nmulti-modality continual settings. 
The project page is available at\nhttps://dmcv-ecnu-pegp.github.io/.\n","authors":["Jingyang Qiao","Zhizhong Zhang","Xin Tan","Yanyun Qu","Wensheng Zhang","Zhi Han","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2405.13383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02813v1","updated":"2024-07-03T05:17:26Z","published":"2024-07-03T05:17:26Z","title":"Data Overfitting for On-Device Super-Resolution with Dynamic Algorithm\n and Compiler Co-Design","summary":" Deep neural networks (DNNs) are frequently employed in a variety of computer\nvision applications. Nowadays, an emerging trend in the current video\ndistribution system is to take advantage of DNN's overfitting properties to\nperform video resolution upscaling. By splitting videos into chunks and\napplying a super-resolution (SR) model to overfit each chunk, this scheme of SR\nmodels plus video chunks is able to replace traditional video transmission to\nenhance video quality and transmission efficiency. However, many models and\nchunks are needed to guarantee high performance, which leads to tremendous\noverhead on model switching and memory footprints at the user end. To resolve\nsuch problems, we propose a Dynamic Deep neural network assisted by a\nContent-Aware data processing pipeline to reduce the model number down to one\n(Dy-DCA), which helps promote performance while conserving computational\nresources. Additionally, to achieve real acceleration on the user end, we\ndesigned a framework that optimizes dynamic features (e.g., dynamic shapes,\nsizes, and control flow) in Dy-DCA to enable a series of compilation\noptimizations, including fused code generation, static execution planning, etc.\nBy employing such techniques, our method achieves better PSNR and real-time\nperformance (33 FPS) on an off-the-shelf mobile phone. Meanwhile, assisted by\nour compilation optimization, we achieve a 1.7$\\times$ speedup while saving up\nto 1.61$\\times$ memory consumption. Code available in\nhttps://github.com/coulsonlee/Dy-DCA-ECCV2024.\n","authors":["Gen Li","Zhihao Shu","Jie Ji","Minghai Qin","Fatemeh Afghah","Wei Niu","Xiaolong Ma"],"pdf_url":"https://arxiv.org/pdf/2407.02813v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2407.02811v1","updated":"2024-07-03T05:13:28Z","published":"2024-07-03T05:13:28Z","title":"SPLITZ: Certifiable Robustness via Split Lipschitz Randomized Smoothing","summary":" Certifiable robustness gives the guarantee that small perturbations around an\ninput to a classifier will not change the prediction. There are two approaches\nto provide certifiable robustness to adversarial examples: a) explicitly\ntraining classifiers with small Lipschitz constants, and b) Randomized\nsmoothing, which adds random noise to the input to create a smooth classifier.\nWe propose \\textit{SPLITZ}, a practical and novel approach which leverages the\nsynergistic benefits of both the above ideas into a single framework. Our main\nidea is to \\textit{split} a classifier into two halves, constrain the Lipschitz\nconstant of the first half, and smooth the second half via randomization.\nMotivation for \\textit{SPLITZ} comes from the observation that many standard\ndeep networks exhibit heterogeneity in Lipschitz constants across layers.\n\\textit{SPLITZ} can exploit this heterogeneity while inheriting the scalability\nof randomized smoothing. We present a principled approach to train\n\\textit{SPLITZ} and provide theoretical analysis to derive certified robustness\nguarantees during inference. 
We present a comprehensive comparison of\nrobustness-accuracy tradeoffs and show that \\textit{SPLITZ} consistently\nimproves upon existing state-of-the-art approaches on MNIST and CIFAR-10\ndatasets. For instance, with $\\ell_2$ norm perturbation budget of\n\\textbf{$\\epsilon=1$}, \\textit{SPLITZ} achieves $\\textbf{43.2\\%}$ top-1 test\naccuracy on CIFAR-10 dataset compared to state-of-art top-1 test accuracy\n$\\textbf{39.8\\%}\n","authors":["Meiyu Zhong","Ravi Tandon"],"pdf_url":"https://arxiv.org/pdf/2407.02811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03898v2","updated":"2024-07-03T04:57:41Z","published":"2024-02-06T11:10:35Z","title":"DistiLLM: Towards Streamlined Distillation for Large Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\na smaller student model, reducing its inference cost and memory footprint while\npreserving model capabilities. However, current KD methods for auto-regressive\nsequence models (e.g., large language models) suffer from missing a\nstandardized objective function. Moreover, the recent use of student-generated\noutputs to address training-inference mismatches has significantly escalated\ncomputational costs. To tackle these issues, we introduce DistiLLM, a more\neffective and efficient KD framework for auto-regressive language models.\nDistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence\nloss, where we unveil and leverage its theoretical properties, and (2) an\nadaptive off-policy approach designed to enhance the efficiency in utilizing\nstudent-generated outputs. Extensive experiments, including\ninstruction-following tasks, demonstrate the effectiveness of DistiLLM in\nbuilding high-performing student models while achieving up to 4.3$\\times$\nspeedup compared to recent KD methods.\n","authors":["Jongwoo Ko","Sungnyun Kim","Tianyi Chen","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2402.03898v2.pdf","comment":"ICML 2024; Code is available at https://github.com/jongwooko/distillm"},{"id":"http://arxiv.org/abs/2310.04218v5","updated":"2024-07-03T04:41:05Z","published":"2023-10-06T13:05:07Z","title":"A Fixed-Parameter Tractable Algorithm for Counting Markov Equivalence\n Classes with the same Skeleton","summary":" Causal DAGs (also known as Bayesian networks) are a popular tool for encoding\nconditional dependencies between random variables. In a causal DAG, the random\nvariables are modeled as vertices in the DAG, and it is stipulated that every\nrandom variable is independent of its ancestors conditioned on its parents. It\nis possible, however, for two different causal DAGs on the same set of random\nvariables to encode exactly the same set of conditional dependencies. Such\ncausal DAGs are said to be Markov equivalent, and equivalence classes of Markov\nequivalent DAGs are known as Markov Equivalent Classes (MECs). Beautiful\ncombinatorial characterizations of MECs have been developed in the past few\ndecades, and it is known, in particular that all DAGs in the same MEC must have\nthe same \"skeleton\" (underlying undirected graph) and v-structures (induced\nsubgraph of the form $a\\rightarrow b \\leftarrow c$).\n These combinatorial characterizations also suggest several natural\nalgorithmic questions. One of these is: given an undirected graph $G$ as input,\nhow many distinct Markov equivalence classes have the skeleton $G$? Much work\nhas been devoted in the last few years to this and other closely related\nproblems. 
However, to the best of our knowledge, a polynomial time algorithm\nfor the problem remains unknown.\n In this paper, we make progress towards this goal by giving a fixed parameter\ntractable algorithm for the above problem, with the parameters being the\ntreewidth and the maximum degree of the input graph $G$. The main technical\ningredient in our work is a construction we refer to as shadow, which lets us\ncreate a \"local description\" of long-range constraints imposed by the\ncombinatorial characterizations of MECs.\n","authors":["Vidya Sagar Sharma"],"pdf_url":"https://arxiv.org/pdf/2310.04218v5.pdf","comment":"75 pages, 2 Figures"},{"id":"http://arxiv.org/abs/2406.00734v2","updated":"2024-07-03T04:30:01Z","published":"2024-06-02T12:51:48Z","title":"GLADformer: A Mixed Perspective for Graph-level Anomaly Detection","summary":" Graph-Level Anomaly Detection (GLAD) aims to distinguish anomalous graphs\nwithin a graph dataset. However, current methods are constrained by their\nreceptive fields, struggling to learn global features within the graphs.\nMoreover, most contemporary methods are based on spatial domain and lack\nexploration of spectral characteristics. In this paper, we propose a\nmulti-perspective hybrid graph-level anomaly detector namely GLADformer,\nconsisting of two key modules. Specifically, we first design a Graph\nTransformer module with global spectrum enhancement, which ensures balanced and\nresilient parameter distributions by fusing global features and spectral\ndistribution characteristics. Furthermore, to uncover local anomalous\nattributes, we customize a band-pass spectral GNN message passing module that\nfurther enhances the model's generalization capability. Through comprehensive\nexperiments on ten real-world datasets from multiple domains, we validate the\neffectiveness and robustness of GLADformer. This demonstrates that GLADformer\noutperforms current state-of-the-art models in graph-level anomaly detection,\nparticularly in effectively capturing global anomaly representations and\nspectral characteristics.\n","authors":["Fan Xu","Nan Wang","Hao Wu","Xuezhi Wen","Dalin Zhang","Siyang Lu","Binyong Li","Wei Gong","Hai Wan","Xibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00916v2","updated":"2024-07-03T03:42:46Z","published":"2024-07-01T02:42:27Z","title":"Learnability in Online Kernel Selection with Memory Constraint via\n Data-dependent Regret Analysis","summary":" Online kernel selection is a fundamental problem of online kernel methods. In\nthis paper, we study online kernel selection with memory constraint in which the\nmemory of kernel selection and online prediction procedures is limited to a\nfixed budget. An essential question is what is the intrinsic relationship among\nonline learnability, memory constraint, and data complexity? 
To answer the\nquestion, it is necessary to show the trade-offs between regret and memory\nconstraint. Previous work gives a worst-case lower bound depending on the data\nsize, and shows learning is impossible within a small memory constraint. In\ncontrast, we present distinct results by offering data-dependent upper bounds\nthat rely on two data complexities: kernel alignment and the cumulative losses\nof competitive hypothesis. We propose an algorithmic framework giving\ndata-dependent upper bounds for two types of loss functions. For the hinge loss\nfunction, our algorithm achieves an expected upper bound depending on kernel\nalignment. For smooth loss functions, our algorithm achieves a high-probability\nupper bound depending on the cumulative losses of competitive hypothesis. We\nalso prove a matching lower bound for smooth loss functions. Our results show\nthat if the two data complexities are sub-linear, then learning is possible\nwithin a small memory constraint. Our algorithmic framework depends on a new\nbuffer maintaining framework and a reduction from online kernel selection to\nprediction with expert advice. Finally, we empirically verify the prediction\nperformance of our algorithms on benchmark datasets.\n","authors":["Junfan Li","Shizhong Liao"],"pdf_url":"https://arxiv.org/pdf/2407.00916v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00522v5","updated":"2024-07-03T03:23:24Z","published":"2024-02-01T11:43:13Z","title":"Understanding the Expressive Power and Mechanisms of Transformer for\n Sequence Modeling","summary":" We conduct a systematic study of the approximation properties of Transformer\nfor sequence modeling with long, sparse and complicated memory. We investigate\nthe mechanisms through which different components of Transformer, such as the\ndot-product self-attention, positional encoding and feed-forward layer, affect\nits expressive power, and we study their combined effects through establishing\nexplicit approximation rates. Our study reveals the roles of critical\nparameters in the Transformer, such as the number of layers and the number of\nattention heads. These theoretical insights are validated experimentally and\noffer natural suggestions for alternative architectures.\n","authors":["Mingze Wang","Weinan E"],"pdf_url":"https://arxiv.org/pdf/2402.00522v5.pdf","comment":"70 pages"},{"id":"http://arxiv.org/abs/2407.02779v1","updated":"2024-07-03T03:10:25Z","published":"2024-07-03T03:10:25Z","title":"Croppable Knowledge Graph Embedding","summary":" Knowledge Graph Embedding (KGE) is a common method for Knowledge Graphs (KGs)\nto serve various artificial intelligence tasks. The suitable dimensions of the\nembeddings depend on the storage and computing conditions of the specific\napplication scenarios. Once a new dimension is required, a new KGE model needs\nto be trained from scratch, which greatly increases the training cost and\nlimits the efficiency and flexibility of KGE in serving various scenarios. In\nthis work, we propose a novel KGE training framework MED, through which we\ncould train once to get a croppable KGE model applicable to multiple scenarios\nwith different dimensional requirements; sub-models of the required dimensions\ncan be cropped out of it and used directly without any additional training. 
In\nMED, we propose a mutual learning mechanism to improve the low-dimensional\nsub-models performance and make the high-dimensional sub-models retain the\ncapacity that low-dimensional sub-models have, an evolutionary improvement\nmechanism to promote the high-dimensional sub-models to master the knowledge\nthat the low-dimensional sub-models can not learn, and a dynamic loss weight to\nbalance the multiple losses adaptively. Experiments on 3 KGE models over 4\nstandard KG completion datasets, 3 real application scenarios over a real-world\nlarge-scale KG, and the experiments of extending MED to the language model BERT\nshow the effectiveness, high efficiency, and flexible extensibility of MED.\n","authors":["Yushan Zhu","Wen Zhang","Zhiqiang Liu","Mingyang Chen","Lei Liang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02778v1","updated":"2024-07-03T03:10:24Z","published":"2024-07-03T03:10:24Z","title":"Foster Adaptivity and Balance in Learning with Noisy Labels","summary":" Label noise is ubiquitous in real-world scenarios, posing a practical\nchallenge to supervised models due to its effect in hurting the generalization\nperformance of deep neural networks. Existing methods primarily employ the\nsample selection paradigm and usually rely on dataset-dependent prior knowledge\n(\\eg, a pre-defined threshold) to cope with label noise, inevitably degrading\nthe adaptivity. Moreover, existing methods tend to neglect the class balance in\nselecting samples, leading to biased model performance. To this end, we propose\na simple yet effective approach named \\textbf{SED} to deal with label noise in\na \\textbf{S}elf-adaptiv\\textbf{E} and class-balance\\textbf{D} manner.\nSpecifically, we first design a novel sample selection strategy to empower\nself-adaptivity and class balance when identifying clean and noisy data. A\nmean-teacher model is then employed to correct labels of noisy samples.\nSubsequently, we propose a self-adaptive and class-balanced sample re-weighting\nmechanism to assign different weights to detected noisy samples. Finally, we\nadditionally employ consistency regularization on selected clean samples to\nimprove model generalization performance. Extensive experimental results on\nsynthetic and real-world datasets demonstrate the effectiveness and superiority\nof our proposed method. The source code has been made available at\nhttps://github.com/NUST-Machine-Intelligence-Laboratory/SED.\n","authors":["Mengmeng Sheng","Zeren Sun","Tao Chen","Shuchao Pang","Yucheng Wang","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.02778v1.pdf","comment":"accepted by the European Conference on Computer Vision (ECCV), 2024"},{"id":"http://arxiv.org/abs/2407.02775v1","updated":"2024-07-03T03:03:30Z","published":"2024-07-03T03:03:30Z","title":"MLKD-BERT: Multi-level Knowledge Distillation for Pre-trained Language\n Models","summary":" Knowledge distillation is an effective technique for pre-trained language\nmodel compression. Although existing knowledge distillation methods perform\nwell for the most typical model BERT, they could be further improved in two\naspects: the relation-level knowledge could be further explored to improve\nmodel performance; and the setting of student attention head number could be\nmore flexible to decrease inference time. Therefore, we are motivated to\npropose a novel knowledge distillation method MLKD-BERT to distill multi-level\nknowledge in teacher-student framework. 
Extensive experiments on the GLUE benchmark\nand extractive question answering tasks demonstrate that our method outperforms\nstate-of-the-art knowledge distillation methods on BERT. In addition, MLKD-BERT\ncan flexibly set student attention head number, allowing for substantial\ninference time decrease with little performance drop.\n","authors":["Ying Zhang","Ziheng Yang","Shufan Ji"],"pdf_url":"https://arxiv.org/pdf/2407.02775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02772v1","updated":"2024-07-03T03:01:43Z","published":"2024-07-03T03:01:43Z","title":"Automatic gradient descent with generalized Newton's method","summary":" We propose the generalized Newton's method (GeN) -- a Hessian-informed\napproach that applies to any optimizer such as SGD and Adam, and covers the\nNewton-Raphson method as a sub-case. Our method automatically and dynamically\nselects the learning rate that accelerates the convergence, without the\nintensive tuning of the learning rate scheduler. In practice, our method is\neasily implementable, since it only requires additional forward passes with\nalmost zero computational overhead (in terms of training time and memory cost),\nif the overhead is amortized over many iterations. We present extensive\nexperiments on language and vision tasks (e.g. GPT and ResNet) to showcase that\nGeN optimizers match the state-of-the-art performance, which was achieved with\ncarefully tuned learning rate schedulers. Code to be released at\n\\url{https://github.com/ShiyunXu/AutoGeN}.\n","authors":["Zhiqi Bu","Shiyun Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15623v3","updated":"2024-07-03T02:59:32Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocess lacks transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nrequire annotations or additional training data. The injection of the extracted\nknowledge can be achieved by the addition of simple neural modules. We employ\nthe Convex Polytopic Model (CPM) as a feature extraction tool for DST tasks and\nillustrate that the acquired features correlate with syntactic and semantic\npatterns in the dialogues. This correlation facilitates a comprehensive\nunderstanding of the linguistic features influencing the DST model's\ndecision-making process. We benchmark this framework on various DST tasks and\nobserve a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v3.pdf","comment":"Accepted for publication at IEEE Access"},{"id":"http://arxiv.org/abs/2407.02770v1","updated":"2024-07-03T02:57:40Z","published":"2024-07-03T02:57:40Z","title":"Large language models, physics-based modeling, experimental\n measurements: the trinity of data-scarce learning of polymer properties","summary":" Large language models (LLMs) bear promise as a fast and accurate material\nmodeling paradigm for evaluation, analysis, and design. Their vast number of\ntrainable parameters necessitates a wealth of data to achieve accuracy and\nmitigate overfitting. 
However, experimental measurements are often limited and\ncostly to obtain in sufficient quantities for finetuning. To this end, we\npresent a physics-based training pipeline that tackles the pathology of data\nscarcity. The core enabler is a physics-based modeling framework that generates\na multitude of synthetic data to align the LLM to a physically consistent\ninitial state before finetuning. Our framework features a two-phase training\nstrategy: (1) utilizing the large-in-amount while less accurate synthetic data\nfor supervised pretraining, and (2) finetuning the phase-1 model with limited\nexperimental data. We empirically demonstrate that supervised pretraining is\nvital to obtaining accurate finetuned LLMs, via the lens of learning polymer\nflammability metrics where cone calorimeter data is sparse.\n","authors":["Ning Liu","Siavash Jafarzadeh","Brian Y. Lattimer","Shuna Ni","Jim Lua","Yue Yu"],"pdf_url":"https://arxiv.org/pdf/2407.02770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13544v2","updated":"2024-07-03T02:53:47Z","published":"2024-06-19T13:30:17Z","title":"One Fits All: Learning Fair Graph Neural Networks for Various Sensitive\n Attributes","summary":" Recent studies have highlighted fairness issues in Graph Neural Networks\n(GNNs), where they produce discriminatory predictions against specific\nprotected groups categorized by sensitive attributes such as race and age.\nWhile various efforts to enhance GNN fairness have made significant progress,\nthese approaches are often tailored to specific sensitive attributes.\nConsequently, they necessitate retraining the model from scratch to accommodate\nchanges in the sensitive attribute requirement, resulting in high computational\ncosts. To gain deeper insights into this issue, we approach the graph fairness\nproblem from a causal modeling perspective, where we identify the confounding\neffect induced by the sensitive attribute as the underlying reason. Motivated\nby this observation, we formulate the fairness problem in graphs from an\ninvariant learning perspective, which aims to learn invariant representations\nacross environments. Accordingly, we propose a graph fairness framework based\non invariant learning, namely FairINV, which enables the training of fair GNNs\nto accommodate various sensitive attributes within a single training session.\nSpecifically, FairINV incorporates sensitive attribute partition and trains\nfair GNNs by eliminating spurious correlations between the label and various\nsensitive attributes. Experimental results on several real-world datasets\ndemonstrate that FairINV significantly outperforms state-of-the-art fairness\napproaches, underscoring its effectiveness. Our code is available via:\nhttps://github.com/ZzoomD/FairINV/.\n","authors":["Yuchang Zhu","Jintang Li","Yatao Bian","Zibin Zheng","Liang Chen"],"pdf_url":"https://arxiv.org/pdf/2406.13544v2.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2407.02762v1","updated":"2024-07-03T02:40:39Z","published":"2024-07-03T02:40:39Z","title":"SF-GNN: Self Filter for Message Lossless Propagation in Deep Graph\n Neural Network","summary":" Graph Neural Network (GNN), with the main idea of encoding graph structure\ninformation of graphs by propagation and aggregation, has developed rapidly. It\nachieved excellent performance in representation learning of multiple types of\ngraphs such as homogeneous graphs, heterogeneous graphs, and more complex\ngraphs like knowledge graphs. 
However, merely stacking GNN layers may not\nimprove the model's performance and can even be detrimental. For the phenomenon\nof performance degradation in deep GNNs, we propose a new perspective. Unlike\nthe popular explanations of over-smoothing or over-squashing, we think the\nissue arises from the interference of low-quality node representations during\nmessage propagation. We introduce a simple and general method, SF-GNN, to\naddress this problem. In SF-GNN, we define two representations for each node,\none is the node representation that represents the feature of the node itself,\nand the other is the message representation specifically for propagating\nmessages to neighbor nodes. A self-filter module evaluates the quality of the\nnode representation and decides whether to integrate it into the message\npropagation based on this quality assessment. Experiments on node\nclassification tasks for both homogeneous and heterogeneous graphs, as well as\nlink prediction tasks on knowledge graphs, demonstrate that our method can be\napplied to various GNN models and outperforms state-of-the-art baseline methods\nin addressing deep GNN degradation.\n","authors":["Yushan Zhu","Wen Zhang","Yajing Xu","Zhen Yao","Mingyang Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2407.02762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04673v4","updated":"2024-07-03T02:38:03Z","published":"2023-10-07T03:17:59Z","title":"LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT","summary":" Generative Pre-trained Transformer (GPT) models have achieved remarkable\nperformance on various natural language processing tasks, and have shown great\npotential as backbones for audio-and-text large language models (LLMs).\nPrevious mainstream audio-and-text LLMs use discrete audio tokens to represent\nboth input and output audio; however, they suffer from performance degradation\non tasks such as automatic speech recognition, speech-to-text translation, and\nspeech enhancement over models using continuous speech features. In this paper,\nwe propose LauraGPT, a novel unified audio-and-text GPT-based LLM for audio\nrecognition, understanding, and generation. LauraGPT is a versatile LLM that\ncan process both audio and text inputs and generate outputs in either\nmodalities. We propose a novel data representation that combines continuous and\ndiscrete features for audio: LauraGPT encodes input audio into continuous\nrepresentations using an audio encoder and generates output audio from discrete\ncodec codes. We propose a one-step codec vocoder to overcome the prediction\nchallenge caused by the multimodal distribution of codec tokens. We fine-tune\nLauraGPT using supervised multi-task learning. 
Extensive experiments show that\nLauraGPT consistently achieves comparable to superior performance compared to\nstrong baselines on a wide range of audio tasks related to content, semantics,\nparalinguistics, and audio-signal analysis, such as automatic speech\nrecognition, speech-to-text translation, text-to-speech synthesis, speech\nenhancement, automated audio captioning, speech emotion recognition, and spoken\nlanguage understanding.\n","authors":["Zhihao Du","Jiaming Wang","Qian Chen","Yunfei Chu","Zhifu Gao","Zerui Li","Kai Hu","Xiaohuan Zhou","Jin Xu","Ziyang Ma","Wen Wang","Siqi Zheng","Chang Zhou","Zhijie Yan","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.04673v4.pdf","comment":"10 pages, work in progress"},{"id":"http://arxiv.org/abs/2407.02759v1","updated":"2024-07-03T02:33:20Z","published":"2024-07-03T02:33:20Z","title":"Multi-Scenario Combination Based on Multi-Agent Reinforcement Learning\n to Optimize the Advertising Recommendation System","summary":" This paper explores multi-scenario optimization on large platforms using\nmulti-agent reinforcement learning (MARL). We address this by treating\nscenarios like search, recommendation, and advertising as a cooperative,\npartially observable multi-agent decision problem. We introduce the Multi-Agent\nRecurrent Deterministic Policy Gradient (MARDPG) algorithm, which aligns\ndifferent scenarios under a shared objective and allows for strategy\ncommunication to boost overall performance. Our results show marked\nimprovements in metrics such as click-through rate (CTR), conversion rate, and\ntotal sales, confirming our method's efficacy in practical settings.\n","authors":["Yang Zhao","Chang Zhou","Jin Cao","Yi Zhao","Shaobo Liu","Chiyu Cheng","Xingchen Li"],"pdf_url":"https://arxiv.org/pdf/2407.02759v1.pdf","comment":"Accepted by 2024 5th International Conference on Artificial\n Intelligence and Electromechanical Automation IEEE (ISBN: 979-8-3503-6617-4)"},{"id":"http://arxiv.org/abs/2407.02758v1","updated":"2024-07-03T02:23:33Z","published":"2024-07-03T02:23:33Z","title":"Differential Encoding for Improved Representation Learning over Graphs","summary":" Combining the message-passing paradigm with the global attention mechanism\nhas emerged as an effective framework for learning over graphs. The\nmessage-passing paradigm and the global attention mechanism fundamentally\ngenerate node embeddings based on information aggregated from a node's local\nneighborhood or from the whole graph. The most basic and commonly used\naggregation approach is to take the sum of information from a node's local\nneighbourhood or from the whole graph. However, it is unknown if the dominant\ninformation is from a node itself or from the node's neighbours (or the rest of\nthe graph nodes). Therefore, information is lost at each layer of\nembedding generation, and this loss can accumulate and become\nmore serious when more layers are used in the model. In this paper, we present\na differential encoding method to address the issue of information loss. The\nidea of our method is to encode the differential representation between the\ninformation from a node's neighbours (or the rest of the graph nodes) and that\nfrom the node itself. The obtained differential encoding is then combined with\nthe original aggregated local or global representation to generate the updated\nnode embedding. By integrating differential encodings, the representational\nability of generated node embeddings is improved. 
The differential encoding\nmethod is empirically evaluated on different graph tasks on seven benchmark\ndatasets. The results show that it is a general method that improves the\nmessage-passing update and the global attention update, advancing the\nstate-of-the-art performance for graph representation learning on these\ndatasets.\n","authors":["Haimin Zhang","Jiahao Xia","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2407.02758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00115v2","updated":"2024-07-03T02:17:26Z","published":"2024-06-27T14:00:05Z","title":"Instance Temperature Knowledge Distillation","summary":" Knowledge distillation (KD) enhances the performance of a student network by\nallowing it to learn the knowledge transferred from a teacher network\nincrementally. Existing methods dynamically adjust the temperature to enable\nthe student network to adapt to the varying learning difficulties at different\nlearning stages of KD. KD is a continuous process, but when adjusting the\ntemperature, these methods consider only the immediate benefits of the\noperation in the current learning phase and fail to take into account its\nfuture returns. To address this issue, we formulate the adjustment of\ntemperature as a sequential decision-making task and propose a method based on\nreinforcement learning, termed RLKD. Importantly, we design a novel state\nrepresentation to enable the agent to make more informed actions (i.e., instance\ntemperature adjustment). To handle the problem of delayed rewards in our method\ndue to the KD setting, we explore an instance reward calibration approach. In\naddition, we devise an efficient exploration strategy that enables the agent to\nlearn a valuable instance temperature adjustment policy more efficiently. Our\nframework can serve as a plug-and-play technique to be inserted into various KD\nmethods easily, and we validate its effectiveness on both image classification\nand object detection tasks. Our project is at\nhttps://www.zayx.me/ITKD.github.io/.\n","authors":["Zhengbo Zhang","Yuxi Zhou","Jia Gong","Jun Liu","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2407.00115v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13937v5","updated":"2024-07-03T02:06:07Z","published":"2024-05-22T19:10:24Z","title":"DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs","summary":" Dynamic graphs are pervasive in the real world, modeling dynamic relations\nbetween objects across various fields. For dynamic graph modeling, dynamic\ngraph neural networks (DGNNs) have emerged as a mainstream technique, which are\ngenerally pre-trained on the link prediction task, leaving a significant gap\nfrom the objectives of downstream tasks such as node classification. To bridge\nthe gap, prompt-based learning has gained traction on graphs. However, existing\nefforts focus on static graphs, neglecting the evolution of dynamic graphs. In\nthis paper, we propose DyGPrompt, a novel pre-training and prompting framework\nfor dynamic graph modeling. First, we design dual prompts to address the gap in\nboth task objectives and dynamic variations across pre-training and downstream\ntasks. Second, we recognize that node and time features mutually characterize\neach other, and propose dual condition-nets to model the evolving node-time\npatterns in downstream tasks. 
Finally, we thoroughly evaluate and analyze\nDyGPrompt through extensive experiments on three public datasets.\n","authors":["Xingtong Yu","Zhenghao Liu","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.13937v5.pdf","comment":"Under review"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.03188v1","updated":"2024-07-03T15:12:36Z","published":"2024-07-03T15:12:36Z","title":"MuDiT & MuSiT: Alignment with Colloquial Expression in\n Description-to-Song Generation","summary":" Amid the rising intersection of generative AI and human artistic processes,\nthis study probes the critical yet less-explored terrain of alignment in\nhuman-centric automatic song composition. We propose a novel task of Colloquial\nDescription-to-Song Generation, which focuses on aligning the generated content\nwith colloquial human expressions. This task is aimed at bridging the gap\nbetween colloquial language understanding and auditory expression within an AI\nmodel, with the ultimate goal of creating songs that accurately satisfy human\nauditory expectations and structurally align with musical norms. Current\ndatasets are limited due to their narrow descriptive scope, semantic gaps and\ninaccuracies. To overcome data scarcity in this domain, we present the Caichong\nMusic Dataset (CaiMD). CaiMD is manually annotated by both professional\nmusicians and amateurs, offering diverse perspectives and a comprehensive\nunderstanding of colloquial descriptions. Unlike existing datasets pre-set with\nexpert annotations or auto-generated ones with inherent biases, CaiMD caters\nmore sufficiently to our purpose of aligning AI-generated music with widespread\nuser-desired results. Moreover, we propose an innovative single-stage framework\ncalled MuDiT/MuSiT for enabling effective human-machine alignment in song\ncreation. This framework not only achieves cross-modal comprehension between\ncolloquial language and auditory music perceptions but also ensures generated\nsongs align with user-desired results. MuDiT/MuSiT employs one DiT/SiT model\nfor end-to-end generation of musical components like melody, harmony, rhythm,\nvocals, and instrumentation. The approach ensures harmonious sonic cohesiveness\namongst all generated musical components, facilitating better resonance with\nhuman auditory expectations.\n","authors":["Zihao Wang","Haoxuan Liu","Jiaxing Yu","Tao Zhang","Yan Liu","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03188v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.03178v1","updated":"2024-07-03T14:58:40Z","published":"2024-07-03T14:58:40Z","title":"Relating CNN-Transformer Fusion Network for Change Detection","summary":" While deep learning, particularly convolutional neural networks (CNNs), has\nrevolutionized remote sensing (RS) change detection (CD), existing approaches\noften miss crucial features due to neglecting global context and incomplete\nchange learning. Additionally, transformer networks struggle with low-level\ndetails. 
RCTNet addresses these limitations by introducing \textbf{(1)} an\nearly fusion backbone to exploit both spatial and temporal features early on,\n\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal\nrepresentation, \textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for\nenriched feature extraction in the decoder, and \textbf{(4)} an Efficient\nSelf-deciphering Attention (ESA) module utilizing transformers to capture\nglobal information and fine-grained details for accurate change detection.\nExtensive experiments demonstrate RCTNet's clear superiority over traditional\nRS image CD methods, showing significant improvement and an optimal balance\nbetween accuracy and computational cost.\n","authors":["Yuhao Gao","Gensheng Pei","Mengmeng Sheng","Zeren Sun","Tao Chen","Yazhou Yao"],"pdf_url":"https://arxiv.org/pdf/2407.03178v1.pdf","comment":"accepted by IEEE Conference on Multimedia Expo"},{"id":"http://arxiv.org/abs/2407.03107v1","updated":"2024-07-03T13:46:20Z","published":"2024-07-03T13:46:20Z","title":"Design of a UE5-based digital twin platform","summary":" Because the learning and building cost of current mainstream 3D scene engines\nis too high, this thesis proposes a digital twin platform design scheme based\non Unreal Engine 5 (UE5). It aims to provide a universal platform construction\ndesign process to effectively reduce the learning cost of large-scale scene\nconstruction. Taking an actual project of a unit as an example, the overall\ncycle work of platform building is explained, and the digital twin and data\nvisualization technologies and applications based on UE5 are analyzed. By\nsummarizing the project implementation into a process approach, the\nstandardization and operability of the process pathway are improved.\n","authors":["Shaoqiu Lyu","Muzhi Wang","Sunrui Zhang","Shengzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03104v1","updated":"2024-07-03T13:41:44Z","published":"2024-07-03T13:41:44Z","title":"KeyVideoLLM: Towards Large-scale Video Keyframe Selection","summary":" Recently, with the rise of web videos, managing and understanding large-scale\nvideo datasets has become increasingly important. Video Large Language Models\n(VideoLLMs) have emerged in recent years due to their strong video\nunderstanding capabilities. However, training and inference processes for\nVideoLLMs demand vast amounts of data, presenting significant challenges to\ndata management, particularly regarding efficiency, robustness, and\neffectiveness. In this work, we present KeyVideoLLM, a text-video frame\nsimilarity-based keyframe selection method designed to manage VideoLLM data\nefficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a\nremarkable data compression rate of up to 60.9 times, substantially lowering\ndisk space requirements, which proves its high efficiency. Additionally, it\nmaintains a 100% selection success rate across all video formats and scales,\nenhances processing speed by up to 200 times compared to existing keyframe\nselection methods, and does not require hyperparameter tuning. Beyond its\noutstanding efficiency and robustness, KeyVideoLLM further improves model\nperformance in video question-answering tasks during both training and\ninference stages. 
Notably, it consistently achieved the state-of-the-art (SoTA)\nexperimental results on diverse datasets.\n","authors":["Hao Liang","Jiapeng Li","Tianyi Bai","Chong Chen","Conghui He","Bin Cui","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03027v1","updated":"2024-07-03T11:36:58Z","published":"2024-07-03T11:36:58Z","title":"Differentially Processed Optimized Collaborative Rich Text Editor","summary":" A collaborative real-time text editor is an application that allows multiple\nusers to edit a document simultaneously and merge their contributions\nautomatically. It can be made collaborative by implementing a conflict\nresolution algorithm either on the client side (in peer-to-peer collaboration)\nor on the server side (when using web sockets and a central server to monitor\nstate changes). Although web sockets are ideal for real-time text editors,\nusing multiple collaborative editors on one connection can create problems.\nThis is because a single web connection cannot monitor which user is\ncollaborating on which application state, leading to unnecessary network\nqueries and data being delivered to the wrong state. To address this issue, the\ncurrent solution is to open multiple web socket connections, with one web\nsocket per collaboration application. However, this can add significant\noverhead proportional to the number of apps utilized. In this study, we\ndemonstrate an algorithm that enables using a single web socket for multiple\ncollaborative applications in a collaborative editor. Our method involves\nmodifying the socket's code to track which application's shared state is being\nworked on and by whom. This allows for the simultaneous collaboration of\nmultiple states in real-time, with infinite users, without opening a different\nsocket for each application. Our optimized editor showed an efficiency\nimprovement of over 96% in access time duration. This approach can be\nimplemented in other collaborative editors and web applications with similar\narchitecture to improve performance and eliminate issues arising from network\noverload.\n","authors":["Nishtha Jatana","Mansehej Singh","Charu Gupta","Geetika Dhand","Shaily Malik","Pankaj Dadheech","Nagender Aneja","Sandhya Aneja"],"pdf_url":"https://arxiv.org/pdf/2407.03027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00621v2","updated":"2024-07-03T09:53:45Z","published":"2024-03-31T09:20:30Z","title":"Multimodal Pretraining, Adaptation, and Generation for Recommendation: A\n Survey","summary":" Personalized recommendation serves as a ubiquitous channel for users to\ndiscover information tailored to their interests. However, traditional\nrecommendation models primarily rely on unique IDs and categorical features for\nuser-item matching, potentially overlooking the nuanced essence of raw item\ncontents across multiple modalities such as text, image, audio, and video. This\nunderutilization of multimodal data poses a limitation to recommender systems,\nespecially in multimedia services like news, music, and short-video platforms.\nThe recent advancements in large multimodal models offer new opportunities and\nchallenges in developing content-aware recommender systems. This survey seeks\nto provide a comprehensive exploration of the latest advancements and future\ntrajectories in multimodal pretraining, adaptation, and generation techniques,\nas well as their applications in enhancing recommender systems. 
Furthermore, we\ndiscuss current open challenges and opportunities for future research in this\ndynamic domain. We believe that this survey, alongside the curated resources,\nwill provide valuable insights to inspire further advancements in this evolving\nlandscape.\n","authors":["Qijiong Liu","Jieming Zhu","Yanting Yang","Quanyu Dai","Zhaocheng Du","Xiao-Ming Wu","Zhou Zhao","Rui Zhang","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2404.00621v2.pdf","comment":"Accepted by KDD 2024. See our tutorial materials at\n https://mmrec.github.io"},{"id":"http://arxiv.org/abs/2407.02867v1","updated":"2024-07-03T07:31:33Z","published":"2024-07-03T07:31:33Z","title":"Contrast then Memorize: Semantic Neighbor Retrieval-Enhanced Inductive\n Multimodal Knowledge Graph Completion","summary":" A large number of studies have emerged for Multimodal Knowledge Graph\nCompletion (MKGC) to predict the missing links in MKGs. However, fewer studies\nhave been proposed to study the inductive MKGC (IMKGC) involving emerging\nentities unseen during training. Existing inductive approaches focus on\nlearning textual entity representations, which neglect rich semantic\ninformation in the visual modality. Moreover, they focus on aggregating structural\nneighbors from existing KGs, which are usually limited for emerging entities.\nHowever, the semantic neighbors are decoupled from the topology linkage and\nusually imply the true target entity. In this paper, we propose the IMKGC task\nand a semantic neighbor retrieval-enhanced IMKGC framework CMR, where the\ncontrast step brings helpful semantic neighbors close, and the memorize step\nthen supports semantic neighbor retrieval to enhance inference. Specifically, we\nfirst propose a unified cross-modal contrastive learning objective to simultaneously\ncapture the textual-visual and textual-textual correlations of query-entity\npairs in a unified representation space. The contrastive learning increases the\nsimilarity of positive query-entity pairs, therefore making the representations\nof helpful semantic neighbors close. Then, we explicitly memorize the knowledge\nrepresentations to support the semantic neighbor retrieval. At test time, we\nretrieve the nearest semantic neighbors and interpolate them to the\nquery-entity similarity distribution to augment the final prediction. Extensive\nexperiments validate the effectiveness of CMR on three inductive MKGC datasets.\nCodes are available at https://github.com/OreOZhao/CMR.\n","authors":["Yu Zhao","Ying Zhang","Baohang Zhou","Xinying Qian","Kehui Song","Xiangrui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.02867v1.pdf","comment":"Accepted by SIGIR 2024"},{"id":"http://arxiv.org/abs/2407.02798v1","updated":"2024-07-03T03:58:22Z","published":"2024-07-03T03:58:22Z","title":"Game-Based Discovery: Harnessing Mini-Games within Primary Games for\n Scientific Data Collection and Problem Solving","summary":" In the popular video game Batman: Arkham Knight, produced by Rocksteady\nStudios and released in 2015, the primary protagonist of the game is Batman, a\nvigilante dressed as a bat, fighting crime from the shadows in the fictitious\ncity of Gotham. The game involves a real-world player who takes up the role of\nBatman to solve a peculiar side mission wherein they have to reconstruct the\nclean DNA sequence of a human and separate it from mutant DNA to manufacture an\nantidote to cure the villain. 
Although this is undoubtedly a fascinating part\nof the game, one that was absent in previous Batman games, it showcases an\ninteresting notion of using mini-games embedded within primary games to achieve\na particular real-world research objective. Although the DNA data used in this\ncase was not real, there are multiple such instances in video games where\nmini-games have been used for an underlying motive besides entertainment. Based\non popular case studies incorporating a similar method, this study\ncharacterizes the methodology of designing mini-games within primary games for\nresearch purposes into a descriptive framework, highlighting the process's\nadvantages and limitations. It is concluded that these mini-games not only\nfacilitate a deeper understanding of complex scientific concepts but also\naccelerate data processing and analysis by leveraging crowd-sourced human\nintuition and pattern recognition capabilities. This paper argues for\nstrategically incorporating miniaturized, gamified elements into established\nvideo games that are mainly intended for recreational purposes.\n","authors":["Abhishek Phadke","Mamta Yadav","Stanislav Ustymenko"],"pdf_url":"https://arxiv.org/pdf/2407.02798v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.02411v2","updated":"2024-07-03T03:48:18Z","published":"2024-07-02T16:34:14Z","title":"Video Watermarking: Safeguarding Your Video from (Unauthorized)\n Annotations by Video-based LLMs","summary":" The advent of video-based Large Language Models (LLMs) has significantly\nenhanced video understanding. However, it has also raised some safety concerns\nregarding data protection, as videos can be more easily annotated, even without\nauthorization. This paper introduces Video Watermarking, a novel technique to\nprotect videos from unauthorized annotations by such video-based LLMs,\nespecially concerning the video content and description, in response to\nspecific queries. By imperceptibly embedding watermarks into key video frames\nwith multi-modal flow-based losses, our method preserves the viewing experience\nwhile preventing misuse by video-based LLMs. Extensive experiments show that\nVideo Watermarking significantly reduces the comprehensibility of videos with\nvarious video-based LLMs, demonstrating both stealth and robustness. In\nessence, our method provides a solution for securing video content, ensuring\nits integrity and confidentiality in the face of evolving video-based LLMs\ntechnologies.\n","authors":["Jinmin Li","Kuofeng Gao","Yang Bai","Jingyun Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2407.02411v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2403.13507"},{"id":"http://arxiv.org/abs/2407.02773v1","updated":"2024-07-03T03:02:59Z","published":"2024-07-03T03:02:59Z","title":"OpenVNA: A Framework for Analyzing the Behavior of Multimodal Language\n Understanding System under Noisy Scenarios","summary":" We present OpenVNA, an open-source framework designed for analyzing the\nbehavior of multimodal language understanding systems under noisy conditions.\nOpenVNA serves as an intuitive toolkit tailored for researchers, facilitating\nconvenience batch-level robustness evaluation and on-the-fly instance-level\ndemonstration. It primarily features a benchmark Python library for assessing\nglobal model robustness, offering high flexibility and extensibility, thereby\nenabling customization with user-defined noise types and models. 
Additionally,\na GUI-based interface has been developed to intuitively analyze local model\nbehavior. In this paper, we delineate the design principles and utilization of\nthe created library and GUI-based web platform. Currently, OpenVNA is publicly\naccessible at \\url{https://github.com/thuiar/OpenVNA}, with a demonstration\nvideo available at \\url{https://youtu.be/0Z9cW7RGct4}.\n","authors":["Ziqi Yuan","Baozheng Zhang","Hua Xu","Zhiyun Liang","Kai Gao"],"pdf_url":"https://arxiv.org/pdf/2407.02773v1.pdf","comment":"10 pages, 4 figures, to be published in ACL 2024 System Demonstration\n Track"},{"id":"http://arxiv.org/abs/2402.18844v2","updated":"2024-07-03T02:52:33Z","published":"2024-02-29T04:30:39Z","title":"Deep learning for 3D human pose estimation and mesh recovery: A survey","summary":" 3D human pose estimation and mesh recovery have attracted widespread research\ninterest in many areas, such as computer vision, autonomous driving, and\nrobotics. Deep learning on 3D human pose estimation and mesh recovery has\nrecently thrived, with numerous methods proposed to address different problems\nin this area. In this paper, to stimulate future research, we present a\ncomprehensive review of recent progress over the past five years in deep\nlearning methods for this area by delving into over 200 references. To the best\nof our knowledge, this survey is arguably the first to comprehensively cover\ndeep learning methods for 3D human pose estimation, including both\nsingle-person and multi-person approaches, as well as human mesh recovery,\nencompassing methods based on explicit models and implicit representations. We\nalso present comparative results on several publicly available datasets,\ntogether with insightful observations and inspiring future research directions.\nA regularly updated project page can be found at\nhttps://github.com/liuyangme/SOTA-3DHPE-HMR.\n","authors":["Yang Liu","Changzhen Qiu","Zhiyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.18844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04673v4","updated":"2024-07-03T02:38:03Z","published":"2023-10-07T03:17:59Z","title":"LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT","summary":" Generative Pre-trained Transformer (GPT) models have achieved remarkable\nperformance on various natural language processing tasks, and have shown great\npotential as backbones for audio-and-text large language models (LLMs).\nPrevious mainstream audio-and-text LLMs use discrete audio tokens to represent\nboth input and output audio; however, they suffer from performance degradation\non tasks such as automatic speech recognition, speech-to-text translation, and\nspeech enhancement over models using continuous speech features. In this paper,\nwe propose LauraGPT, a novel unified audio-and-text GPT-based LLM for audio\nrecognition, understanding, and generation. LauraGPT is a versatile LLM that\ncan process both audio and text inputs and generate outputs in either\nmodalities. We propose a novel data representation that combines continuous and\ndiscrete features for audio: LauraGPT encodes input audio into continuous\nrepresentations using an audio encoder and generates output audio from discrete\ncodec codes. We propose a one-step codec vocoder to overcome the prediction\nchallenge caused by the multimodal distribution of codec tokens. We fine-tune\nLauraGPT using supervised multi-task learning. 
Extensive experiments show that\nLauraGPT consistently achieves comparable to superior performance compared to\nstrong baselines on a wide range of audio tasks related to content, semantics,\nparalinguistics, and audio-signal analysis, such as automatic speech\nrecognition, speech-to-text translation, text-to-speech synthesis, speech\nenhancement, automated audio captioning, speech emotion recognition, and spoken\nlanguage understanding.\n","authors":["Zhihao Du","Jiaming Wang","Qian Chen","Yunfei Chu","Zhifu Gao","Zerui Li","Kai Hu","Xiaohuan Zhou","Jin Xu","Ziyang Ma","Wen Wang","Siqi Zheng","Chang Zhou","Zhijie Yan","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.04673v4.pdf","comment":"10 pages, work in progress"},{"id":"http://arxiv.org/abs/2407.03426v1","updated":"2024-07-03T18:09:25Z","published":"2024-07-03T18:09:25Z","title":"Multi-Task Decision-Making for Multi-User 360 Video Processing over\n Wireless Networks","summary":" We study a multi-task decision-making problem for 360 video processing in a\nwireless multi-user virtual reality (VR) system that includes an edge computing\nunit (ECU) to deliver 360 videos to VR users and offer computing assistance for\ndecoding/rendering of video frames. However, this comes at the expense of\nincreased data volume and required bandwidth. To balance this trade-off, we\nformulate a constrained quality of experience (QoE) maximization problem in\nwhich the rebuffering time and quality variation between video frames are\nbounded by user and video requirements. To solve the formulated multi-user QoE\nmaximization, we leverage deep reinforcement learning (DRL) for multi-task rate\nadaptation and computation distribution (MTRC). The proposed MTRC approach does\nnot rely on any predefined assumption about the environment and relies on video\nplayback statistics (i.e., past throughput, decoding time, transmission time,\netc.), video information, and the resulting performance to adjust the video\nbitrate and computation distribution. We train MTRC with real-world wireless\nnetwork traces and 360 video datasets to obtain evaluation results in terms of\nthe average QoE, peak signal-to-noise ratio (PSNR), rebuffering time, and\nquality variation. Our results indicate that the MTRC improves the users' QoE\ncompared to state-of-the-art rate adaptation algorithm. Specifically, we show a\n5.97 dB to 6.44 dB improvement in PSNR, a 1.66X to 4.23X improvement in\nrebuffering time, and a 4.21 dB to 4.35 dB improvement in quality variation.\n","authors":["Babak Badnava","Jacob Chakareski","Morteza Hashemi"],"pdf_url":"https://arxiv.org/pdf/2407.03426v1.pdf","comment":"2024 IEEE International Conference on Multimedia Information\n Processing and Retrieval (MIPR)"}]},"2024-07-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2407.04694v1","updated":"2024-07-05T17:57:02Z","published":"2024-07-05T17:57:02Z","title":"Me, Myself, and AI: The Situational Awareness Dataset (SAD) for LLMs","summary":" AI assistants such as ChatGPT are trained to respond to users by saying, \"I\nam a large language model\". This raises questions. Do such models know that\nthey are LLMs and reliably act on this knowledge? Are they aware of their\ncurrent circumstances, such as being deployed to the public? We refer to a\nmodel's knowledge of itself and its circumstances as situational awareness. To\nquantify situational awareness in LLMs, we introduce a range of behavioral\ntests, based on question answering and instruction following. 
These tests form\nthe $\\textbf{Situational Awareness Dataset (SAD)}$, a benchmark comprising 7\ntask categories and over 13,000 questions. The benchmark tests numerous\nabilities, including the capacity of LLMs to (i) recognize their own generated\ntext, (ii) predict their own behavior, (iii) determine whether a prompt is from\ninternal evaluation or real-world deployment, and (iv) follow instructions that\ndepend on self-knowledge.\n We evaluate 16 LLMs on SAD, including both base (pretrained) and chat models.\nWhile all models perform better than chance, even the highest-scoring model\n(Claude 3 Opus) is far from a human baseline on certain tasks. We also observe\nthat performance on SAD is only partially predicted by metrics of general\nknowledge (e.g. MMLU). Chat models, which are finetuned to serve as AI\nassistants, outperform their corresponding base models on SAD but not on\ngeneral knowledge tasks. The purpose of SAD is to facilitate scientific\nunderstanding of situational awareness in LLMs by breaking it down into\nquantitative abilities. Situational awareness is important because it enhances\na model's capacity for autonomous planning and action. While this has potential\nbenefits for automation, it also introduces novel risks related to AI safety\nand control. Code and latest results available at\nhttps://situational-awareness-dataset.org .\n","authors":["Rudolf Laine","Bilal Chughtai","Jan Betley","Kaivalya Hariharan","Jeremy Scheurer","Mikita Balesni","Marius Hobbhahn","Alexander Meinke","Owain Evans"],"pdf_url":"https://arxiv.org/pdf/2407.04694v1.pdf","comment":"11 page main body, 98 page appendix, 58 figures"},{"id":"http://arxiv.org/abs/2407.04693v1","updated":"2024-07-05T17:56:38Z","published":"2024-07-05T17:56:38Z","title":"ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language\n Models","summary":" Large language models (LLMs) exhibit hallucinations in long-form\nquestion-answering tasks across various domains and wide applications. Current\nhallucination detection and mitigation datasets are limited in domains and\nsizes, which struggle to scale due to prohibitive labor costs and insufficient\nreliability of existing hallucination annotators. To facilitate the scalable\noversight of LLM hallucinations, this paper introduces an iterative\nself-training framework that simultaneously and progressively scales up the\nhallucination annotation dataset and improves the accuracy of the hallucination\nannotator. Based on the Expectation Maximization (EM) algorithm, in each\niteration, the framework first applies a hallucination annotation pipeline to\nannotate a scaled dataset and then trains a more accurate hallucination\nannotator on the dataset. This new hallucination annotator is adopted in the\nhallucination annotation pipeline used for the next iteration. Extensive\nexperimental results demonstrate that the finally obtained hallucination\nannotator with only 7B parameters surpasses the performance of GPT-4 and\nobtains new state-of-the-art hallucination detection results on HaluEval and\nHalluQA by zero-shot inference. 
Such an annotator can not only evaluate the\nhallucination levels of various LLMs on the large-scale dataset but also help\nto mitigate the hallucination of LLMs generations, with the Natural Language\nInference (NLI) metric increasing from 25% to 37% on HaluEval.\n","authors":["Yuzhe Gu","Ziwei Ji","Wenwei Zhang","Chengqi Lyu","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04693v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2407.04690v1","updated":"2024-07-05T17:53:03Z","published":"2024-07-05T17:53:03Z","title":"Missed Causes and Ambiguous Effects: Counterfactuals Pose Challenges for\n Interpreting Neural Networks","summary":" Interpretability research takes counterfactual theories of causality for\ngranted. Most causal methods rely on counterfactual interventions to inputs or\nthe activations of particular model components, followed by observations of the\nchange in models' output logits or behaviors. While this yields more faithful\nevidence than correlational methods, counterfactuals nonetheless have key\nproblems that bias our findings in specific and predictable ways. Specifically,\n(i) counterfactual theories do not effectively capture multiple independently\nsufficient causes of the same effect, which leads us to miss certain causes\nentirely; and (ii) counterfactual dependencies in neural networks are generally\nnot transitive, which complicates methods for extracting and interpreting\ncausal graphs from neural networks. We discuss the implications of these\nchallenges for interpretability researchers and propose concrete suggestions\nfor future work.\n","authors":["Aaron Mueller"],"pdf_url":"https://arxiv.org/pdf/2407.04690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04681v1","updated":"2024-07-05T17:43:30Z","published":"2024-07-05T17:43:30Z","title":"Rethinking Visual Prompting for Multimodal Large Language Models with\n External Knowledge","summary":" In recent years, multimodal large language models (MLLMs) have made\nsignificant strides by training on vast high-quality image-text datasets,\nenabling them to generally understand images well. However, the inherent\ndifficulty in explicitly conveying fine-grained or spatially dense information\nin text, such as masks, poses a challenge for MLLMs, limiting their ability to\nanswer questions requiring an understanding of detailed or localized visual\nelements. Drawing inspiration from the Retrieval-Augmented Generation (RAG)\nconcept, this paper proposes a new visual prompt approach to integrate\nfine-grained external knowledge, gleaned from specialized vision models (e.g.,\ninstance segmentation/OCR models), into MLLMs. This is a promising yet\nunderexplored direction for enhancing MLLMs' performance. Our approach diverges\nfrom concurrent works, which transform external knowledge into additional text\nprompts, necessitating the model to indirectly learn the correspondence between\nvisual content and text coordinates. 
Instead, we propose embedding fine-grained\nknowledge information directly into a spatial embedding map as a visual prompt.\nThis design can be effortlessly incorporated into various MLLMs, such as LLaVA\nand Mipha, considerably improving their visual understanding performance.\nThrough rigorous experiments, we demonstrate that our method can enhance MLLM\nperformance across nine benchmarks, amplifying their fine-grained context-aware\ncapabilities.\n","authors":["Yuanze Lin","Yunsheng Li","Dongdong Chen","Weijian Xu","Ronald Clark","Philip Torr","Lu Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.04681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04680v1","updated":"2024-07-05T17:43:16Z","published":"2024-07-05T17:43:16Z","title":"Lost in Translation: The Algorithmic Gap Between LMs and the Brain","summary":" Language Models (LMs) have achieved impressive performance on various\nlinguistic tasks, but their relationship to human language processing in the\nbrain remains unclear. This paper examines the gaps and overlaps between LMs\nand the brain at different levels of analysis, emphasizing the importance of\nlooking beyond input-output behavior to examine and compare the internal\nprocesses of these systems. We discuss how insights from neuroscience, such as\nsparsity, modularity, internal states, and interactive learning, can inform the\ndevelopment of more biologically plausible language models. Furthermore, we\nexplore the role of scaling laws in bridging the gap between LMs and human\ncognition, highlighting the need for efficiency constraints analogous to those\nin biological systems. By developing LMs that more closely mimic brain\nfunction, we aim to advance both artificial intelligence and our understanding\nof human cognition.\n","authors":["Tommaso Tosato","Pascal Jr Tikeng Notsawo","Saskia Helbling","Irina Rish","Guillaume Dumas"],"pdf_url":"https://arxiv.org/pdf/2407.04680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13684v3","updated":"2024-07-05T17:19:52Z","published":"2023-05-23T04:44:26Z","title":"mPLM-Sim: Better Cross-Lingual Similarity and Transfer in Multilingual\n Pretrained Language Models","summary":" Recent multilingual pretrained language models (mPLMs) have been shown to\nencode strong language-specific signals, which are not explicitly provided\nduring pretraining. It remains an open question whether it is feasible to\nemploy mPLMs to measure language similarity, and subsequently use the\nsimilarity results to select source languages for boosting cross-lingual\ntransfer. To investigate this, we propose mPLMSim, a language similarity\nmeasure that induces the similarities across languages from mPLMs using\nmulti-parallel corpora. Our study shows that mPLM-Sim exhibits moderately high\ncorrelations with linguistic similarity measures, such as lexicostatistics,\ngenealogical language family, and geographical sprachbund. We also conduct a\ncase study on languages with low correlation and observe that mPLM-Sim yields\nmore accurate similarity results. Additionally, we find that similarity results\nvary across different mPLMs and different layers within an mPLM. We further\ninvestigate whether mPLMSim is effective for zero-shot cross-lingual transfer\nby conducting experiments on both low-level syntactic tasks and high-level\nsemantic tasks. 
The experimental results demonstrate that mPLM-Sim is capable\nof selecting better source languages than linguistic measures, resulting in a\n1%-2% improvement in zero-shot cross-lingual transfer performance.\n","authors":["Peiqin Lin","Chengzhi Hu","Zheyu Zhang","André F. T. Martins","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2305.13684v3.pdf","comment":"EACL 2024 Findings"},{"id":"http://arxiv.org/abs/2407.04652v1","updated":"2024-07-05T17:07:58Z","published":"2024-07-05T17:07:58Z","title":"Pretraining End-to-End Keyword Search with Automatically Discovered\n Acoustic Units","summary":" End-to-end (E2E) keyword search (KWS) has emerged as an alternative and\ncomplementary approach to conventional keyword search which depends on the\noutput of automatic speech recognition (ASR) systems. While E2E methods greatly\nsimplify the KWS pipeline, they generally have worse performance than their\nASR-based counterparts, which can benefit from pretraining with untranscribed\ndata. In this work, we propose a method for pretraining E2E KWS systems with\nuntranscribed data, which involves using acoustic unit discovery (AUD) to\nobtain discrete units for untranscribed data and then learning to locate\nsequences of such units in the speech. We conduct experiments across languages\nand AUD systems: we show that finetuning such a model significantly outperforms\na model trained from scratch, and the performance improvements are generally\ncorrelated with the quality of the AUD system used for pretraining.\n","authors":["Bolaji Yusuf","Jan \"Honza\" Černocký","Murat Saraçlar"],"pdf_url":"https://arxiv.org/pdf/2407.04652v1.pdf","comment":"Interspeech 2024. KWS code at:\n https://github.com/bolajiy/golden-retriever; AUD code at\n https://github.com/beer-asr/beer/tree/master/recipes/hshmm"},{"id":"http://arxiv.org/abs/2407.04641v1","updated":"2024-07-05T16:52:55Z","published":"2024-07-05T16:52:55Z","title":"Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of\n Language Models","summary":" This paper explores speculative speech recognition (SSR), where we empower\nconventional automatic speech recognition (ASR) with speculation capabilities,\nallowing the recognizer to run ahead of audio. We introduce a metric for\nmeasuring SSR performance and we propose a model which does SSR by combining an\nRNN-Transducer-based ASR system with an audio-prefixed language model (LM). The\nASR system transcribes ongoing audio and feeds the resulting transcripts, along\nwith an audio-dependent prefix, to the LM, which speculates likely completions\nfor the transcriptions. We experiment with a variety of ASR datasets, on which we\nshow the efficacy of our method and the feasibility of SSR as a method of reducing\nASR latency.\n","authors":["Bolaji Yusuf","Murali Karthick Baskar","Andrew Rosenberg","Bhuvana Ramabhadran"],"pdf_url":"https://arxiv.org/pdf/2407.04641v1.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.14657v2","updated":"2024-07-05T16:51:15Z","published":"2024-06-20T18:22:59Z","title":"OpenDebateEvidence: A Massive-Scale Argument Mining and Summarization\n Dataset","summary":" We introduce OpenDebateEvidence, a comprehensive dataset for argument mining\nand summarization sourced from the American Competitive Debate community. This\ndataset includes over 3.5 million documents with rich metadata, making it one\nof the most extensive collections of debate evidence. 
OpenDebateEvidence\ncaptures the complexity of arguments in high school and college debates,\nproviding valuable resources for training and evaluation. Our extensive\nexperiments demonstrate the efficacy of fine-tuning state-of-the-art large\nlanguage models for argumentative abstractive summarization across various\nmethods, models, and datasets. By providing this comprehensive resource, we aim\nto advance computational argumentation and support practical applications for\ndebaters, educators, and researchers. OpenDebateEvidence is publicly available\nto support further research and innovation in computational argumentation.\nAccess it here: https://huggingface.co/datasets/Yusuf5/OpenCaselist\n","authors":["Allen Roush","Yusuf Shabazz","Arvind Balaji","Peter Zhang","Stefano Mezza","Markus Zhang","Sanjay Basu","Sriram Vishwanath","Mehdi Fatemi","Ravid Shwartz-Ziv"],"pdf_url":"https://arxiv.org/pdf/2406.14657v2.pdf","comment":"Accepted for Publication to ARGMIN 2024 at ACL2024"},{"id":"http://arxiv.org/abs/2407.04629v1","updated":"2024-07-05T16:38:23Z","published":"2024-07-05T16:38:23Z","title":"Entity Decomposition with Filtering: A Zero-Shot Clinical Named Entity\n Recognition Framework","summary":" Clinical named entity recognition (NER) aims to retrieve important entities\nwithin clinical narratives. Recent works have demonstrated that large language\nmodels (LLMs) can achieve strong performance in this task. While previous works\nfocus on proprietary LLMs, we investigate how open NER LLMs, trained\nspecifically for entity recognition, perform in clinical NER. In this paper, we\naim to improve them through a novel framework, entity decomposition with\nfiltering, or EDF. Our key idea is to decompose the entity recognition task\ninto several retrievals of sub-entity types. We also introduce a filtering\nmechanism to remove incorrect entities. Our experimental results demonstrate\nthe efficacy of our framework across all metrics, models, datasets, and entity\ntypes. Our analysis reveals that entity decomposition can recognize previously\nmissed entities with substantial improvement. We further provide a\ncomprehensive evaluation of our framework and an in-depth error analysis to\npave future works.\n","authors":["Reza Averly","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2407.04629v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.06725v3","updated":"2024-07-05T16:24:29Z","published":"2024-03-11T13:44:43Z","title":"Improving Low-Resource Knowledge Tracing Tasks by Supervised\n Pre-training and Importance Mechanism Fine-tuning","summary":" Knowledge tracing (KT) aims to estimate student's knowledge mastery based on\ntheir historical interactions. Recently, the deep learning based KT (DLKT)\napproaches have achieved impressive performance in the KT task. These DLKT\nmodels heavily rely on the large number of available student interactions.\nHowever, due to various reasons such as budget constraints and privacy\nconcerns, observed interactions are very limited in many real-world scenarios,\na.k.a, low-resource KT datasets. Directly training a DLKT model on a\nlow-resource KT dataset may lead to overfitting and it is difficult to choose\nthe appropriate deep neural architecture. Therefore, in this paper, we propose\na low-resource KT framework called LoReKT to address above challenges. 
Inspired\nby the prevalent \"pre-training and fine-tuning\" paradigm, we aim to learn\ntransferable parameters and representations from rich-resource KT datasets\nduring the pre-training stage and subsequently facilitate effective adaptation\nto low-resource KT datasets. Specifically, we simplify existing sophisticated\nDLKT model architectures with purely a stack of transformer decoders. We design\nan encoding mechanism to incorporate student interactions from multiple KT data\nsources and develop an importance mechanism to prioritize updating parameters\nwith high importance while constraining less important ones during the\nfine-tuning stage. We evaluate LoReKT on six public KT datasets and\nexperimental results demonstrate the superiority of our approach in terms of\nAUC and Accuracy. To encourage reproducible research, we make our data and code\npublicly available at https://anonymous.4open.science/r/LoReKT-C619.\n","authors":["Hengyuan Zhang","Zitao Liu","Shuyan Huang","Chenming Shang","Bojun Zhan","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.06725v3.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.04620v1","updated":"2024-07-05T16:23:20Z","published":"2024-07-05T16:23:20Z","title":"Learning to (Learn at Test Time): RNNs with Expressive Hidden States","summary":" Self-attention performs well in long context but has quadratic complexity.\nExisting RNN layers have linear complexity, but their performance in long\ncontext is limited by the expressive power of their hidden state. We propose a\nnew class of sequence modeling layers with linear complexity and an expressive\nhidden state. The key idea is to make the hidden state a machine learning model\nitself, and the update rule a step of self-supervised learning. Since the\nhidden state is updated by training even on test sequences, our layers are\ncalled Test-Time Training (TTT) layers. We consider two instantiations:\nTTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer\nMLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B\nparameters, comparing with a strong Transformer and Mamba, a modern RNN. Both\nTTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer,\nthey can keep reducing perplexity by conditioning on more tokens, while Mamba\ncannot after 16k context. With preliminary systems optimization, TTT-Linear is\nalready faster than Transformer at 8k context and matches Mamba in wall-clock\ntime. TTT-MLP still faces challenges in memory I/O, but shows larger potential\nin long context, pointing to a promising direction for future research.\n","authors":["Yu Sun","Xinhao Li","Karan Dalal","Jiarui Xu","Arjun Vikram","Genghan Zhang","Yann Dubois","Xinlei Chen","Xiaolong Wang","Sanmi Koyejo","Tatsunori Hashimoto","Carlos Guestrin"],"pdf_url":"https://arxiv.org/pdf/2407.04620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04615v1","updated":"2024-07-05T16:11:03Z","published":"2024-07-05T16:11:03Z","title":"ARM: Efficient Guided Decoding with Autoregressive Reward Models","summary":" Language models trained on large amounts of data require careful tuning to be\nsafely deployed in real world. We revisit the guided decoding paradigm, where\nthe goal is to augment the logits of the base language model using the scores\nfrom a task-specific reward model. We propose a simple but efficient\nparameterization of the autoregressive reward model enabling fast and effective\nguided decoding. 
On detoxification and sentiment control tasks, we show that\nour efficient parameterization performs on par with RAD, a strong but less\nefficient guided decoding approach.\n","authors":["Sergey Troshin","Vlad Niculae","Antske Fokkens"],"pdf_url":"https://arxiv.org/pdf/2407.04615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04601v1","updated":"2024-07-05T15:50:47Z","published":"2024-07-05T15:50:47Z","title":"Written Term Detection Improves Spoken Term Detection","summary":" End-to-end (E2E) approaches to keyword search (KWS) are considerably simpler\nin terms of training and indexing complexity when compared to approaches which\nuse the output of automatic speech recognition (ASR) systems. This\nsimplification however has drawbacks due to the loss of modularity. In\nparticular, where ASR-based KWS systems can benefit from external unpaired text\nvia a language model, current formulations of E2E KWS systems have no such\nmechanism. Therefore, in this paper, we propose a multitask training objective\nwhich allows unpaired text to be integrated into E2E KWS without complicating\nindexing and search. In addition to training an E2E KWS model to retrieve text\nqueries from spoken documents, we jointly train it to retrieve text queries\nfrom masked written documents. We show empirically that this approach can\neffectively leverage unpaired text for KWS, with significant improvements in\nsearch performance across a wide variety of languages. We conduct analysis\nwhich indicates that these improvements are achieved because the proposed\nmethod improves document representations for words in the unpaired text.\nFinally, we show that the proposed method can be used for domain adaptation in\nsettings where in-domain paired data is scarce or nonexistent.\n","authors":["Bolaji Yusuf","Murat Saraçlar"],"pdf_url":"https://arxiv.org/pdf/2407.04601v1.pdf","comment":"IEEE/ACM Transactions on Audio, Speech and Language Processing\n (TASLP), 2024. Code at https://github.com/bolajiy/golden-retriever"},{"id":"http://arxiv.org/abs/2407.03255v2","updated":"2024-07-05T15:45:19Z","published":"2024-07-03T16:36:26Z","title":"How Similar Are Elected Politicians and Their Constituents? Quantitative\n Evidence From Online Social Networks","summary":" How similar are politicians to those who vote for them? This is a critical\nquestion at the heart of democratic representation and particularly relevant at\ntimes when political dissatisfaction and populism are on the rise. To answer\nthis question we compare the online discourse of elected politicians and their\nconstituents. We collect a two and a half years (September 2020 - February\n2023) constituency-level dataset for USA and UK that includes: (i) the Twitter\ntimelines (5.6 Million tweets) of elected political representatives (595 UK\nMembers of Parliament and 433 USA Representatives), (ii) the Nextdoor posts\n(21.8 Million posts) of the constituency (98.4% USA and 91.5% UK\nconstituencies). We find that elected politicians tend to be equally similar to\ntheir constituents in terms of content and style regardless of whether a\nconstituency elects a right or left-wing politician. The size of the electoral\nvictory and the level of income of a constituency shows a nuanced picture. The\nnarrower the electoral victory, the more similar the style and the more\ndissimilar the content is. The lower the income of a constituency, the more\nsimilar the content is. 
In terms of style, poorer constituencies tend to have a\nmore similar sentiment and more dissimilar psychological text traits (i.e.\nmeasured with LIWC categories).\n","authors":["Waleed Iqbal","Gareth Tyson","Ignacio Castro"],"pdf_url":"https://arxiv.org/pdf/2407.03255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04593v1","updated":"2024-07-05T15:41:30Z","published":"2024-07-05T15:41:30Z","title":"Testing learning hypotheses using neural networks by manipulating\n learning data","summary":" Although passivization is productive in English, it is not completely general\n-- some exceptions exist (e.g. *One hour was lasted by the meeting). How do\nEnglish speakers learn these exceptions to an otherwise general pattern? Using\nneural network language models as theories of acquisition, we explore the\nsources of indirect evidence that a learner can leverage to learn whether a\nverb can passivize. We first characterize English speakers' judgments of\nexceptions to the passive, confirming that speakers find some verbs more\npassivizable than others. We then show that a neural network language model can\nlearn restrictions to the passive that are similar to those displayed by\nhumans, suggesting that evidence for these exceptions is available in the\nlinguistic input. We test the causal role of two hypotheses for how the\nlanguage model learns these restrictions by training models on modified\ntraining corpora, which we create by altering the existing training corpora to\nremove features of the input implicated by each hypothesis. We find that while\nthe frequency with which a verb appears in the passive significantly affects\nits passivizability, the semantics of the verb does not. This study highlights\nthe utility of altering a language model's training data for answering\nquestions where complete control over a learner's input is vital.\n","authors":["Cara Su-Yi Leong","Tal Linzen"],"pdf_url":"https://arxiv.org/pdf/2407.04593v1.pdf","comment":"Submitted to Journal of Memory and Language"},{"id":"http://arxiv.org/abs/2403.13213v4","updated":"2024-07-05T15:40:13Z","published":"2024-03-20T00:22:38Z","title":"From Representational Harms to Quality-of-Service Harms: A Case Study on\n Llama 2 Safety Safeguards","summary":" Recent progress in large language models (LLMs) has led to their widespread\nadoption in various domains. However, these advancements have also introduced\nadditional safety risks and raised concerns regarding their detrimental impact\non already marginalized populations. Despite growing mitigation efforts to\ndevelop safety safeguards, such as supervised safety-oriented fine-tuning and\nleveraging safe reinforcement learning from human feedback, multiple concerns\nregarding the safety and ingrained biases in these models remain. Furthermore,\nprevious work has demonstrated that models optimized for safety often display\nexaggerated safety behaviors, such as a tendency to refrain from responding to\ncertain requests as a precautionary measure. As such, a clear trade-off between\nthe helpfulness and safety of these models has been documented in the\nliterature. In this paper, we further investigate the effectiveness of safety\nmeasures by evaluating models on already mitigated biases. Using the case of\nLlama 2 as an example, we illustrate how LLMs' safety responses can still\nencode harmful assumptions. To do so, we create a set of non-toxic prompts,\nwhich we then use to evaluate Llama models. 
Through our new taxonomy of LLMs\nresponses to users, we observe that the safety/helpfulness trade-offs are more\npronounced for certain demographic groups which can lead to quality-of-service\nharms for marginalized populations.\n","authors":["Khaoula Chehbouni","Megha Roshan","Emmanuel Ma","Futian Andrew Wei","Afaf Taik","Jackie CK Cheung","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2403.13213v4.pdf","comment":"9 pages, 4 figures. Accepted to Findings of the Association for\n Computational Linguistics: ACL 2024"},{"id":"http://arxiv.org/abs/2312.06681v4","updated":"2024-07-05T15:30:45Z","published":"2023-12-09T04:40:46Z","title":"Steering Llama 2 via Contrastive Activation Addition","summary":" We introduce Contrastive Activation Addition (CAA), an innovative method for\nsteering language models by modifying their activations during forward passes.\nCAA computes \"steering vectors\" by averaging the difference in residual stream\nactivations between pairs of positive and negative examples of a particular\nbehavior, such as factual versus hallucinatory responses. During inference,\nthese steering vectors are added at all token positions after the user's prompt\nwith either a positive or negative coefficient, allowing precise control over\nthe degree of the targeted behavior. We evaluate CAA's effectiveness on Llama 2\nChat using multiple-choice behavioral question datasets and open-ended\ngeneration tasks. We demonstrate that CAA significantly alters model behavior,\nis effective over and on top of traditional methods like finetuning and system\nprompt design, and minimally reduces capabilities. Moreover, we gain deeper\ninsights into CAA's mechanisms by employing various activation space\ninterpretation methods. CAA accurately steers model outputs and sheds light on\nhow high-level concepts are represented in Large Language Models (LLMs).\n","authors":["Nina Panickssery","Nick Gabrieli","Julian Schulz","Meg Tong","Evan Hubinger","Alexander Matt Turner"],"pdf_url":"https://arxiv.org/pdf/2312.06681v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13782v2","updated":"2024-07-05T15:24:03Z","published":"2023-08-26T06:28:14Z","title":"Planning with Logical Graph-based Language Model for Instruction\n Generation","summary":" Despite the superior performance of large language models to generate natural\nlanguage texts, it is hard to generate texts with correct logic according to a\ngiven task, due to the difficulties for neural models to capture implied rules\nfrom free-form texts. In this paper, we propose a novel graph-based language\nmodel, Logical-GLM, to infuse logic into language models for more valid text\ngeneration and interpretability. Specifically, we first capture information\nfrom natural language instructions and construct logical bayes graphs that\ngenerally describe domains. Next, we generate logical skeletons to guide\nlanguage model training, infusing domain knowledge into language models.\nFinally, we alternately optimize the searching policy of graphs and language\nmodels until convergence. The experimental results show that Logical-GLM is\nboth effective and efficient compared with traditional language models, despite\nusing smaller-scale training data and fewer parameters. Our approach can\ngenerate instructional texts with more correct logic owing to the internalized\ndomain knowledge. 
Moreover, the usage of logical graphs reflects the inner\nmechanism of the language models, which improves the interpretability of\nblack-box models.\n","authors":["Fan Zhang","Kebing Jin","Hankz Hankui Zhuo"],"pdf_url":"https://arxiv.org/pdf/2308.13782v2.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.09982v2","updated":"2024-07-05T15:22:58Z","published":"2024-04-15T17:57:30Z","title":"Memory Sharing for Large Language Model based Agents","summary":" The adaptation of Large Language Model (LLM)-based agents to execute tasks\nvia natural language prompts represents a significant advancement, notably\neliminating the need for explicit retraining or fine tuning, but are\nconstrained by the comprehensiveness and diversity of the provided examples,\nleading to outputs that often diverge significantly from expected results,\nespecially when it comes to the open-ended questions. This paper introduces the\nMemory Sharing, a framework which integrates the real-time memory filter,\nstorage and retrieval to enhance the In-Context Learning process. This\nframework allows for the sharing of memories among multiple agents, whereby the\ninteractions and shared memories between different agents effectively enhance\nthe diversity of the memories. The collective self-enhancement through\ninteractive learning among multiple agents facilitates the evolution from\nindividual intelligence to collective intelligence. Besides, the dynamically\ngrowing memory pool is utilized not only to improve the quality of responses\nbut also to train and enhance the retriever. We evaluated our framework across\nthree distinct domains involving specialized tasks of agents. The experimental\nresults demonstrate that the MS framework significantly improves the agents'\nperformance in addressing open-ended questions.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.09982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04573v1","updated":"2024-07-05T15:08:44Z","published":"2024-07-05T15:08:44Z","title":"VRSD: Rethinking Similarity and Diversity for Retrieval in Large\n Language Models","summary":" Vector retrieval algorithms are vital for semantic queries in the evolving\nlandscape of Large Language Models (LLMs). Retrieving vectors that\nsimultaneously meet criteria for both similarity and diversity significantly\nenhances the capabilities of LLM-based agents. Despite the widespread use of\nthe Maximal Marginal Relevance (MMR) in retrieval scenarios with relevance and\ndiversity requirements, fluctuations caused by variations in the parameter $\n\\lambda $ within the MMR complicate the determination of the optimization\ntrajectory in vector spaces, thus obscuring the direction of enhancement.\nMoreover, there is a lack of a robust theoretical analysis for the constraints\nof similarity and diversity in retrieval processes. This paper introduces a\nnovel approach to characterizing both constraints through the relationship\nbetween the sum vector and the query vector. The proximity of these vectors\naddresses the similarity constraint, while necessitating that individual\nvectors within the sum vector divergently align with the query vector to\nsatisfy the diversity constraint. We also formulate a new combinatorial\noptimization challenge, taking a selection of $k$ vectors from a set of\ncandidates such that their sum vector maximally aligns with the query vector, a\nproblem we demonstrate to be NP-complete. 
This establishes the profound\ndifficulty of pursuing similarity and diversity simultaneously in vector\nretrieval and lays a theoretical groundwork for further research. Additionally,\nwe present the heuristic algorithm Vectors Retrieval with Similarity and\nDiversity (VRSD) which not only has a definitive optimization goal and eschews\nthe need for preset parameters but also offers a modest reduction in time\ncomplexity compared to MMR. Empirical validation further confirms that VRSD\nsignificantly surpasses MMR across various datasets.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13191v2","updated":"2024-07-05T15:00:17Z","published":"2023-08-25T05:52:05Z","title":"Chunk, Align, Select: A Simple Long-sequence Processing Method for\n Transformers","summary":" Although dominant in natural language processing, transformer-based models\nremain challenged by the task of long-sequence processing, because the\ncomputational cost of self-attention operations in transformers swells\nquadratically with the input sequence length. To alleviate the complexity of\nlong-sequence processing, we propose a simple framework to enable the\noff-the-shelf pre-trained transformers to process much longer sequences, while\nthe computation and memory costs remain growing linearly with the input\nsequence lengths. More specifically, our method divides each long-sequence\ninput into a batch of chunks, then aligns the inter-chunk information during the\nencoding steps, and finally selects the most representative hidden states from\nthe encoder for the decoding process. To extract inter-chunk semantic\ninformation, we align the start and end token embeddings among chunks in each\nencoding transformer block. To learn an effective hidden selection policy, we\ndesign a dual updating scheme inspired by reinforcement learning, which regards\nthe decoders of transformers as environments, and the downstream performance\nmetrics as the rewards to evaluate the hidden selection actions. Our empirical\nresults on real-world long-text summarization and reading comprehension tasks\ndemonstrate effective improvements compared to prior long-sequence processing\nbaselines.\n","authors":["Jiawen Xie","Pengyu Cheng","Xiao Liang","Yong Dai","Nan Du"],"pdf_url":"https://arxiv.org/pdf/2308.13191v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.04559v1","updated":"2024-07-05T14:48:15Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. 
Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04549v1","updated":"2024-07-05T14:34:50Z","published":"2024-07-05T14:34:50Z","title":"Spontaneous Reward Hacking in Iterative Self-Refinement","summary":" Language models are capable of iteratively improving their outputs based on\nnatural language feedback, thus enabling in-context optimization of user\npreference. In place of human users, a second language model can be used as an\nevaluator, providing feedback along with numerical ratings which the generator\nattempts to optimize. However, because the evaluator is an imperfect proxy of\nuser preference, this optimization can lead to reward hacking, where the\nevaluator's ratings improve while the generation quality remains stagnant or\neven decreases as judged by actual user preference. The concern of reward\nhacking is heightened in iterative self-refinement where the generator and the\nevaluator use the same underlying language model, in which case the\noptimization pressure can drive them to exploit shared vulnerabilities. Using\nan essay editing task, we show that iterative self-refinement leads to\ndeviation between the language model evaluator and human judgment,\ndemonstrating that reward hacking can occur spontaneously in-context with the\nuse of iterative self-refinement. In addition, we study conditions under which\nreward hacking occurs and observe two factors that affect reward hacking\nseverity: model size and context sharing between the generator and the\nevaluator.\n","authors":["Jane Pan","He He","Samuel R. Bowman","Shi Feng"],"pdf_url":"https://arxiv.org/pdf/2407.04549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04543v1","updated":"2024-07-05T14:29:44Z","published":"2024-07-05T14:29:44Z","title":"Strengthening Structural Inductive Biases by Pre-training to Perform\n Syntactic Transformations","summary":" Models need appropriate inductive biases to effectively learn from small\namounts of data and generalize systematically outside of the training\ndistribution. While Transformers are highly versatile and powerful, they can\nstill benefit from enhanced structural inductive biases for seq2seq tasks,\nespecially those involving syntactic transformations, such as converting active\nto passive voice or semantic parsing. In this paper, we propose to strengthen\nthe structural inductive bias of a Transformer by intermediate pre-training to\nperform synthetically generated syntactic transformations of dependency trees\ngiven a description of the transformation. Our experiments confirm that this\nhelps with few-shot learning of syntactic tasks such as chunking, and also\nimproves structural generalization for semantic parsing. 
Our analysis shows\nthat the intermediate pre-training leads to attention heads that keep track of\nwhich syntactic transformation needs to be applied to which token, and that the\nmodel can leverage these attention heads on downstream tasks.\n","authors":["Matthias Lindemann","Alexander Koller","Ivan Titov"],"pdf_url":"https://arxiv.org/pdf/2407.04543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04541v1","updated":"2024-07-05T14:28:12Z","published":"2024-07-05T14:28:12Z","title":"PoPreRo: A New Dataset for Popularity Prediction of Romanian Reddit\n Posts","summary":" We introduce PoPreRo, the first dataset for Popularity Prediction of Romanian\nposts collected from Reddit. The PoPreRo dataset includes a varied compilation\nof post samples from five distinct subreddits of Romania, totaling 28,107 data\nsamples. Along with our novel dataset, we introduce a set of competitive models\nto be used as baselines for future research. Interestingly, the top-scoring\nmodel achieves an accuracy of 61.35% and a macro F1 score of 60.60% on the test\nset, indicating that the popularity prediction task on PoPreRo is very\nchallenging. Further investigations based on few-shot prompting the Falcon-7B\nLarge Language Model also point in the same direction. We thus believe that\nPoPreRo is a valuable resource that can be used to evaluate models on\npredicting the popularity of social media posts in Romanian. We release our\ndataset at https://github.com/ana-rogoz/PoPreRo.\n","authors":["Ana-Cristina Rogoz","Maria Ilinca Nechita","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2407.04541v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2407.04533v1","updated":"2024-07-05T14:21:36Z","published":"2024-07-05T14:21:36Z","title":"Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in\n Tunisian Dialect","summary":" Speech encoders pretrained through self-supervised learning (SSL) have\ndemonstrated remarkable performance in various downstream tasks, including\nSpoken Language Understanding (SLU) and Automatic Speech Recognition (ASR). For\ninstance, fine-tuning SSL models for such tasks has shown significant\npotential, leading to improvements in the SOTA performance across challenging\ndatasets. In contrast to existing research, this paper contributes by comparing\nthe effectiveness of SSL approaches in the context of (i) the low-resource\nspoken Tunisian Arabic dialect and (ii) its combination with a low-resource SLU\nand ASR scenario, where only a few semantic annotations are available for\nfine-tuning. We conduct experiments using many SSL speech encoders on the\nTARIC-SLU dataset. We use speech encoders that were pre-trained on either\nmonolingual or multilingual speech data. Some of them have also been refined\nwithout in-domain nor Tunisian data through multimodal supervised\nteacher-student paradigm. 
This study yields numerous significant findings that\nwe discuss in this paper.\n","authors":["Salima Mdhaffar","Haroun Elleuch","Fethi Bougares","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2407.04533v1.pdf","comment":"Accepted in ArabicNLP 2024"},{"id":"http://arxiv.org/abs/2407.04528v1","updated":"2024-07-05T14:16:47Z","published":"2024-07-05T14:16:47Z","title":"GPT vs RETRO: Exploring the Intersection of Retrieval and\n Parameter-Efficient Fine-Tuning","summary":" Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation\n(RAG) have become popular methods for adapting large language models while\nminimizing compute requirements. In this paper, we apply PEFT methods\n(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer\n(RETRO) and a baseline GPT model across several sizes, ranging from 823 million\nto 48 billion parameters. We show that RETRO models outperform GPT models in\nzero-shot settings due to their unique pre-training process but GPT models have\nhigher performance potential with PEFT. Additionally, our study indicates that\n8B parameter models strike an optimal balance between cost and performance and\nP-tuning lags behind other PEFT techniques. We further provide a comparative\nanalysis between applying PEFT to an Instruction-tuned RETRO model and a base\nRETRO model. This work presents the first comprehensive comparison of various\nPEFT methods integrated with RAG, applied to both GPT and RETRO models,\nhighlighting their relative performance.\n","authors":["Aleksander Ficek","Jiaqi Zeng","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2407.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11924v4","updated":"2024-07-05T13:43:43Z","published":"2024-02-19T08:12:30Z","title":"Evaluating LLMs' Inherent Multi-hop Reasoning Ability","summary":" While Large Language Models (LLMs) excel in question-answering (QA) tasks,\ntheir multi-step reasoning abilities on multiple evidence integration on\nMulti-hop QA tasks remain underexplored. LLMs sometimes generate answers that\nrely on internal memory rather than reasoning given context, which brings\nconcerns about the evaluation quality of real reasoning abilities. The\ncounterfactual QA task can separate internal memory from reasoning abilities,\nbut focusing solely on final-QA performance without evaluating the multi-step\nreasoning process is insufficient for reporting LLMs' real reasoning abilities.\nCurrent Multi-hop QA (MHQA) benchmarks are factual and annotated on open-source\ncorpora such as Wikipedia, although useful for multi-step reasoning evaluation,\nshowing limitations due to potential data contamination in LLMs' pre-training\nstage. To address this issue, we introduce the Inherent Reasoning Evaluation\n(IRE) method, a novel evaluation approach that jointly evaluates the LLMs'\nchain-of-reasoning performance based on the first knowledge-edited\ncounterfactual multi-hop QA data which involves editing the original Wikipedia\npassages, reducing data contamination risks. The IRE comprehensively assesses\nreasoning chains through sub-QA and final-QA evaluations. 
Our comparisons\nreveal significant performance gaps for several LLMs between Wikipedia-based\nbenchmarks and IRE, deeming data contamination issues in existing benchmarks.\nWe believe that the IRE benchmark will enhance and facilitate trustworthy LLM\nevaluations.\n","authors":["Jian Wu","Linyi Yang","Zhen Wang","Manabu Okumura","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.11924v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04485v1","updated":"2024-07-05T13:08:58Z","published":"2024-07-05T13:08:58Z","title":"Leveraging Graph Structures to Detect Hallucinations in Large Language\n Models","summary":" Large language models are extensively applied across a wide range of tasks,\nsuch as customer support, content creation, educational tutoring, and providing\nfinancial guidance. However, a well-known drawback is their predisposition to\ngenerate hallucinations. This damages the trustworthiness of the information\nthese models provide, impacting decision-making and user confidence. We propose\na method to detect hallucinations by looking at the structure of the latent\nspace and finding associations within hallucinated and non-hallucinated\ngenerations. We create a graph structure that connects generations that lie\nclosely in the embedding space. Moreover, we employ a Graph Attention Network\nwhich utilizes message passing to aggregate information from neighboring nodes\nand assigns varying degrees of importance to each neighbor based on their\nrelevance. Our findings show that 1) there exists a structure in the latent\nspace that differentiates between hallucinated and non-hallucinated\ngenerations, 2) Graph Attention Networks can learn this structure and\ngeneralize it to unseen generations, and 3) the robustness of our method is\nenhanced when incorporating contrastive learning. When evaluated against\nevidence-based benchmarks, our model performs similarly without access to\nsearch-based methods.\n","authors":["Noa Nonkes","Sergei Agaronian","Evangelos Kanoulas","Roxana Petcu"],"pdf_url":"https://arxiv.org/pdf/2407.04485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04482v1","updated":"2024-07-05T13:04:31Z","published":"2024-07-05T13:04:31Z","title":"Controlling Whisper: Universal Acoustic Adversarial Attacks to Control\n Speech Foundation Models","summary":" Speech enabled foundation models, either in the form of flexible speech\nrecognition based systems or audio-prompted large language models (LLMs), are\nbecoming increasingly popular. One of the interesting aspects of these models\nis their ability to perform tasks other than automatic speech recognition (ASR)\nusing an appropriate prompt. For example, the OpenAI Whisper model can perform\nboth speech transcription and speech translation. With the development of\naudio-prompted LLMs there is the potential for even greater control options. In\nthis work we demonstrate that with this greater flexibility the systems can be\nsusceptible to model-control adversarial attacks. Without any access to the\nmodel prompt it is possible to modify the behaviour of the system by\nappropriately changing the audio input. To illustrate this risk, we demonstrate\nthat it is possible to prepend a short universal adversarial acoustic segment\nto any input speech signal to override the prompt setting of an ASR foundation\nmodel. Specifically, we successfully use a universal adversarial acoustic\nsegment to control Whisper to always perform speech translation, despite being\nset to perform speech transcription. 
Overall, this work demonstrates a new form\nof adversarial attack on multi-tasking speech enabled foundation models that\nneeds to be considered prior to the deployment of this form of model.\n","authors":["Vyas Raina","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2407.04482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18991v2","updated":"2024-07-05T13:01:07Z","published":"2024-05-29T11:11:07Z","title":"EasyAnimate: A High-Performance Long Video Generation Method based on\n Transformer Architecture","summary":" This paper presents EasyAnimate, an advanced method for video generation that\nleverages the power of transformer architecture for high-performance outcomes.\nWe have expanded the DiT framework originally designed for 2D image synthesis\nto accommodate the complexities of 3D video generation by incorporating a\nmotion module block. It is used to capture temporal dynamics, thereby ensuring\nthe production of consistent frames and seamless motion transitions. The motion\nmodule can be adapted to various DiT baseline methods to generate video with\ndifferent styles. It can also generate videos with different frame rates and\nresolutions during both training and inference phases, suitable for both images\nand videos. Moreover, we introduce slice VAE, a novel approach to condense the\ntemporal axis, facilitating the generation of long duration videos. Currently,\nEasyAnimate exhibits the proficiency to generate videos with 144 frames. We\nprovide a holistic ecosystem for video production based on DiT, encompassing\naspects such as data pre-processing, VAE training, DiT models training (both\nthe baseline model and LoRA model), and end-to-end video inference. Code is\navailable at: https://github.com/aigc-apps/EasyAnimate. We are continuously\nworking to enhance the performance of our method.\n","authors":["Jiaqi Xu","Xinyi Zou","Kunzhe Huang","Yunkuo Chen","Bo Liu","MengLi Cheng","Xing Shi","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2405.18991v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.04472v1","updated":"2024-07-05T12:42:31Z","published":"2024-07-05T12:42:31Z","title":"EventChat: Implementation and user-centric evaluation of a large\n language model-driven conversational recommender system for exploring leisure\n events in an SME context","summary":" Large language models (LLMs) present an enormous evolution in the strategic\npotential of conversational recommender systems (CRS). Yet to date, research\nhas predominantly focused upon technical frameworks to implement LLM-driven\nCRS, rather than end-user evaluations or strategic implications for firms,\nparticularly from the perspective of small to medium enterprises (SMEs) that\nmake up the bedrock of the global economy. In the current paper, we detail the\ndesign of an LLM-driven CRS in an SME setting, and its subsequent performance\nin the field using both objective system metrics and subjective user\nevaluations. While doing so, we additionally outline a short-form revised\nResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly\nevolving field. Our results reveal good system performance from a user\nexperience perspective (85.5% recommendation accuracy) but underscore latency,\ncost, and quality issues challenging business viability. Notably, with a median\ncost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and\nresponse time emerge as crucial areas for achieving a more user-friendly and\neconomically viable LLM-driven CRS for SME settings. 
One major driver of these\ncosts is the use of an advanced LLM as a ranker within the retrieval-augmented\ngeneration (RAG) technique. Our results additionally indicate that relying\nsolely on approaches such as Prompt-based learning with ChatGPT as the\nunderlying LLM makes it challenging to achieve satisfying quality in a\nproduction environment. Strategic considerations for SMEs deploying an\nLLM-driven CRS are outlined, particularly considering trade-offs in the current\ntechnical landscape.\n","authors":["Hannes Kunstmann","Joseph Ollier","Joel Persson","Florian von Wangenheim"],"pdf_url":"https://arxiv.org/pdf/2407.04472v1.pdf","comment":"27 pages, 3 tables, 5 figures, pre-print manuscript"},{"id":"http://arxiv.org/abs/2407.04467v1","updated":"2024-07-05T12:30:02Z","published":"2024-07-05T12:30:02Z","title":"Are Large Language Models Strategic Decision Makers? A Study of\n Performance and Bias in Two-Player Non-Zero-Sum Games","summary":" Large Language Models (LLMs) have been increasingly used in real-world\nsettings, yet their strategic abilities remain largely unexplored. Game theory\nprovides a good framework for assessing the decision-making abilities of LLMs\nin interactions with other agents. Although prior studies have shown that LLMs\ncan solve these tasks with carefully curated prompts, they fail when the\nproblem setting or prompt changes. In this work we investigate LLMs' behaviour\nin strategic games, Stag Hunt and Prisoner Dilemma, analyzing performance\nvariations under different settings and prompts. Our results show that the\ntested state-of-the-art LLMs exhibit at least one of the following systematic\nbiases: (1) positional bias, (2) payoff bias, or (3) behavioural bias.\nSubsequently, we observed that the LLMs' performance drops when the game\nconfiguration is misaligned with the affecting biases. Performance is assessed\nbased on the selection of the correct action, one which agrees with the\nprompted preferred behaviours of both players. Alignment refers to whether the\nLLM's bias aligns with the correct action. For example, GPT-4o's average\nperformance drops by 34% when misaligned. Additionally, the current trend of\n\"bigger and newer is better\" does not hold for the above, where GPT-4o (the\ncurrent best-performing LLM) suffers the most substantial performance drop.\nLastly, we note that while chain-of-thought prompting does reduce the effect of\nthe biases on most models, it is far from solving the problem at the\nfundamental level.\n","authors":["Nathan Herr","Fernando Acero","Roberta Raileanu","María Pérez-Ortiz","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2407.04467v1.pdf","comment":"8 pages (19 with appendix), 6 figures in the main body (4 in the\n appendix), 4 tables in the main body"},{"id":"http://arxiv.org/abs/2407.04466v1","updated":"2024-07-05T12:30:01Z","published":"2024-07-05T12:30:01Z","title":"Using LLMs to label medical papers according to the CIViC evidence model","summary":" We introduce the sequence classification problem CIViC Evidence to the field\nof medical NLP. CIViC Evidence denotes the multi-label classification problem\nof assigning labels of clinical evidence to abstracts of scientific papers\nwhich have examined various combinations of genomic variants, cancer types, and\ntreatment approaches. 
We approach CIViC Evidence using different language\nmodels: We fine-tune pretrained checkpoints of BERT and RoBERTa on the CIViC\nEvidence dataset and challenge their performance with models of the same\narchitecture which have been pretrained on domain-specific text. In this\ncontext, we find that BiomedBERT and BioLinkBERT can outperform BERT on CIViC\nEvidence (+0.8% and +0.9% absolute improvement in class-support weighted F1\nscore). All transformer-based models show a clear performance edge when\ncompared to a logistic regression trained on bigram tf-idf scores (+1.5 - 2.7%\nimproved F1 score). We compare the aforementioned BERT-like models to OpenAI's\nGPT-4 in a few-shot setting (on a small subset of our original test dataset),\ndemonstrating that, without additional prompt-engineering or fine-tuning, GPT-4\nperforms worse on CIViC Evidence than our six fine-tuned models (66.1% weighted\nF1 score compared to 71.8% for the best fine-tuned model). However, performance\ngets reasonably close to the benchmark of a logistic regression model trained\non bigram tf-idf scores (67.7% weighted F1 score).\n","authors":["Markus Hisch","Xing David Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04459v1","updated":"2024-07-05T12:09:40Z","published":"2024-07-05T12:09:40Z","title":"Generalists vs. Specialists: Evaluating Large Language Models for Urdu","summary":" In this paper, we compare general-purpose pretrained models, GPT-4-Turbo and\nLlama-3-8b-Instruct with special-purpose models fine-tuned on specific tasks,\nXLM-Roberta-large, mT5-large, and Llama-3-8b-Instruct. We focus on seven\nclassification and six generation tasks to evaluate the performance of these\nmodels on Urdu language. Urdu has 70 million native speakers, yet it remains\nunderrepresented in Natural Language Processing (NLP). Despite the frequent\nadvancements in Large Language Models (LLMs), their performance in low-resource\nlanguages, including Urdu, still needs to be explored. We also conduct a human\nevaluation for the generation tasks and compare the results with the\nevaluations performed by GPT-4-Turbo and Llama-3-8b-Instruct. We find that\nspecial-purpose models consistently outperform general-purpose models across\nvarious tasks. We also find that the evaluation done by GPT-4-Turbo for\ngeneration tasks aligns more closely with human evaluation compared to the\nevaluation by Llama-3-8b-Instruct. This paper contributes to the NLP community\nby providing insights into the effectiveness of general and specific-purpose\nLLMs for low-resource languages.\n","authors":["Samee Arif","Abdul Hameed Azeemi","Agha Ali Raza","Awais Athar"],"pdf_url":"https://arxiv.org/pdf/2407.04459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11534v3","updated":"2024-07-05T12:06:26Z","published":"2023-04-23T04:21:50Z","title":"Graph Neural Networks for Text Classification: A Survey","summary":" Text Classification is the most essential and fundamental problem in Natural\nLanguage Processing. While numerous recent text classification models applied\nthe sequential deep learning technique, graph neural network-based models can\ndirectly deal with complex structured text data and exploit global information.\nMany real text classification applications can be naturally cast into a graph,\nwhich captures words, documents, and corpus global features. In this survey, we\nbring the coverage of methods up to 2023, including corpus-level and\ndocument-level graph neural networks. 
We discuss each of these methods in\ndetail, dealing with the graph construction mechanisms and the graph-based\nlearning process. As well as the technological survey, we look at issues behind\nand future directions addressed in text classification using graph neural\nnetworks. We also cover datasets, evaluation metrics, and experiment design and\npresent a summary of published performance on the publicly available\nbenchmarks. Note that we present a comprehensive comparison between different\ntechniques and identify the pros and cons of various evaluation metrics in this\nsurvey.\n","authors":["Kunze Wang","Yihao Ding","Soyeon Caren Han"],"pdf_url":"https://arxiv.org/pdf/2304.11534v3.pdf","comment":"28 pages, published in Artificial Intelligence Review"},{"id":"http://arxiv.org/abs/2407.04444v1","updated":"2024-07-05T11:54:38Z","published":"2024-07-05T11:54:38Z","title":"TokenVerse: Unifying Speech and NLP Tasks via Transducer-based ASR","summary":" In traditional conversational intelligence from speech, a cascaded pipeline\nis used, involving tasks such as voice activity detection, diarization,\ntranscription, and subsequent processing with different NLP models for tasks\nlike semantic endpointing and named entity recognition (NER). Our paper\nintroduces TokenVerse, a single Transducer-based model designed to handle\nmultiple tasks. This is achieved by integrating task-specific tokens into the\nreference text during ASR model training, streamlining the inference and\neliminating the need for separate NLP models. In addition to ASR, we conduct\nexperiments on 3 different tasks: speaker change detection, endpointing, and\nNER. Our experiments on a public and a private dataset show that the proposed\nmethod improves ASR by up to 7.7% in relative WER while outperforming the\ncascaded pipeline approach in individual task performance. Additionally, we\npresent task transfer learning to a new task within an existing TokenVerse.\n","authors":["Shashi Kumar","Srikanth Madikeri","Juan Zuluaga-Gomez","Iuliia Nigmatulina","Esaú Villatoro-Tello","Sergio Burdisso","Petr Motlicek","Karthik Pandia","Aravind Ganapathiraju"],"pdf_url":"https://arxiv.org/pdf/2407.04444v1.pdf","comment":"5 pages, double column"},{"id":"http://arxiv.org/abs/2406.06399v2","updated":"2024-07-05T11:47:31Z","published":"2024-06-10T15:52:49Z","title":"Should We Fine-Tune or RAG? Evaluating Different Techniques to Adapt\n LLMs for Dialogue","summary":" We study the limitations of Large Language Models (LLMs) for the task of\nresponse generation in human-machine dialogue. Several techniques have been\nproposed in the literature for different dialogue types (e.g., Open-Domain).\nHowever, the evaluations of these techniques have been limited in terms of base\nLLMs, dialogue types and evaluation metrics. In this work, we extensively\nanalyze different LLM adaptation techniques when applied to different dialogue\ntypes. We have selected two base LLMs, Llama-2 and Mistral, and four dialogue\ntypes Open-Domain, Knowledge-Grounded, Task-Oriented, and Question Answering.\nWe evaluate the performance of in-context learning and fine-tuning techniques\nacross datasets selected for each dialogue type. We assess the impact of\nincorporating external knowledge to ground the generation in both scenarios of\nRetrieval-Augmented Generation (RAG) and gold knowledge. We adopt consistent\nevaluation and explainability criteria for automatic metrics and human\nevaluation protocols. 
Our analysis shows that there is no universal\nbest technique for adapting large language models as the efficacy of each\ntechnique depends on both the base LLM and the specific type of dialogue. Last\nbut not least, the assessment of the best adaptation technique should include\nhuman evaluation to avoid false expectations and outcomes derived from\nautomatic metrics.\n","authors":["Simone Alghisi","Massimo Rizzoli","Gabriel Roccabruna","Seyed Mahed Mousavi","Giuseppe Riccardi"],"pdf_url":"https://arxiv.org/pdf/2406.06399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04434v1","updated":"2024-07-05T11:31:30Z","published":"2024-07-05T11:31:30Z","title":"From 'Showgirls' to 'Performers': Fine-tuning with Gender-inclusive\n Language for Bias Reduction in LLMs","summary":" Gender bias is not only prevalent in Large Language Models (LLMs) and their\ntraining data, but also firmly ingrained into the structural aspects of\nlanguage itself. Therefore, adapting linguistic structures within LLM training\ndata to promote gender-inclusivity can make gender representations within the\nmodel more inclusive. The focus of our work is gender-exclusive affixes in\nEnglish, such as in 'show-girl' or 'man-cave', which can perpetuate gender\nstereotypes and binary conceptions of gender. We use an LLM training dataset to\ncompile a catalogue of 692 gender-exclusive terms along with gender-neutral\nvariants and from this, develop a gender-inclusive fine-tuning dataset, the\n'Tiny Heap'. Fine-tuning three different LLMs with this dataset, we observe an\noverall reduction in gender-stereotyping tendencies across the models. Our\napproach provides a practical method for enhancing gender inclusivity in LLM\ntraining data and contributes to incorporating queer-feminist linguistic\nactivism in bias mitigation research in NLP.\n","authors":["Marion Bartl","Susan Leavy"],"pdf_url":"https://arxiv.org/pdf/2407.04434v1.pdf","comment":"10 pages, 5 tables; to appear in Proceedings of the 5th Workshop on\n Gender Bias in Natural Language Processing at ACL 2024"},{"id":"http://arxiv.org/abs/2402.11997v2","updated":"2024-07-05T11:26:51Z","published":"2024-02-19T09:43:03Z","title":"Remember This Event That Year? Assessing Temporal Information and\n Reasoning in Large Language Models","summary":" Large Language Models (LLMs) are increasingly ubiquitous, yet their ability\nto retain and reason about temporal information remains limited, hindering\ntheir application in real-world scenarios where understanding the sequential\nnature of events is crucial. Our study experiments with 12 state-of-the-art\nmodels (ranging from 2B to 70B+ parameters) on a novel numerical-temporal\ndataset, \\textbf{TempUN}, spanning from 10,000 BCE to 2100 CE, to uncover\nsignificant temporal retention and comprehension limitations. We propose six\nmetrics to assess three learning paradigms to enhance temporal knowledge\nacquisition. Our findings reveal that open-source models exhibit knowledge gaps\nmore frequently, suggesting a trade-off between limited knowledge and incorrect\nresponses. Additionally, various fine-tuning approaches significantly improved\nperformance, reducing incorrect outputs and impacting the identification of\n'information not available' in the generations. 
The associated dataset and code\nare available at (https://github.com/lingoiitgn/TempUN).\n","authors":["Himanshu Beniwal","Dishant Patel","Kowsik Nandagopan D","Hritik Ladia","Ankit Yadav","Mayank Singh"],"pdf_url":"https://arxiv.org/pdf/2402.11997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04411v1","updated":"2024-07-05T10:51:33Z","published":"2024-07-05T10:51:33Z","title":"Waterfall: Framework for Robust and Scalable Text Watermarking","summary":" Protecting intellectual property (IP) of text such as articles and code is\nincreasingly important, especially as sophisticated attacks become possible,\nsuch as paraphrasing by large language models (LLMs) or even unauthorized\ntraining of LLMs on copyrighted text to infringe such IP. However, existing\ntext watermarking methods are not robust enough against such attacks nor\nscalable to millions of users for practical implementation. In this paper, we\npropose Waterfall, the first training-free framework for robust and scalable\ntext watermarking applicable across multiple text types (e.g., articles, code)\nand languages supportable by LLMs, for general text and LLM data provenance.\nWaterfall comprises several key innovations, such as being the first to use LLM\nas paraphrasers for watermarking along with a novel combination of techniques\nthat are surprisingly effective in achieving robust verifiability and\nscalability. We empirically demonstrate that Waterfall achieves significantly\nbetter scalability, robust verifiability, and computational efficiency compared\nto SOTA article-text watermarking methods, and also showed how it could be\ndirectly applied to the watermarking of code.\n","authors":["Gregory Kang Ruey Lau","Xinyuan Niu","Hieu Dao","Jiangwei Chen","Chuan-Sheng Foo","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2407.04411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18064v2","updated":"2024-07-05T09:46:33Z","published":"2024-06-26T04:49:41Z","title":"Evaluating Quality of Answers for Retrieval-Augmented Generation: A\n Strong LLM Is All You Need","summary":" We present a comprehensive study of answer quality evaluation in\nRetrieval-Augmented Generation (RAG) applications using vRAG-Eval, a novel\ngrading system that is designed to assess correctness, completeness, and\nhonesty. We further map the grading of quality aspects aforementioned into a\nbinary score, indicating an accept or reject decision, mirroring the intuitive\n\"thumbs-up\" or \"thumbs-down\" gesture commonly used in chat applications. This\napproach suits factual business settings where a clear decision opinion is\nessential. Our assessment applies vRAG-Eval to two Large Language Models\n(LLMs), evaluating the quality of answers generated by a vanilla RAG\napplication. We compare these evaluations with human expert judgments and find\na substantial alignment between GPT-4's assessments and those of human experts,\nreaching 83% agreement on accept or reject decisions. This study highlights the\npotential of LLMs as reliable evaluators in closed-domain, closed-ended\nsettings, particularly when human evaluations require significant resources.\n","authors":["Yang Wang","Alberto Garcia Hernandez","Roman Kyslyi","Nicholas Kersting"],"pdf_url":"https://arxiv.org/pdf/2406.18064v2.pdf","comment":"13 pages, 8 figures, 12 tables"},{"id":"http://arxiv.org/abs/2311.09684v3","updated":"2024-07-05T09:14:11Z","published":"2023-11-16T08:54:52Z","title":"Do Physicians Know How to Prompt? 
The Need for Automatic Prompt\n Optimization Help in Clinical Note Generation","summary":" This study examines the effect of prompt engineering on the performance of\nLarge Language Models (LLMs) in clinical note generation. We introduce an\nAutomatic Prompt Optimization (APO) framework to refine initial prompts and\ncompare the outputs of medical experts, non-medical experts, and APO-enhanced\nGPT3.5 and GPT4. Results highlight GPT4 APO's superior performance in\nstandardizing prompt quality across clinical note sections. A human-in-the-loop\napproach shows that experts maintain content quality post-APO, with a\npreference for their own modifications, suggesting the value of expert\ncustomization. We recommend a two-phase optimization process, leveraging\nAPO-GPT4 for consistency and expert input for personalization.\n","authors":["Zonghai Yao","Ahmed Jaafar","Beining Wang","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2311.09684v3.pdf","comment":"To appear in BioNLP 2024"},{"id":"http://arxiv.org/abs/2407.04368v1","updated":"2024-07-05T09:13:24Z","published":"2024-07-05T09:13:24Z","title":"Romanization Encoding For Multilingual ASR","summary":" We introduce romanization encoding for script-heavy languages to optimize\nmultilingual and code-switching Automatic Speech Recognition (ASR) systems. By\nadopting romanization encoding alongside a balanced concatenated tokenizer\nwithin a FastConformer-RNNT framework equipped with a Roman2Char module, we\nsignificantly reduce vocabulary and output dimensions, enabling larger training\nbatches and reduced memory consumption. Our method decouples acoustic modeling\nand language modeling, enhancing the flexibility and adaptability of the\nsystem. In our study, applying this method to Mandarin-English ASR resulted in\na remarkable 63.51% vocabulary reduction and notable performance gains of\n13.72% and 15.03% on SEAME code-switching benchmarks. Ablation studies on\nMandarin-Korean and Mandarin-Japanese highlight our method's strong capability\nto address the complexities of other script-heavy languages, paving the way for\nmore versatile and effective multilingual ASR systems.\n","authors":["Wen Ding","Fei Jia","Hainan Xu","Yu Xi","Junjie Lai","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2407.04368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09180v3","updated":"2024-07-05T08:51:16Z","published":"2022-05-18T18:57:36Z","title":"Learning Rate Curriculum","summary":" Most curriculum learning methods require an approach to sort the data samples\nby difficulty, which is often cumbersome to perform. In this work, we propose a\nnovel curriculum learning approach termed Learning Rate Curriculum (LeRaC),\nwhich leverages the use of a different learning rate for each layer of a neural\nnetwork to create a data-agnostic curriculum during the initial training\nepochs. More specifically, LeRaC assigns higher learning rates to neural layers\ncloser to the input, gradually decreasing the learning rates as the layers are\nplaced farther away from the input. The learning rates increase at various\npaces during the first training iterations, until they all reach the same\nvalue. From this point on, the neural model is trained as usual. This creates a\nmodel-level curriculum learning strategy that does not require sorting the\nexamples by difficulty and is compatible with any neural network, generating\nhigher performance levels regardless of the architecture. 
We conduct\ncomprehensive experiments on 12 data sets from the computer vision (CIFAR-10,\nCIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC),\nlanguage (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering\nvarious convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5),\nrecurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare\nour approach with the conventional training regime, as well as with Curriculum\nby Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning\napproach. Unlike CBS, our performance improvements over the standard training\nregime are consistent across all data sets and models. Furthermore, we\nsignificantly surpass CBS in terms of training time (there is no additional\ncost over the standard training regime for LeRaC). Our code is freely available\nat: https://github.com/CroitoruAlin/LeRaC.\n","authors":["Florinel-Alin Croitoru","Nicolae-Catalin Ristea","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2205.09180v3.pdf","comment":"Accepted at the International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2402.09631v6","updated":"2024-07-05T08:14:29Z","published":"2024-02-15T00:20:30Z","title":"Representation Surgery: Theory and Practice of Affine Steering","summary":" Language models often exhibit undesirable behavior, e.g., generating toxic or\ngender-biased text. In the case of neural language models, an encoding of the\nundesirable behavior is often present in the model's representations. Thus, one\nnatural (and common) approach to prevent the model from exhibiting undesirable\nbehavior is to steer the model's representations in a manner that reduces the\nprobability of it generating undesirable text. This paper investigates the\nformal and empirical properties of steering functions, i.e., transformation of\nthe neural language model's representations that alter its behavior. First, we\nderive two optimal, in the least-squares sense, affine steering functions under\ndifferent constraints. Our theory provides justification for existing\napproaches and offers a novel, improved steering approach. Second, we offer a\nseries of experiments that demonstrate the empirical effectiveness of the\nmethods in mitigating bias and reducing toxic generation.\n","authors":["Shashwat Singh","Shauli Ravfogel","Jonathan Herzig","Roee Aharoni","Ryan Cotterell","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2402.09631v6.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2406.14191v2","updated":"2024-07-05T07:38:02Z","published":"2024-06-20T10:51:06Z","title":"Temporal Knowledge Graph Question Answering: A Survey","summary":" Knowledge Base Question Answering (KBQA) has been a long-standing field to\nanswer questions based on knowledge bases. Recently, the evolving dynamics of\nknowledge have attracted a growing interest in Temporal Knowledge Graph\nQuestion Answering (TKGQA), an emerging task to answer temporal questions.\nHowever, this field grapples with ambiguities in defining temporal questions\nand lacks a systematic categorization of existing methods for TKGQA. In\nresponse, this paper provides a thorough survey from two perspectives: the\ntaxonomy of temporal questions and the methodological categorization for TKGQA.\nSpecifically, we first establish a detailed taxonomy of temporal questions\nengaged in prior studies. 
Subsequently, we provide a comprehensive review of\nTKGQA techniques of two categories: semantic parsing-based and TKG\nembedding-based. Building on this review, the paper outlines potential research\ndirections aimed at advancing the field of TKGQA. This work aims to serve as a\ncomprehensive reference for TKGQA and to stimulate further research.\n","authors":["Miao Su","Zixuan Li","Zhuo Chen","Long Bai","Xiaolong Jin","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2406.14191v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.04307v1","updated":"2024-07-05T07:22:44Z","published":"2024-07-05T07:22:44Z","title":"Crafting Large Language Models for Enhanced Interpretability","summary":" We introduce the Concept Bottleneck Large Language Model (CB-LLM), a\npioneering approach to creating inherently interpretable Large Language Models\n(LLMs). Unlike traditional black-box LLMs that rely on post-hoc interpretation\nmethods with limited neuron function insights, CB-LLM sets a new standard with\nits built-in interpretability, scalability, and ability to provide clear,\naccurate explanations. This innovation not only advances transparency in\nlanguage models but also enhances their effectiveness. Our unique Automatic\nConcept Correction (ACC) strategy successfully narrows the performance gap with\nconventional black-box LLMs, positioning CB-LLM as a model that combines the\nhigh accuracy of traditional LLMs with the added benefit of clear\ninterpretability -- a feature markedly absent in existing LLMs.\n","authors":["Chung-En Sun","Tuomas Oikarinen","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2407.04307v1.pdf","comment":"Present at ICML 2024 Mechanistic Interpretability (MI) Workshop"},{"id":"http://arxiv.org/abs/2403.20041v3","updated":"2024-07-05T07:18:42Z","published":"2024-03-29T08:26:53Z","title":"Transformer-Lite: High-efficiency Deployment of Large Language Models on\n Mobile Phone GPUs","summary":" The Large Language Model (LLM) is widely employed for tasks such as\nintelligent assistants, text summarization, translation, and multi-modality on\nmobile phones. However, the current methods for on-device LLM deployment\nmaintain slow inference speed, which causes poor user experience. To facilitate\nhigh-efficiency LLM deployment on device GPUs, we propose four optimization\ntechniques: (a) a symbolic expression-based approach to support dynamic shape\nmodel inference; (b) operator optimizations and execution priority setting to\nenhance inference speed and reduce phone lagging; (c) an FP4 quantization\nmethod termed M0E4 to reduce dequantization overhead; (d) a sub-tensor-based\ntechnique to eliminate the need for copying KV cache after LLM inference.\nFurthermore, we implement these methods in our mobile inference engine,\nTransformer-Lite, which is compatible with both Qualcomm and MTK processors. We\nevaluated Transformer-Lite's performance using LLMs with varied architectures\nand parameters ranging from 2B to 14B. Specifically, we achieved prefill and\ndecoding speeds of 121 token/s and 14 token/s for ChatGLM2 6B, and 330 token/s\nand 30 token/s for smaller Gemma 2B, respectively. 
Compared with CPU-based\nFastLLM and GPU-based MLC-LLM, our engine attains over 10x speedup for the\nprefill speed and 2~3x speedup for the decoding speed.\n","authors":["Luchang Li","Sheng Qian","Jie Lu","Lunxi Yuan","Rui Wang","Qin Xie"],"pdf_url":"https://arxiv.org/pdf/2403.20041v3.pdf","comment":"21 pages, 6 figures, fix \"E0M4\" spell mistake, fix FLOPS to TFLOPS"},{"id":"http://arxiv.org/abs/2407.04295v1","updated":"2024-07-05T06:57:30Z","published":"2024-07-05T06:57:30Z","title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","summary":" Large Language Models (LLMs) have performed exceptionally in various\ntext-generative tasks, including question answering, translation, code\ncompletion, etc. However, the over-assistance of LLMs has raised the challenge\nof \"jailbreaking\", which induces the model to generate malicious responses\nagainst the usage policy and society by designing adversarial prompts. With the\nemergence of jailbreak attack methods exploiting different vulnerabilities in\nLLMs, the corresponding safety alignment measures are also evolving. In this\npaper, we propose a comprehensive and detailed taxonomy of jailbreak attack and\ndefense methods. For instance, the attack methods are divided into black-box\nand white-box attacks based on the transparency of the target model. Meanwhile,\nwe classify defense methods into prompt-level and model-level defenses.\nAdditionally, we further subdivide these attack and defense methods into\ndistinct sub-classes and present a coherent diagram illustrating their\nrelationships. We also conduct an investigation into the current evaluation\nmethods and compare them from different perspectives. Our findings aim to\ninspire future research and practical implementations in safeguarding LLMs\nagainst adversarial attacks. Above all, although jailbreak remains a\nsignificant concern within the community, we believe that our work enhances the\nunderstanding of this domain and provides a foundation for developing more\nsecure LLMs.\n","authors":["Sibo Yi","Yule Liu","Zhen Sun","Tianshuo Cong","Xinlei He","Jiaxing Song","Ke Xu","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04293v1","updated":"2024-07-05T06:54:27Z","published":"2024-07-05T06:54:27Z","title":"Systematic Evaluation of Online Speaker Diarization Systems Regarding\n their Latency","summary":" In this paper, different online speaker diarization systems are evaluated on\nthe same hardware with the same test data with regard to their latency. The\nlatency is the time span from audio input to the output of the corresponding\nspeaker label. As part of the evaluation, various model combinations within the\nDIART framework, a diarization system based on the online clustering algorithm\nUIS-RNN-SML, and the end-to-end online diarization system FS-EEND are compared.\nThe lowest latency is achieved for the DIART-pipeline with the embedding model\npyannote/embedding and the segmentation model pyannote/segmentation. The\nFS-EEND system shows a similarly good latency. In general there is currently no\npublished research that compares several online diarization systems in terms of\ntheir latency. 
This makes this work even more relevant.\n","authors":["Roman Aperdannier","Sigurd Schacht","Alexander Piazza"],"pdf_url":"https://arxiv.org/pdf/2407.04293v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2407.04280v1","updated":"2024-07-05T06:25:54Z","published":"2024-07-05T06:25:54Z","title":"LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous\n Speech","summary":" Prevalent ungrammatical expressions and disfluencies in spontaneous speech\nfrom second language (L2) learners pose unique challenges to Automatic Speech\nRecognition (ASR) systems. However, few datasets are tailored to L2 learner\nspeech. We publicly release LearnerVoice, a dataset consisting of 50.04 hours\nof audio and transcriptions of L2 learners' spontaneous speech. Our linguistic\nanalysis reveals that transcriptions in our dataset contain L2S (L2 learner's\nSpontaneous speech) features, consisting of ungrammatical expressions and\ndisfluencies (e.g., filler words, word repetitions, self-repairs, false\nstarts), significantly more than native speech datasets. Fine-tuning\nwhisper-small.en with LearnerVoice achieves a WER of 10.26%, 44.2% lower than\nvanilla whisper-small.en. Furthermore, our qualitative analysis indicates that\n54.2% of errors from the vanilla model on LearnerVoice are attributable to L2S\nfeatures, with 48.1% of them being reduced in the fine-tuned model.\n","authors":["Haechan Kim","Junho Myung","Seoyoung Kim","Sungpah Lee","Dongyeop Kang","Juho Kim"],"pdf_url":"https://arxiv.org/pdf/2407.04280v1.pdf","comment":"Accepted for INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.04279v1","updated":"2024-07-05T06:25:34Z","published":"2024-07-05T06:25:34Z","title":"BiosERC: Integrating Biography Speakers Supported by LLMs for ERC Tasks","summary":" In the Emotion Recognition in Conversation task, recent investigations have\nutilized attention mechanisms exploring relationships among utterances from\nintra- and inter-speakers for modeling emotional interaction between them.\nHowever, attributes such as speaker personality traits remain unexplored and\npresent challenges in terms of their applicability to other tasks or\ncompatibility with diverse model architectures. Therefore, this work introduces\na novel framework named BiosERC, which investigates speaker characteristics in\na conversation. By employing Large Language Models (LLMs), we extract the\n\"biographical information\" of the speaker within a conversation as\nsupplementary knowledge injected into the model to classify emotional labels\nfor each utterance. Our proposed method achieved state-of-the-art (SOTA)\nresults on three famous benchmark datasets: IEMOCAP, MELD, and EmoryNLP,\ndemonstrating the effectiveness and generalization of our model and showcasing\nits potential for adaptation to various conversation analysis tasks. Our source\ncode is available at https://github.com/yingjie7/BiosERC.\n","authors":["Jieying Xue","Minh Phuong Nguyen","Blake Matheny","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.04279v1.pdf","comment":"Accepted in the 33rd International Conference on Artificial Neural\n Networks (ICANN 2024)"},{"id":"http://arxiv.org/abs/2406.10118v2","updated":"2024-07-05T05:28:20Z","published":"2024-06-14T15:23:39Z","title":"SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for\n Southeast Asian Languages","summary":" Southeast Asia (SEA) is a region rich in linguistic diversity and cultural\nvariety, with over 1,300 indigenous languages and a population of 671 million\npeople. 
However, prevailing AI models suffer from a significant lack of\nrepresentation of texts, images, and audio datasets from SEA, compromising the\nquality of AI models for SEA languages. Evaluating models for SEA languages is\nchallenging due to the scarcity of high-quality datasets, compounded by the\ndominance of English training data, raising concerns about potential cultural\nmisrepresentation. To address these challenges, we introduce SEACrowd, a\ncollaborative initiative that consolidates a comprehensive resource hub that\nfills the resource gap by providing standardized corpora in nearly 1,000 SEA\nlanguages across three modalities. Through our SEACrowd benchmarks, we assess\nthe quality of AI models on 36 indigenous languages across 13 tasks, offering\nvaluable insights into the current AI landscape in SEA. Furthermore, we propose\nstrategies to facilitate greater AI advancements, maximizing potential utility\nand resource equity for the future of AI in SEA.\n","authors":["Holy Lovenia","Rahmad Mahendra","Salsabil Maulana Akbar","Lester James V. Miranda","Jennifer Santoso","Elyanah Aco","Akhdan Fadhilah","Jonibek Mansurov","Joseph Marvin Imperial","Onno P. Kampman","Joel Ruben Antony Moniz","Muhammad Ravi Shulthan Habibi","Frederikus Hudi","Railey Montalan","Ryan Ignatius","Joanito Agili Lopo","William Nixon","Börje F. Karlsson","James Jaya","Ryandito Diandaru","Yuze Gao","Patrick Amadeus","Bin Wang","Jan Christian Blaise Cruz","Chenxi Whitehouse","Ivan Halim Parmonangan","Maria Khelli","Wenyu Zhang","Lucky Susanto","Reynard Adha Ryanda","Sonny Lazuardi Hermawan","Dan John Velasco","Muhammad Dehan Al Kautsar","Willy Fitra Hendria","Yasmin Moslem","Noah Flynn","Muhammad Farid Adilazuarda","Haochen Li","Johanes Lee","R. Damanhuri","Shuo Sun","Muhammad Reza Qorib","Amirbek Djanibekov","Wei Qi Leong","Quyet V. Do","Niklas Muennighoff","Tanrada Pansuwan","Ilham Firdausi Putra","Yan Xu","Ngee Chia Tai","Ayu Purwarianti","Sebastian Ruder","William Tjhi","Peerat Limkonchotiwat","Alham Fikri Aji","Sedrick Keh","Genta Indra Winata","Ruochen Zhang","Fajri Koto","Zheng-Xin Yong","Samuel Cahyawijaya"],"pdf_url":"https://arxiv.org/pdf/2406.10118v2.pdf","comment":"https://github.com/SEACrowd"},{"id":"http://arxiv.org/abs/2406.11012v4","updated":"2024-07-05T05:18:00Z","published":"2024-06-16T17:10:32Z","title":"Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs\n Using the New York Times Connections Word Game","summary":" The New York Times Connections game has emerged as a popular and challenging\npursuit for word puzzle enthusiasts. We collect 200 Connections games to\nevaluate the performance of state-of-the-art large language models (LLMs)\nagainst expert and novice human players. Our results show that even the\nbest-performing LLM, GPT-4o, which has otherwise shown impressive reasoning\nabilities on a wide variety of benchmarks, can only fully solve 8% of the\ngames. Compared to GPT-4o, novice and expert players perform better, with\nexpert human players significantly outperforming GPT-4o. To deepen our\nunderstanding we create a taxonomy of the knowledge types required to\nsuccessfully categorize words in the Connections game, revealing that LLMs\nstruggle with associative, encyclopedic, and linguistic knowledge. 
Our findings\nestablish the New York Times Connections game as a challenging benchmark for\nevaluating abstract reasoning capabilities in humans and AI systems.\n","authors":["Prisha Samadarshi","Mariam Mustafa","Anushka Kulkarni","Raven Rothkopf","Tuhin Chakrabarty","Smaranda Muresan"],"pdf_url":"https://arxiv.org/pdf/2406.11012v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12045v2","updated":"2024-07-05T04:57:54Z","published":"2024-04-18T09:58:51Z","title":"RAM: Towards an Ever-Improving Memory System by Learning from\n Communications","summary":" We introduce an innovative RAG-based framework with an ever-improving memory.\nInspired by humans' pedagogical process, RAM utilizes recursively\nreasoning-based retrieval and experience reflections to continually update the\nmemory and learn from users' communicative feedback, namely communicative\nlearning. Extensive experiments with both simulated and real users demonstrate\nsignificant improvements over traditional RAG and self-knowledge methods,\nparticularly excelling in handling false premise and multi-hop questions.\nFurthermore, RAM exhibits promising adaptability to various feedback and\nretrieval methods, showcasing its potential for advancing AI capabilities in\ndynamic knowledge acquisition and lifelong learning.\n","authors":["Jiaqi Li","Xiaobo Wang","Wentao Ding","Zihao Wang","Yipeng Kang","Zixia Jia","Zilong Zheng"],"pdf_url":"https://arxiv.org/pdf/2404.12045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04251v1","updated":"2024-07-05T04:38:17Z","published":"2024-07-05T04:38:17Z","title":"Unified Interpretation of Smoothing Methods for Negative Sampling Loss\n Functions in Knowledge Graph Embedding","summary":" Knowledge Graphs (KGs) are fundamental resources in knowledge-intensive tasks\nin NLP. Due to the limitation of manually creating KGs, KG Completion (KGC) has\nan important role in automatically completing KGs by scoring their links with\nKG Embedding (KGE). To handle many entities in training, KGE relies on Negative\nSampling (NS) loss that can reduce the computational cost by sampling. Since\nthe appearance frequencies for each link are at most one in KGs, sparsity is an\nessential and inevitable problem. The NS loss is no exception. As a solution,\nthe NS loss in KGE relies on smoothing methods like Self-Adversarial Negative\nSampling (SANS) and subsampling. However, it is uncertain what kind of\nsmoothing method is suitable for this purpose due to the lack of theoretical\nunderstanding. This paper provides theoretical interpretations of the smoothing\nmethods for the NS loss in KGE and induces a new NS loss, Triplet Adaptive\nNegative Sampling (TANS), that can cover the characteristics of the\nconventional smoothing methods. 
Experimental results of TransE, DistMult,\nComplEx, RotatE, HAKE, and HousE on FB15k-237, WN18RR, and YAGO3-10 datasets\nand their sparser subsets show the soundness of our interpretation and\nperformance improvement by our TANS.\n","authors":["Xincan Feng","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.04251v1.pdf","comment":"9 pages, 4 figures, 2 tables; accepted to workshop RepL4NLP held in\n conjunction with ACL 2024"},{"id":"http://arxiv.org/abs/2407.04247v1","updated":"2024-07-05T04:28:46Z","published":"2024-07-05T04:28:46Z","title":"ArAIEval Shared Task: Propagandistic Techniques Detection in Unimodal\n and Multimodal Arabic Content","summary":" We present an overview of the second edition of the ArAIEval shared task,\norganized as part of the ArabicNLP 2024 conference co-located with ACL 2024. In\nthis edition, ArAIEval offers two tasks: (i) detection of propagandistic\ntextual spans with persuasion techniques identification in tweets and news\narticles, and (ii) distinguishing between propagandistic and non-propagandistic\nmemes. A total of 14 teams participated in the final evaluation phase, with 6\nand 9 teams participating in Tasks 1 and 2, respectively. Finally, 11 teams\nsubmitted system description papers. Across both tasks, we observed that\nfine-tuning transformer models such as AraBERT was at the core of the majority\nof the participating systems. We provide a description of the task setup,\nincluding a description of the dataset construction and the evaluation setup.\nWe further provide a brief overview of the participating systems. All datasets\nand evaluation scripts are released to the research community\n(https://araieval.gitlab.io/). We hope this will enable further research on\nthese important tasks in Arabic.\n","authors":["Maram Hasanain","Md. Arid Hasan","Fatema Ahmed","Reem Suwaileh","Md. Rafiul Biswas","Wajdi Zaghouani","Firoj Alam"],"pdf_url":"https://arxiv.org/pdf/2407.04247v1.pdf","comment":"propaganda, span detection, disinformation, misinformation, fake\n news, LLMs, GPT-4, multimodality, multimodal LLMs"},{"id":"http://arxiv.org/abs/2403.05023v2","updated":"2024-07-05T04:10:06Z","published":"2024-03-08T03:55:27Z","title":"Towards Multimodal Sentiment Analysis Debiasing via Bias Purification","summary":" Multimodal Sentiment Analysis (MSA) aims to understand human intentions by\nintegrating emotion-related clues from diverse modalities, such as visual,\nlanguage, and audio. Unfortunately, the current MSA task invariably suffers\nfrom unplanned dataset biases, particularly multimodal utterance-level label\nbias and word-level context bias. These harmful biases potentially mislead\nmodels to focus on statistical shortcuts and spurious correlations, causing\nsevere performance bottlenecks. To alleviate these issues, we present a\nMultimodal Counterfactual Inference Sentiment (MCIS) analysis framework based\non causality rather than conventional likelihood. Concretely, we first\nformulate a causal graph to discover harmful biases from already-trained\nvanilla models. In the inference phase, given a factual multimodal input, MCIS\nimagines two counterfactual scenarios to purify and mitigate these biases.\nThen, MCIS can make unbiased decisions from biased observations by comparing\nfactual and counterfactual outcomes. We conduct extensive experiments on\nseveral standard MSA benchmarks. 
Qualitative and quantitative results show the\neffectiveness of the proposed framework.\n","authors":["Dingkang Yang","Mingcheng Li","Dongling Xiao","Yang Liu","Kun Yang","Zhaoyu Chen","Yuzheng Wang","Peng Zhai","Ke Li","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05023v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.01906v2","updated":"2024-07-05T03:23:59Z","published":"2024-07-02T03:11:13Z","title":"Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for\n Sparse Architectural Large Language Models","summary":" Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large\nLanguage Models (LLMs) with constrained resources. Although there have been\nvarious PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture\nLLMs is still underexplored. In this work, we study the PEFT method for LLMs\nwith the Mixture-of-Experts (MoE) architecture and the contents of this work\nare mainly threefold: (1) We investigate the dispersion degree of the activated\nexperts in customized tasks, and found that the routing distribution for a\nspecific task tends to be highly concentrated, while the distribution of\nactivated experts varies significantly across different tasks. (2) We propose\nExpert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant\nto downstream tasks while freezing the other experts and modules; experimental\nresults demonstrate that our method not only improves the tuning efficiency,\nbut also matches or even surpasses the performance of full-parameter\nfine-tuning. (3) We further analyze the impact of the MoE architecture on\nexpert-specialized fine-tuning. We find that MoE models with finer-grained\nexperts are more advantageous in selecting the combination of experts that are\nmost relevant to downstream tasks, thereby enhancing both the training\nefficiency and effectiveness. Our code is available at\nhttps://github.com/deepseek-ai/ESFT.\n","authors":["Zihan Wang","Deli Chen","Damai Dai","Runxin Xu","Zhuoshu Li","Y. Wu"],"pdf_url":"https://arxiv.org/pdf/2407.01906v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01695v2","updated":"2024-07-05T02:49:47Z","published":"2024-01-24T03:11:36Z","title":"Language-Guided World Models: A Model-Based Approach to AI Control","summary":" This paper introduces the concept of Language-Guided World Models (LWMs) --\nprobabilistic models that can simulate environments by reading texts. Agents\nequipped with these models provide humans with more extensive and efficient\ncontrol, allowing them to simultaneously alter agent behaviors in multiple\ntasks via natural verbal communication. In this work, we take initial steps in\ndeveloping robust LWMs that can generalize to compositionally novel language\ndescriptions. We design a challenging world modeling benchmark based on the\ngame of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that\nrequire varying degrees of compositional generalization. Our experiments reveal\nthe lack of generalizability of the state-of-the-art Transformer model, as it\noffers marginal improvements in simulation quality over a no-text baseline. We\ndevise a more robust model by fusing the Transformer with the EMMA attention\nmechanism (Hanjie et al., 2021). Our model substantially outperforms the\nTransformer and approaches the performance of a model with an oracle semantic\nparsing and grounding capability. 
To demonstrate the practicality of this model\nin improving AI safety and transparency, we simulate a scenario in which the\nmodel enables an agent to present plans to a human before execution, and to\nrevise plans based on their language feedback.\n","authors":["Alex Zhang","Khanh Nguyen","Jens Tuyls","Albert Lin","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2402.01695v2.pdf","comment":"SpLU-RoboNLP workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2406.18069v3","updated":"2024-07-05T01:25:13Z","published":"2024-06-26T04:54:45Z","title":"Large Language Models for Cuffless Blood Pressure Measurement From\n Wearable Biosignals","summary":" Large language models (LLMs) have captured significant interest from both\nacademia and industry due to their impressive performance across various\ntextual tasks. However, the potential of LLMs to analyze physiological\ntime-series data remains an emerging research field. Particularly, there is a\nnotable gap in the utilization of LLMs for analyzing wearable biosignals to\nachieve cuffless blood pressure (BP) measurement, which is critical for the\nmanagement of cardiovascular diseases. This paper presents the first work to\nexplore the capacity of LLMs to perform cuffless BP estimation based on\nwearable biosignals. We extracted physiological features from electrocardiogram\n(ECG) and photoplethysmogram (PPG) signals and designed context-enhanced\nprompts by combining these features with BP domain knowledge and user\ninformation. Subsequently, we adapted LLMs to BP estimation tasks through\nfine-tuning. To evaluate the proposed approach, we conducted assessments of ten\nadvanced LLMs using a comprehensive public dataset of wearable biosignals from\n1,272 participants. The experimental results demonstrate that the optimally\nfine-tuned LLM significantly surpasses conventional task-specific baselines,\nachieving an estimation error of 0.00 $\\pm$ 9.25 mmHg for systolic BP and 1.29\n$\\pm$ 6.37 mmHg for diastolic BP. Notably, the ablation studies highlight the\nbenefits of our context enhancement strategy, leading to an 8.9% reduction in\nmean absolute error for systolic BP estimation. This paper pioneers the\nexploration of LLMs for cuffless BP measurement, providing a potential solution\nto enhance the accuracy of cuffless BP measurement.\n","authors":["Zengding Liu","Chen Chen","Jiannong Cao","Minglei Pan","Jikui Liu","Nan Li","Fen Miao","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2406.18069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10426v2","updated":"2024-07-05T00:59:45Z","published":"2024-02-16T03:24:56Z","title":"DELL: Generating Reactions and Explanations for LLM-Based Misinformation\n Detection","summary":" Large language models are limited by challenges in factuality and\nhallucinations to be directly employed off-the-shelf for judging the veracity\nof news articles, where factual accuracy is paramount. 
In this work, we propose\nDELL that identifies three key stages in misinformation detection where LLMs\ncould be incorporated as part of the pipeline: 1) LLMs could \\emph{generate\nnews reactions} to represent diverse perspectives and simulate user-news\ninteraction networks; 2) LLMs could \\emph{generate explanations} for proxy\ntasks (e.g., sentiment, stance) to enrich the contexts of news articles and\nproduce experts specializing in various aspects of news understanding; 3) LLMs\ncould \\emph{merge task-specific experts} and provide an overall prediction by\nincorporating the predictions and confidence scores of varying experts.\nExtensive experiments on seven datasets with three LLMs demonstrate that DELL\noutperforms state-of-the-art baselines by up to 16.8\\% in macro f1-score.\nFurther analysis reveals that the generated reactions and explanations are\ngreatly helpful in misinformation detection, while our proposed LLM-guided\nexpert merging helps produce better-calibrated predictions.\n","authors":["Herun Wan","Shangbin Feng","Zhaoxuan Tan","Heng Wang","Yulia Tsvetkov","Minnan Luo"],"pdf_url":"https://arxiv.org/pdf/2402.10426v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2407.04699v1","updated":"2024-07-05T17:59:58Z","published":"2024-07-05T17:59:58Z","title":"LaRa: Efficient Large-Baseline Radiance Fields","summary":" Radiance field methods have achieved photorealistic novel view synthesis and\ngeometry reconstruction. But they are mostly applied in per-scene optimization\nor small-baseline settings. While several recent works investigate feed-forward\nreconstruction with large baselines by utilizing transformers, they all operate\nwith a standard global attention mechanism and hence ignore the local nature of\n3D reconstruction. We propose a method that unifies local and global reasoning\nin transformer layers, resulting in improved quality and faster convergence.\nOur model represents scenes as Gaussian Volumes and combines this with an image\nencoder and Group Attention Layers for efficient feed-forward reconstruction.\nExperimental results demonstrate that our model, trained for two days on four\nGPUs, demonstrates high fidelity in reconstructing 360° radiance fields, and\nrobustness to zero-shot and out-of-domain testing.\n","authors":["Anpei Chen","Haofei Xu","Stefano Esposito","Siyu Tang","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2407.04699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14868v2","updated":"2024-07-05T17:59:57Z","published":"2024-05-23T17:59:52Z","title":"Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis","summary":" Accurate reconstruction of complex dynamic scenes from just a single\nviewpoint continues to be a challenging task in computer vision. Current\ndynamic novel view synthesis methods typically require videos from many\ndifferent camera viewpoints, necessitating careful recording setups, and\nsignificantly restricting their utility in the wild as well as in terms of\nembodied AI applications. In this paper, we propose $\\textbf{GCD}$, a\ncontrollable monocular dynamic view synthesis pipeline that leverages\nlarge-scale diffusion priors to, given a video of any scene, generate a\nsynchronous video from any other chosen perspective, conditioned on a set of\nrelative camera pose parameters. 
Our model does not require depth as input, and\ndoes not explicitly model 3D scene geometry, instead performing end-to-end\nvideo-to-video translation in order to achieve its goal efficiently. Despite\nbeing trained on synthetic multi-view video data only, zero-shot real-world\ngeneralization experiments show promising results in multiple domains,\nincluding robotics, object permanence, and driving environments. We believe our\nframework can potentially unlock powerful applications in rich dynamic scene\nunderstanding, perception for robotics, and interactive 3D video viewing\nexperiences for virtual reality.\n","authors":["Basile Van Hoorick","Rundi Wu","Ege Ozguroglu","Kyle Sargent","Ruoshi Liu","Pavel Tokmakov","Achal Dave","Changxi Zheng","Carl Vondrick"],"pdf_url":"https://arxiv.org/pdf/2405.14868v2.pdf","comment":"Accepted to ECCV 2024. Project webpage is available at:\n https://gcd.cs.columbia.edu/"},{"id":"http://arxiv.org/abs/2407.04697v1","updated":"2024-07-05T17:59:02Z","published":"2024-07-05T17:59:02Z","title":"VCoME: Verbal Video Composition with Multimodal Editing Effects","summary":" Verbal videos, featuring voice-overs or text overlays, provide valuable\ncontent but present significant challenges in composition, especially when\nincorporating editing effects to enhance clarity and visual appeal. In this\npaper, we introduce the novel task of verbal video composition with editing\neffects. This task aims to generate coherent and visually appealing verbal\nvideos by integrating multimodal editing effects across textual, visual, and\naudio categories. To achieve this, we curate a large-scale dataset of video\neffects compositions from publicly available sources. We then formulate this\ntask as a generative problem, involving the identification of appropriate\npositions in the verbal content and the recommendation of editing effects for\nthese positions. To address this task, we propose VCoME, a general framework\nthat employs a large multimodal model to generate editing effects for video\ncomposition. Specifically, VCoME takes in the multimodal video context and\nautoregressively outputs where to apply effects within the verbal content and\nwhich effects are most appropriate for each position. VCoME also supports\nprompt-based control of composition density and style, providing substantial\nflexibility for diverse applications. Through extensive quantitative and\nqualitative evaluations, we clearly demonstrate the effectiveness of VCoME. A\ncomprehensive user study shows that our method produces videos of professional\nquality while being 85$\\times$ more efficient than professional editors.\n","authors":["Weibo Gong","Xiaojie Jin","Xin Li","Dongliang He","Xinglong Wu"],"pdf_url":"https://arxiv.org/pdf/2407.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04689v1","updated":"2024-07-05T17:50:38Z","published":"2024-07-05T17:50:38Z","title":"RAM: Retrieval-Based Affordance Transfer for Generalizable Zero-Shot\n Robotic Manipulation","summary":" This work proposes a retrieve-and-transfer framework for zero-shot robotic\nmanipulation, dubbed RAM, featuring generalizability across various objects,\nenvironments, and embodiments. Unlike existing approaches that learn\nmanipulation from expensive in-domain demonstrations, RAM capitalizes on a\nretrieval-based affordance transfer paradigm to acquire versatile manipulation\ncapabilities from abundant out-of-domain data. 
First, RAM extracts unified\naffordance at scale from diverse sources of demonstrations including robotic\ndata, human-object interaction (HOI) data, and custom data to construct a\ncomprehensive affordance memory. Then given a language instruction, RAM\nhierarchically retrieves the most similar demonstration from the affordance\nmemory and transfers such out-of-domain 2D affordance to in-domain 3D\nexecutable affordance in a zero-shot and embodiment-agnostic manner. Extensive\nsimulation and real-world evaluations demonstrate that our RAM consistently\noutperforms existing works in diverse daily tasks. Additionally, RAM shows\nsignificant potential for downstream applications such as automatic and\nefficient data collection, one-shot visual imitation, and LLM/VLM-integrated\nlong-horizon manipulation. For more details, please check our website at\nhttps://yxkryptonite.github.io/RAM/.\n","authors":["Yuxuan Kuang","Junjie Ye","Haoran Geng","Jiageng Mao","Congyue Deng","Leonidas Guibas","He Wang","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04688v1","updated":"2024-07-05T17:50:35Z","published":"2024-07-05T17:50:35Z","title":"Enhancing Vehicle Re-identification and Matching for Weaving Analysis","summary":" Vehicle weaving on highways contributes to traffic congestion, raises safety\nissues, and underscores the need for sophisticated traffic management systems.\nCurrent tools are inadequate in offering precise and comprehensive data on\nlane-specific weaving patterns. This paper introduces an innovative method for\ncollecting non-overlapping video data in weaving zones, enabling the generation\nof quantitative insights into lane-specific weaving behaviors. Our experimental\nresults confirm the efficacy of this approach, delivering critical data that\ncan assist transportation authorities in enhancing traffic control and roadway\ninfrastructure.\n","authors":["Mei Qiu","Wei Lin","Stanley Chien","Lauren Christopher","Yaobin Chen","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2407.04688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04687v1","updated":"2024-07-05T17:50:30Z","published":"2024-07-05T17:50:30Z","title":"Embracing Massive Medical Data","summary":" As massive medical data become available with an increasing number of scans,\nexpanding classes, and varying sources, prevalent training paradigms -- where\nAI is trained with multiple passes over fixed, finite datasets -- face\nsignificant challenges. First, training AI all at once on such massive data is\nimpractical as new scans/sources/classes continuously arrive. Second, training\nAI continuously on new scans/sources/classes can lead to catastrophic\nforgetting, where AI forgets old data as it learns new data, and vice versa. To\naddress these two challenges, we propose an online learning method that enables\ntraining AI from massive medical data. Instead of repeatedly training AI on\nrandomly selected data samples, our method identifies the most significant\nsamples for the current AI model based on their data uniqueness and prediction\nuncertainty, then trains the AI on these selective data samples. 
Compared with\nprevalent training paradigms, our method not only improves data efficiency by\nenabling training on continual data streams, but also mitigates catastrophic\nforgetting by selectively training AI on significant data samples that might\notherwise be forgotten, outperforming by 15% in Dice score for multi-organ and\ntumor segmentation.\n The code is available at https://github.com/MrGiovanni/OnlineLearning\n","authors":["Yu-Cheng Chou","Zongwei Zhou","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2407.04687v1.pdf","comment":"Accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.04683v1","updated":"2024-07-05T17:44:08Z","published":"2024-07-05T17:44:08Z","title":"Efficient Betti Matching Enables Topology-Aware 3D Segmentation via\n Persistent Homology","summary":" In this work, we propose an efficient algorithm for the calculation of the\nBetti matching, which can be used as a loss function to train topology aware\nsegmentation networks. Betti matching loss builds on techniques from\ntopological data analysis, specifically persistent homology. A major challenge\nis the computational cost of computing persistence barcodes. In response to\nthis challenge, we propose a new, highly optimized implementation of Betti\nmatching, implemented in C++ together with a python interface, which achieves\nsignificant speedups compared to the state-of-the-art implementation Cubical\nRipser. We use Betti matching 3D to train segmentation networks with the Betti\nmatching loss and demonstrate improved topological correctness of predicted\nsegmentations across several datasets. The source code is available at\nhttps://github.com/nstucki/Betti-Matching-3D.\n","authors":["Nico Stucki","Vincent Bürgin","Johannes C. Paetzold","Ulrich Bauer"],"pdf_url":"https://arxiv.org/pdf/2407.04683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04681v1","updated":"2024-07-05T17:43:30Z","published":"2024-07-05T17:43:30Z","title":"Rethinking Visual Prompting for Multimodal Large Language Models with\n External Knowledge","summary":" In recent years, multimodal large language models (MLLMs) have made\nsignificant strides by training on vast high-quality image-text datasets,\nenabling them to generally understand images well. However, the inherent\ndifficulty in explicitly conveying fine-grained or spatially dense information\nin text, such as masks, poses a challenge for MLLMs, limiting their ability to\nanswer questions requiring an understanding of detailed or localized visual\nelements. Drawing inspiration from the Retrieval-Augmented Generation (RAG)\nconcept, this paper proposes a new visual prompt approach to integrate\nfine-grained external knowledge, gleaned from specialized vision models (e.g.,\ninstance segmentation/OCR models), into MLLMs. This is a promising yet\nunderexplored direction for enhancing MLLMs' performance. Our approach diverges\nfrom concurrent works, which transform external knowledge into additional text\nprompts, necessitating the model to indirectly learn the correspondence between\nvisual content and text coordinates. 
Instead, we propose embedding fine-grained\nknowledge information directly into a spatial embedding map as a visual prompt.\nThis design can be effortlessly incorporated into various MLLMs, such as LLaVA\nand Mipha, considerably improving their visual understanding performance.\nThrough rigorous experiments, we demonstrate that our method can enhance MLLM\nperformance across nine benchmarks, amplifying their fine-grained context-aware\ncapabilities.\n","authors":["Yuanze Lin","Yunsheng Li","Dongdong Chen","Weijian Xu","Ronald Clark","Philip Torr","Lu Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.04681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04676v1","updated":"2024-07-05T17:39:03Z","published":"2024-07-05T17:39:03Z","title":"Is plantar thermography a valid digital biomarker for characterising\n diabetic foot ulceration risk?","summary":" Background: In the absence of prospective data on diabetic foot ulcers (DFU),\ncross-sectional associations with causal risk factors (peripheral neuropathy,\nand peripheral arterial disease (PAD)) could be used to establish the validity\nof plantar thermography for DFU risk stratification.\n Methods: First, we investigated the associations between the intrinsic\nclusters of plantar thermographic images with several DFU risk factors using an\nunsupervised deep-learning framework. We then studied associations between\nobtained thermography clusters and DFU risk factors. Second, to identify those\nassociations with predictive power, we used supervised learning to train\nConvolutional Neural Network (CNN) regression/classification models that\npredicted the risk factor based on the thermograph (and visual) input.\n Findings: Our dataset comprised 282 thermographs from type 2 diabetes\nmellitus patients (aged 56.31 +- 9.18 years, 51.42 % males). On clustering, we\nfound two overlapping clusters (silhouette score = 0.10, indicating weak\nseparation). There was strong evidence for associations between assigned\nclusters and several factors related to diabetic foot ulceration such as\nperipheral neuropathy, PAD, number of diabetes complications, and composite DFU\nrisk prediction scores such as Martins-Mendes, PODUS-2020, and SIGN. However,\nmodels predicting said risk factors had poor performances.\n Interpretation: The strong associations between intrinsic thermography\nclusters and several DFU risk factors support the validity of using\nthermography for characterising DFU risk. However, obtained associations did\nnot prove to be predictive, likely due to, spectrum bias, or because\nthermography and classical risk factors characterise incompletely overlapping\nportions of the DFU risk construct. Our findings highlight the challenges in\nstandardising ground truths when defining novel digital biomarkers.\n","authors":["Akshay Jagadeesh","Chanchanok Aramrat","Aqsha Nur","Poppy Mallinson","Sanjay Kinra"],"pdf_url":"https://arxiv.org/pdf/2407.04676v1.pdf","comment":"13 pages, 2 Figures, 1 Table. Supplementary files and link to code to\n be uploaded"},{"id":"http://arxiv.org/abs/2407.04663v1","updated":"2024-07-05T17:18:46Z","published":"2024-07-05T17:18:46Z","title":"Unsupervised 4D Cardiac Motion Tracking with Spatiotemporal Optical Flow\n Networks","summary":" Cardiac motion tracking from echocardiography can be used to estimate and\nquantify myocardial motion within a cardiac cycle. It is a cost-efficient and\neffective approach for assessing myocardial function. 
However, ultrasound\nimaging has the inherent characteristics of spatially low resolution and\ntemporally random noise, which leads to difficulties in obtaining reliable\nannotation. Thus it is difficult to perform supervised learning for motion\ntracking. In addition, there is no end-to-end unsupervised method currently in\nthe literature. This paper presents a motion tracking method where unsupervised\noptical flow networks are designed with spatial reconstruction loss and\ntemporal-consistency loss. Our proposed loss functions make use of the\npair-wise and temporal correlation to estimate cardiac motion from noisy\nbackground. Experiments using a synthetic 4D echocardiography dataset have shown\nthe effectiveness of our approach, and its superiority over existing methods on\nboth accuracy and running speed. To the best of our knowledge, this is the\nfirst work performed that uses an unsupervised end-to-end deep learning optical\nflow network for 4D cardiac motion tracking.\n","authors":["Long Teng","Wei Feng","Menglong Zhu","Xinchao Li"],"pdf_url":"https://arxiv.org/pdf/2407.04663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01864v2","updated":"2024-07-05T17:17:48Z","published":"2024-07-02T00:43:41Z","title":"Research on target detection method of distracted driving behavior based\n on improved YOLOv8","summary":" With the development of deep learning technology, the detection and\nclassification of distracted driving behaviour requires higher accuracy.\nExisting deep learning-based methods are computationally intensive and\nparameter redundant, limiting the efficiency and accuracy in practical\napplications. To solve this problem, this study proposes an improved YOLOv8\ndetection method based on the original YOLOv8 model by integrating the BoTNet\nmodule, GAM attention mechanism and EIoU loss function. By optimising the\nfeature extraction and multi-scale feature fusion strategies, the training and\ninference processes are simplified, and the detection accuracy and efficiency\nare significantly improved. Experimental results show that the improved model\nperforms well in both detection speed and accuracy, with an accuracy rate of\n99.4%, and the model is smaller and easy to deploy, which is able to identify\nand classify distracted driving behaviours in real time, provide timely\nwarnings, and enhance driving safety.\n","authors":["Shiquan Shen","Zhizhong Wu","Pan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01864v2.pdf","comment":"Major revision on content, no replacement available soon"},{"id":"http://arxiv.org/abs/2407.04651v1","updated":"2024-07-05T17:07:25Z","published":"2024-07-05T17:07:25Z","title":"SAM Fewshot Finetuning for Anatomical Segmentation in Medical Images","summary":" We propose a straightforward yet highly effective few-shot fine-tuning\nstrategy for adapting the Segment Anything (SAM) to anatomical segmentation\ntasks in medical images. Our novel approach revolves around reformulating the\nmask decoder within SAM, leveraging few-shot embeddings derived from a limited\nset of labeled images (few-shot collection) as prompts for querying anatomical\nobjects captured in image embeddings. This innovative reformulation greatly\nreduces the need for time-consuming online user interactions for labeling\nvolumetric images, such as exhaustively marking points and bounding boxes to\nprovide prompts slice by slice. 
With our method, users can manually segment a\nfew 2D slices offline, and the embeddings of these annotated image regions\nserve as effective prompts for online segmentation tasks. Our method\nprioritizes the efficiency of the fine-tuning process by exclusively training\nthe mask decoder through caching mechanisms while keeping the image encoder\nfrozen. Importantly, this approach is not limited to volumetric medical images,\nbut can generically be applied to any 2D/3D segmentation task. To thoroughly\nevaluate our method, we conducted extensive validation on four datasets,\ncovering six anatomical segmentation tasks across two modalities. Furthermore,\nwe conducted a comparative analysis of different prompting options within SAM\nand the fully-supervised nnU-Net. The results demonstrate the superior\nperformance of our method compared to SAM employing only point prompts\n(approximately 50% improvement in IoU) and performs on-par with fully\nsupervised methods whilst reducing the requirement of labeled data by at least\nan order of magnitude.\n","authors":["Weiyi Xie","Nathalie Willems","Shubham Patil","Yang Li","Mayank Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.04651v1.pdf","comment":"9 pages, Proceedings of the IEEE/CVF Winter Conference on\n Applications of Computer Vision. 2024"},{"id":"http://arxiv.org/abs/2403.06459v2","updated":"2024-07-05T17:02:33Z","published":"2024-03-11T06:46:31Z","title":"From Pixel to Cancer: Cellular Automata in Computed Tomography","summary":" AI for cancer detection encounters the bottleneck of data scarcity,\nannotation difficulty, and low prevalence of early tumors. Tumor synthesis\nseeks to create artificial tumors in medical images, which can greatly\ndiversify the data and annotations for AI training. However, current tumor\nsynthesis approaches are not applicable across different organs due to their\nneed for specific expertise and design. This paper establishes a set of generic\nrules to simulate tumor development. Each cell (pixel) is initially assigned a\nstate between zero and ten to represent the tumor population, and a tumor can\nbe developed based on three rules to describe the process of growth, invasion,\nand death. We apply these three generic rules to simulate tumor\ndevelopment--from pixel to cancer--using cellular automata. We then integrate\nthe tumor state into the original computed tomography (CT) images to generate\nsynthetic tumors across different organs. This tumor synthesis approach allows\nfor sampling tumors at multiple stages and analyzing tumor-organ interaction.\nClinically, a reader study involving three expert radiologists reveals that the\nsynthetic tumors and their developing trajectories are convincingly realistic.\nTechnically, we analyze and simulate tumor development at various stages using\n9,262 raw, unlabeled CT images sourced from 68 hospitals worldwide. 
The\nperformance in segmenting tumors in the liver, pancreas, and kidneys exceeds\nprevailing literature benchmarks, underlining the immense potential of tumor\nsynthesis, especially for earlier cancer detection.\n The code and models are available at\nhttps://github.com/MrGiovanni/Pixel2Cancer\n","authors":["Yuxiang Lai","Xiaoxi Chen","Angtian Wang","Alan Yuille","Zongwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.06459v2.pdf","comment":"Early accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2404.04687v2","updated":"2024-07-05T16:58:15Z","published":"2024-04-06T17:23:43Z","title":"Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion","summary":" Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent\ntechnique in computer vision and graphics for reconstructing 3D scenes. GS\nrepresents a scene as a set of 3D Gaussians with varying opacities and employs\na computationally efficient splatting operation along with analytical\nderivatives to compute the 3D Gaussian parameters given scene images captured\nfrom various viewpoints. Unfortunately, capturing surround view ($360^{\\circ}$\nviewpoint) images is impossible or impractical in many real-world imaging\nscenarios, including underwater imaging, rooms inside a building, and\nautonomous navigation. In these restricted baseline imaging scenarios, the GS\nalgorithm suffers from a well-known 'missing cone' problem, which results in\npoor reconstruction along the depth axis. In this manuscript, we demonstrate\nthat using transient data (from sonars) allows us to address the missing cone\nproblem by sampling high-frequency data along the depth axis. We extend the\nGaussian splatting algorithms for two commonly used sonars and propose fusion\nalgorithms that simultaneously utilize RGB camera data and sonar data. Through\nsimulations, emulations, and hardware experiments across various imaging\nscenarios, we show that the proposed fusion algorithms lead to significantly\nbetter novel view synthesis (5 dB improvement in PSNR) and 3D geometry\nreconstruction (60% lower Chamfer distance).\n","authors":["Ziyuan Qu","Omkar Vengurlekar","Mohamad Qadri","Kevin Zhang","Michael Kaess","Christopher Metzler","Suren Jayasuriya","Adithya Pediredla"],"pdf_url":"https://arxiv.org/pdf/2404.04687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04638v1","updated":"2024-07-05T16:49:21Z","published":"2024-07-05T16:49:21Z","title":"Semi-Supervised Segmentation via Embedding Matching","summary":" Deep convolutional neural networks are widely used in medical image\nsegmentation but require many labeled images for training. Annotating\nthree-dimensional medical images is a time-consuming and costly process. To\novercome this limitation, we propose a novel semi-supervised segmentation\nmethod that leverages mostly unlabeled images and a small set of labeled images\nin training. Our approach involves assessing prediction uncertainty to identify\nreliable predictions on unlabeled voxels from the teacher model. These voxels\nserve as pseudo-labels for training the student model. In voxels where the\nteacher model produces unreliable predictions, pseudo-labeling is carried out\nbased on voxel-wise embedding correspondence using reference voxels from\nlabeled images. We applied this method to automate hip bone segmentation in CT\nimages, achieving notable results with just 4 CT scans. 
The proposed approach\nyielded a Hausdorff distance with 95th percentile (HD95) of 3.30 and IoU of\n0.929, surpassing existing methods achieving HD95 (4.07) and IoU (0.927) at\ntheir best.\n","authors":["Weiyi Xie","Nathalie Willems","Nikolas Lessmann","Tom Gibbons","Daniele De Massari"],"pdf_url":"https://arxiv.org/pdf/2407.04638v1.pdf","comment":"13 pages, MIDL2024 oral"},{"id":"http://arxiv.org/abs/2407.04621v1","updated":"2024-07-05T16:27:00Z","published":"2024-07-05T16:27:00Z","title":"OneRestore: A Universal Restoration Framework for Composite Degradation","summary":" In real-world scenarios, image impairments often manifest as composite\ndegradations, presenting a complex interplay of elements such as low light,\nhaze, rain, and snow. Despite this reality, existing restoration methods\ntypically target isolated degradation types, thereby falling short in\nenvironments where multiple degrading factors coexist. To bridge this gap, our\nstudy proposes a versatile imaging model that consolidates four physical\ncorruption paradigms to accurately represent complex, composite degradation\nscenarios. In this context, we propose OneRestore, a novel transformer-based\nframework designed for adaptive, controllable scene restoration. The proposed\nframework leverages a unique cross-attention mechanism, merging degraded scene\ndescriptors with image features, allowing for nuanced restoration. Our model\nallows versatile input scene descriptors, ranging from manual text embeddings\nto automatic extractions based on visual attributes. Our methodology is further\nenhanced through a composite degradation restoration loss, using extra degraded\nimages as negative samples to fortify model constraints. Comparative results on\nsynthetic and real-world datasets demonstrate OneRestore as a superior\nsolution, significantly advancing the state-of-the-art in addressing complex,\ncomposite degradations.\n","authors":["Yu Guo","Yuan Gao","Yuxu Lu","Huilin Zhu","Ryan Wen Liu","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2407.04621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04619v1","updated":"2024-07-05T16:20:48Z","published":"2024-07-05T16:20:48Z","title":"CountGD: Multi-Modal Open-World Counting","summary":" The goal of this paper is to improve the generality and accuracy of\nopen-vocabulary object counting in images. To improve the generality, we\nrepurpose an open-vocabulary detection foundation model (GroundingDINO) for the\ncounting task, and also extend its capabilities by introducing modules to\nenable specifying the target object to count by visual exemplars. 
In turn,\nthese new capabilities - being able to specify the target object by\nmulti-modalities (text and exemplars) - lead to an improvement in counting\naccuracy.\n We make three contributions: First, we introduce the first open-world\ncounting model, CountGD, where the prompt can be specified by a text\ndescription or visual exemplars or both; Second, we show that the performance\nof the model significantly improves the state of the art on multiple counting\nbenchmarks - when using text only, CountGD is comparable to or outperforms all\nprevious text-only works, and when using both text and visual exemplars, we\noutperform all previous models; Third, we carry out a preliminary study into\ndifferent interactions between the text and visual exemplar prompts, including\nthe cases where they reinforce each other and where one restricts the other.\nThe code and an app to test the model are available at\nhttps://www.robots.ox.ac.uk/~vgg/research/countgd/.\n","authors":["Niki Amini-Naieni","Tengda Han","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2407.04619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04616v1","updated":"2024-07-05T16:14:53Z","published":"2024-07-05T16:14:53Z","title":"Isomorphic Pruning for Vision Models","summary":" Structured pruning reduces the computational overhead of deep neural networks\nby removing redundant sub-structures. However, assessing the relative\nimportance of different sub-structures remains a significant challenge,\nparticularly in advanced vision models featuring novel mechanisms and\narchitectures like self-attention, depth-wise convolutions, or residual\nconnections. These heterogeneous substructures usually exhibit diverged\nparameter scales, weight distributions, and computational topology, introducing\nconsiderable difficulty to importance comparison. To overcome this, we present\nIsomorphic Pruning, a simple approach that demonstrates effectiveness across a\nrange of network architectures such as Vision Transformers and CNNs, and\ndelivers competitive performance across different model sizes. Isomorphic\nPruning originates from an observation that, when evaluated under a pre-defined\nimportance criterion, heterogeneous sub-structures demonstrate significant\ndivergence in their importance distribution, as opposed to isomorphic\nstructures that present similar importance patterns. This inspires us to\nperform isolated ranking and comparison on different types of sub-structures\nfor more reliable pruning. Our empirical results on ImageNet-1K demonstrate\nthat Isomorphic Pruning surpasses several pruning baselines dedicatedly\ndesigned for Transformers or CNNs. For instance, we improve the accuracy of\nDeiT-Tiny from 74.52% to 77.50% by pruning an off-the-shelf DeiT-Base model.\nAnd for ConvNext-Tiny, we enhanced performance from 82.06% to 82.18%, while\nreducing the number of parameters and memory usage. Code is available at\n\\url{https://github.com/VainF/Isomorphic-Pruning}.\n","authors":["Gongfan Fang","Xinyin Ma","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13216v2","updated":"2024-07-05T16:07:13Z","published":"2023-12-20T17:35:24Z","title":"Improving Semantic Correspondence with Viewpoint-Guided Spherical Maps","summary":" Recent progress in self-supervised representation learning has resulted in\nmodels that are capable of extracting image features that are not only\neffective at encoding image level, but also pixel-level, semantics. 
These\nfeatures have been shown to be effective for dense visual semantic\ncorrespondence estimation, even outperforming fully-supervised methods.\nNevertheless, current self-supervised approaches still fail in the presence of\nchallenging image characteristics such as symmetries and repeated parts. To\naddress these limitations, we propose a new approach for semantic\ncorrespondence estimation that supplements discriminative self-supervised\nfeatures with 3D understanding via a weak geometric spherical prior. Compared\nto more involved 3D pipelines, our model only requires weak viewpoint\ninformation, and the simplicity of our spherical representation enables us to\ninject informative geometric priors into the model during training. We propose\na new evaluation metric that better accounts for repeated part and\nsymmetry-induced mistakes. We present results on the challenging SPair-71k\ndataset, where we show that our approach is capable of\ndistinguishing between symmetric views and repeated parts across many object\ncategories, and also demonstrate that we can generalize to unseen classes on\nthe AwA dataset.\n","authors":["Octave Mariotti","Oisin Mac Aodha","Hakan Bilen"],"pdf_url":"https://arxiv.org/pdf/2312.13216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08514v3","updated":"2024-07-05T16:03:27Z","published":"2023-05-15T10:23:14Z","title":"Generative Adversarial Networks for Spatio-Spectral Compression of\n Hyperspectral Images","summary":" The development of deep learning-based models for the compression of\nhyperspectral images (HSIs) has recently attracted great attention in remote\nsensing due to the sharp growth of hyperspectral data archives. Most of the\nexisting models achieve either spectral or spatial compression, and do not\njointly consider the spatio-spectral redundancies present in HSIs. To address\nthis problem, in this paper we focus our attention on the High Fidelity\nCompression (HiFiC) model (which is proven to be highly effective for spatial\ncompression problems) and adapt it to perform spatio-spectral compression of\nHSIs. In detail, we introduce two new models: i) HiFiC using Squeeze and\nExcitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC with 3D\nconvolutions (denoted as HiFiC$_{3D}$) in the framework of compression of HSIs.\nWe analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$ in compressing\nthe spatio-spectral redundancies with channel attention and inter-dependency\nanalysis. Experimental results show the efficacy of the proposed models in\nperforming spatio-spectral compression, while reconstructing images at reduced\nbitrates with higher reconstruction quality. The code of the proposed models is\npublicly available at https://git.tu-berlin.de/rsim/HSI-SSC .\n","authors":["Martin Hermann Paul Fuchs","Akshara Preethy Byju","Alisa Walda","Behnood Rasti","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2305.08514v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16204v2","updated":"2024-07-05T15:59:24Z","published":"2023-12-23T11:10:43Z","title":"Learning from Mistakes: Iterative Prompt Relabeling for Text-to-Image\n Diffusion Model Training","summary":" Diffusion models have shown impressive performance in many domains, including\nimage generation, time series prediction, and reinforcement learning. The\nalgorithm demonstrates superior performance over the traditional GAN and\ntransformer-based methods. 
However, the model's capability to follow natural\nlanguage instructions (e.g., spatial relationships between objects, generating\ncomplex scenes) is still unsatisfactory. It has been an important research area\nto enhance such capability. Prior works have shown that using Reinforcement\nLearning can effectively train diffusion models to enhance fidelity on specific\nobjectives. However, existing RL methods require collecting a large amount of\ndata to train an effective reward model. They also don't receive feedback when\nthe generated image is incorrect. In this work, we propose Iterative Prompt\nRelabeling (IPR), a novel algorithm that aligns images to text through\niterative image sampling and prompt relabeling. IPR first samples a batch of\nimages conditioned on the text then relabels the text prompts of unmatched\ntext-image pairs with classifier feedback. We conduct thorough experiments on\nSDv2 and SDXL, testing their capability to follow instructions on spatial\nrelations. With IPR, we improved up to 15.22% (absolute improvement) on the\nchallenging spatial relation VISOR benchmark, demonstrating superior\nperformance compared to previous RL methods.\n","authors":["Xinyan Chen","Jiaxin Ge","Tianjun Zhang","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04604v1","updated":"2024-07-05T15:53:04Z","published":"2024-07-05T15:53:04Z","title":"PartCraft: Crafting Creative Objects by Parts","summary":" This paper propels creative control in generative visual AI by allowing users\nto \"select\". Departing from traditional text or sketch-based methods, we for\nthe first time allow users to choose visual concepts by parts for their\ncreative endeavors. The outcome is fine-grained generation that precisely\ncaptures selected visual concepts, ensuring a holistically faithful and\nplausible result. To achieve this, we first parse objects into parts through\nunsupervised feature clustering. Then, we encode parts into text tokens and\nintroduce an entropy-based normalized attention loss that operates on them.\nThis loss design enables our model to learn generic prior topology knowledge\nabout object's part composition, and further generalize to novel part\ncompositions to ensure the generation looks holistically faithful. Lastly, we\nemploy a bottleneck encoder to project the part tokens. This not only enhances\nfidelity but also accelerates learning, by leveraging shared knowledge and\nfacilitating information exchange among instances. Visual results in the paper\nand supplementary material showcase the compelling power of PartCraft in\ncrafting highly customized, innovative creations, exemplified by the \"charming\"\nand creative birds. Code is released at https://github.com/kamwoh/partcraft.\n","authors":["Kam Woh Ng","Xiatian Zhu","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2407.04604v1.pdf","comment":"ECCV 2024. arXiv admin note: substantial text overlap with\n arXiv:2311.15477"},{"id":"http://arxiv.org/abs/2407.04603v1","updated":"2024-07-05T15:52:23Z","published":"2024-07-05T15:52:23Z","title":"AWT: Transferring Vision-Language Models via Augmentation, Weighting,\n and Transportation","summary":" Pre-trained vision-language models (VLMs) have shown impressive results in\nvarious visual classification tasks. However, we often fail to fully unleash\ntheir potential when adapting them for new concept understanding due to limited\ninformation on new classes. 
To address this limitation, we introduce a novel\nadaptation framework, AWT (Augment, Weight, then Transport). AWT comprises\nthree key components: augmenting inputs with diverse visual perspectives and\nenriched class descriptions through image transformations and language models;\ndynamically weighting inputs based on the prediction entropy; and employing\noptimal transport to mine semantic correlations in the vision-language space.\nAWT can be seamlessly integrated into various VLMs, enhancing their zero-shot\ncapabilities without additional training and facilitating few-shot learning\nthrough an integrated multimodal adapter module. We verify AWT in multiple\nchallenging scenarios, including zero-shot and few-shot image classification,\nzero-shot video action recognition, and out-of-distribution generalization. AWT\nconsistently outperforms the state-of-the-art methods in each setting. In\naddition, our extensive studies further demonstrate AWT's effectiveness and\nadaptability across different VLMs, architectures, and scales.\n","authors":["Yuhan Zhu","Yuyang Ji","Zhiyu Zhao","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05330v2","updated":"2024-07-05T15:50:10Z","published":"2023-10-09T01:23:08Z","title":"A Lightweight Video Anomaly Detection Model with Weak Supervision and\n Adaptive Instance Selection","summary":" Video anomaly detection aims to determine whether there are any abnormal\nevents, behaviors or objects in a given video, which enables effective and\nintelligent public safety management. As video anomaly labeling is both\ntime-consuming and expensive, most existing works employ unsupervised or weakly\nsupervised learning methods. This paper focuses on weakly supervised video\nanomaly detection, in which the training videos are labeled according to whether or not they\ncontain any anomalies, but there is no information about the frames in which the\nanomalies are located. However, the uncertainty of weakly labeled data and the\nlarge model size prevent existing methods from wide deployment in real\nscenarios, especially in resource-limited situations such as edge computing. In\nthis paper, we develop a lightweight video anomaly detection model. On the one\nhand, we propose an adaptive instance selection strategy, which is based on the\nmodel's current status to select confident instances, thereby mitigating the\nuncertainty of weakly labeled data and subsequently promoting the model's\nperformance. On the other hand, we design a lightweight multi-level temporal\ncorrelation attention module and an hourglass-shaped fully connected layer to\nconstruct the model, which can reduce the model parameters to only 0.56\\% of\nthe existing methods (e.g. RTFM). 
Our extensive experiments on two public\ndatasets, UCF-Crime and ShanghaiTech, show that our model can achieve a comparable\nor even superior AUC score compared to the state-of-the-art methods, with a\nsignificantly reduced number of model parameters.\n","authors":["Yang Wang","Jiaogen Zhou","Jihong Guan"],"pdf_url":"https://arxiv.org/pdf/2310.05330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04597v1","updated":"2024-07-05T15:44:53Z","published":"2024-07-05T15:44:53Z","title":"Feature Attenuation of Defective Representation Can Resolve Incomplete\n Masking on Anomaly Detection","summary":" In unsupervised anomaly detection (UAD) research, while state-of-the-art\nmodels have reached a saturation point with extensive studies on public\nbenchmark datasets, they adopt large-scale tailor-made neural networks (NN) for\ndetection performance or pursue unified models for various tasks. Towards edge\ncomputing, it is necessary to develop a computationally efficient and scalable\nsolution that avoids large-scale complex NNs. Motivated by this, we aim to\noptimize the UAD performance with minimal changes to NN settings. Thus, we\nrevisit the reconstruction-by-inpainting approach and rethink how to improve it by\nanalyzing its strengths and weaknesses. The strength of the SOTA methods is a\nsingle deterministic masking approach that addresses the challenges of random\nmultiple masking, namely inference latency and output inconsistency.\nNevertheless, the issue of failure to provide a mask that completely covers\nanomalous regions is a remaining weakness. To mitigate this issue, we propose\nFeature Attenuation of Defective Representation (FADeR), which employs only two\nMLP layers to attenuate feature information of anomaly reconstruction\nduring decoding. By leveraging FADeR, features of unseen anomaly patterns are\nreconstructed into seen normal patterns, reducing false alarms. Experimental\nresults demonstrate that FADeR achieves enhanced performance compared to\nsimilar-scale NNs. Furthermore, our approach exhibits scalability in\nperformance enhancement when integrated with other single deterministic masking\nmethods in a plug-and-play manner.\n","authors":["YeongHyeon Park","Sungho Kang","Myung Jin Kim","Hyeong Seok Kim","Juneho Yi"],"pdf_url":"https://arxiv.org/pdf/2407.04597v1.pdf","comment":"11 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.13658v3","updated":"2024-07-05T15:42:25Z","published":"2024-03-20T15:06:49Z","title":"Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics\n Instability Detection","summary":" Recent advancements in non-invasive detection of cardiac hemodynamic\ninstability (CHDI) primarily focus on applying machine learning techniques to a\nsingle data modality, e.g. cardiac magnetic resonance imaging (MRI). Despite\ntheir potential, these approaches often fall short especially when the size of\nlabeled patient data is limited, a common challenge in the medical domain.\nFurthermore, only a few studies have explored multimodal methods to study CHDI,\nwhich mostly rely on costly modalities such as cardiac MRI and echocardiogram.\nIn response to these limitations, we propose a novel multimodal variational\nautoencoder ($\\text{CardioVAE}_\\text{X,G}$) to integrate low-cost chest X-ray\n(CXR) and electrocardiogram (ECG) modalities with pre-training on a large\nunlabeled dataset. 
Specifically, $\\text{CardioVAE}_\\text{X,G}$ introduces a\nnovel tri-stream pre-training strategy to learn both shared and\nmodality-specific features, thus enabling fine-tuning with both unimodal and\nmultimodal datasets. We pre-train $\\text{CardioVAE}_\\text{X,G}$ on a large,\nunlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then\nfine-tune the pre-trained model on a labeled dataset of $795$ subjects from the\nASPIRE registry. Comprehensive evaluations against existing methods show that\n$\\text{CardioVAE}_\\text{X,G}$ offers promising performance (AUROC $=0.79$ and\nAccuracy $=0.77$), representing a significant step forward in non-invasive\nprediction of CHDI. Our model also excels in producing fine interpretations of\npredictions directly associated with clinical features, thereby supporting\nclinical decision-making.\n","authors":["Mohammod N. I. Suvon","Prasun C. Tripathi","Wenrui Fan","Shuo Zhou","Xianyuan Liu","Samer Alabed","Venet Osmani","Andrew J. Swift","Chen Chen","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04592v1","updated":"2024-07-05T15:40:39Z","published":"2024-07-05T15:40:39Z","title":"Smell and Emotion: Recognising emotions in smell-related artworks","summary":" Emotions and smell are underrepresented in digital art history. In this\nexploratory work, we show that recognising emotions from smell-related artworks\nis technically feasible but has room for improvement. Using style transfer and\nhyperparameter optimization we achieve a minor performance boost and open up\nthe field for future extensions.\n","authors":["Vishal Patoliya","Mathias Zinnen","Andreas Maier","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2407.04592v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.04590v1","updated":"2024-07-05T15:40:11Z","published":"2024-07-05T15:40:11Z","title":"SH17: A Dataset for Human Safety and Personal Protective Equipment\n Detection in Manufacturing Industry","summary":" Workplace accidents continue to pose significant risks for human safety,\nparticularly in industries such as construction and manufacturing, and the\nnecessity for effective Personal Protective Equipment (PPE) compliance has\nbecome increasingly paramount. Our research focuses on the development of\nnon-invasive techniques based on the Object Detection (OD) and Convolutional\nNeural Network (CNN) to detect and verify the proper use of various types of\nPPE such as helmets, safety glasses, masks, and protective clothing. This study\nproposes the SH17 Dataset, consisting of 8,099 annotated images containing\n75,994 instances of 17 classes collected from diverse industrial environments,\nto train and validate the OD models. We have trained state-of-the-art OD models\nfor benchmarking, and initial results demonstrate promising accuracy levels\nwith You Only Look Once (YOLO)v9-e model variant exceeding 70.9% in PPE\ndetection. The performance of the model validation on cross-domain datasets\nsuggests that integrating these technologies can significantly improve safety\nmanagement systems, providing a scalable and efficient solution for industries\nstriving to meet human safety regulations and protect their workforce. 
The\ndataset is available at https://github.com/ahmadmughees/sh17dataset.\n","authors":["Hafiz Mughees Ahmad","Afshin Rahimi"],"pdf_url":"https://arxiv.org/pdf/2407.04590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02584v2","updated":"2024-07-05T15:37:15Z","published":"2024-05-30T20:48:10Z","title":"Planetary Causal Inference: Implications for the Geography of Poverty","summary":" Earth observation data such as satellite imagery can, when combined with\nmachine learning, have far-reaching impacts on our understanding of the\ngeography of poverty through the prediction of living conditions, especially\nwhere government-derived economic indicators are either unavailable or\npotentially untrustworthy. Recent work has progressed in using Earth\nObservation (EO) data not only to predict spatial economic outcomes but also to\nexplore cause and effect, an understanding which is critical for downstream\npolicy analysis. In this review, we first document the growth of interest in\nusing satellite images together with EO data in causal analysis. We then trace\nthe relationship between spatial statistics and machine learning methods before\ndiscussing four ways in which EO data has been used in causal machine learning\npipelines -- (1.) poverty outcome imputation for downstream causal analysis,\n(2.) EO image deconfounding, (3.) EO-based treatment effect heterogeneity, and\n(4.) EO-based transportability analysis. We conclude by providing a\nstep-by-step workflow for how researchers can incorporate EO data in causal ML\nanalysis going forward, outlining major choices of data, models, and evaluation\nmetrics.\n","authors":["Kazuki Sakamoto","Connor T. Jerzak","Adel Daoud"],"pdf_url":"https://arxiv.org/pdf/2406.02584v2.pdf","comment":"For a full list of the papers found in the quantitative literature\n search, see https://github.com/AIandGlobalDevelopmentLab/eo-poverty-review"},{"id":"http://arxiv.org/abs/2407.04587v1","updated":"2024-07-05T15:32:07Z","published":"2024-07-05T15:32:07Z","title":"Multimodal Classification via Modal-Aware Interactive Enhancement","summary":" Due to the notorious modality imbalance problem, multimodal learning (MML)\nleads to the phenomenon of optimization imbalance, thus struggling to achieve\nsatisfactory performance. Recently, some representative methods have been\nproposed to boost the performance, mainly focusing on adaptively adjusting the\noptimization of each modality to rebalance the learning speed of dominant and\nnon-dominant modalities. To better facilitate the interaction of model\ninformation in multimodal learning, in this paper, we propose a novel\nmultimodal learning method, called modal-aware interactive enhancement (MIE).\nSpecifically, we first utilize an optimization strategy based on sharpness\naware minimization (SAM) to smooth the learning objective during the forward\nphase. Then, with the help of the geometry property of SAM, we propose a\ngradient modification strategy to impose the influence between different\nmodalities during the backward phase. Therefore, we can improve the\ngeneralization ability and alleviate the modality forgetting phenomenon\nsimultaneously for multimodal learning. 
Extensive experiments on widely used\ndatasets demonstrate that our proposed method can outperform various\nstate-of-the-art baselines to achieve the best performance.\n","authors":["Qing-Yuan Jiang","Zhouyang Chi","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.04587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18119v2","updated":"2024-07-05T15:23:58Z","published":"2024-05-28T12:28:12Z","title":"Low-Resource Crop Classification from Multi-Spectral Time Series Using\n Lossless Compressors","summary":" Deep learning has significantly improved the accuracy of crop classification\nusing multispectral temporal data. However, these models have complex\nstructures with numerous parameters, requiring large amounts of data and costly\ntraining. In low-resource situations with fewer labeled samples, deep learning\nmodels perform poorly due to insufficient data. Conversely, compressors are\ndata-type agnostic, and non-parametric methods do not bring underlying\nassumptions. Inspired by this insight, we propose a non-training alternative to\ndeep learning models, aiming to address these situations. Specifically, the\nSymbolic Representation Module is proposed to convert the reflectivity into\nsymbolic representations. The symbolic representations are then\ncross-transformed in both the channel and time dimensions to generate symbolic\nembeddings. Next, the Multi-scale Normalised Compression Distance (MNCD) is\ndesigned to measure the correlation between any two symbolic embeddings.\nFinally, based on the MNCDs, high quality crop classification can be achieved\nusing only a k-nearest-neighbor classifier kNN. The entire framework is\nready-to-use and lightweight. Without any training, it outperformed, on\naverage, 7 advanced deep learning models trained at scale on three benchmark\ndatasets. It also outperforms more than half of these models in the few-shot\nsetting with sparse crop labels. Therefore, the high performance and robustness\nof our non-training framework makes it truly applicable to real-world crop\nmapping. Codes are available at:\nhttps://github.com/qinfengsama/Compressor-Based-Crop-Mapping.\n","authors":["Wei Cheng","Hongrui Ye","Xiao Wen","Jiachen Zhang","Jiping Xu","Feifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.18119v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.03305v2","updated":"2024-07-05T14:54:59Z","published":"2024-07-03T17:47:59Z","title":"Advanced Smart City Monitoring: Real-Time Identification of Indian\n Citizen Attributes","summary":" This project focuses on creating a smart surveillance system for Indian\ncities that can identify and analyze people's attributes in real time. 
Using\nadvanced technologies like artificial intelligence and machine learning, the\nsystem can recognize attributes such as upper body color, what the person is\nwearing, accessories they are wearing, headgear, etc., and analyze behavior\nthrough cameras installed around the city.\n","authors":["Shubham Kale","Shashank Sharma","Abhilash Khuntia"],"pdf_url":"https://arxiv.org/pdf/2407.03305v2.pdf","comment":"6 pages , 8 figure , changed title and some alignment issue were\n resolved, but other contents remains same"},{"id":"http://arxiv.org/abs/2401.09673v3","updated":"2024-07-05T14:51:55Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) generates new images by combining the style of\none image with the content of another. However, unauthorized NST can exploit\nartwork, raising concerns about artists' rights and motivating the development\nof proactive protection methods. We propose Locally Adaptive Adversarial Color\nAttack (LAACA), empowering artists to protect their artwork from unauthorized\nstyle transfer by processing before public release. By delving into the\nintricacies of human visual perception and the role of different frequency\ncomponents, our method strategically introduces frequency-adaptive\nperturbations in the image. These perturbations significantly degrade the\ngeneration quality of NST while maintaining an acceptable level of visual\nchange in the original image, ensuring that potential infringers are\ndiscouraged from using the protected artworks, because of its bad NST\ngeneration quality. Additionally, existing metrics often overlook the\nimportance of color fidelity in evaluating color-mattered tasks, such as the\nquality of NST-generated images, which is crucial in the context of artistic\nworks. To comprehensively assess the color-mattered tasks, we propose the\nAdversarial Color Distance Metric (ACDM), designed to quantify the color\ndifference of images pre- and post-manipulations. Experimental results confirm\nthat attacking NST using LAACA results in visually inferior style transfer, and\nthe ACDM can efficiently measure color-mattered tasks. By providing artists\nwith a tool to safeguard their intellectual property, our work relieves the\nsocio-technical challenges posed by the misuse of NST in the art community.\n","authors":["Zhongliang Guo","Junhao Dong","Yifei Qian","Kaixuan Wang","Weiye Li","Ziheng Guo","Yuheng Wang","Yanli Li","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v3.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.00636v2","updated":"2024-07-05T14:48:52Z","published":"2024-03-01T16:16:51Z","title":"Graph Theory and GNNs to Unravel the Topographical Organization of Brain\n Lesions in Variants of Alzheimer's Disease Progression","summary":" In this study, we proposed and evaluated a graph-based framework to assess\nvariations in Alzheimer's disease (AD) neuropathologies, focusing on classic\n(cAD) and rapid (rpAD) progression forms. Histopathological images are\nconverted into tau-pathology-based (i.e., amyloid plaques and tau tangles)\ngraphs, and derived metrics are used in a machine-learning classifier. This\nclassifier incorporates SHAP value explainability to differentiate between cAD\nand rpAD. 
Furthermore, we tested graph neural networks (GNNs) to extract\ntopological embeddings from the graphs and use them in classifying the\nprogression forms of AD. The analysis demonstrated denser networks in rpAD and\na distinctive impact on brain cortical layers: rpAD predominantly affects\nmiddle layers, whereas cAD influences both superficial and deep layers of the\nsame cortical regions. These results suggest a unique neuropathological network\norganization for each AD variant.\n","authors":["Gabriel Jimenez","Leopold Hebert-Stevens","Benoit Delatour","Lev Stimmer","Daniel Racoceanu"],"pdf_url":"https://arxiv.org/pdf/2403.00636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04560v1","updated":"2024-07-05T14:48:19Z","published":"2024-07-05T14:48:19Z","title":"Real Time Emotion Analysis Using Deep Learning for Education,\n Entertainment, and Beyond","summary":" The significance of emotion detection is increasing in education,\nentertainment, and various other domains. We are developing a system that can\nidentify and transform facial expressions into emojis to provide immediate\nfeedback.The project consists of two components. Initially, we will employ\nsophisticated image processing techniques and neural networks to construct a\ndeep learning model capable of precisely categorising facial expressions. Next,\nwe will develop a basic application that records live video using the camera on\nyour device. The app will utilise a sophisticated model to promptly analyse\nfacial expressions and promptly exhibit corresponding emojis.Our objective is\nto develop a dynamic tool that integrates deep learning and real-time video\nprocessing for the purposes of online education, virtual events, gaming, and\nenhancing user experience. This tool enhances interactions and introduces novel\nemotional intelligence technologies.\n","authors":["Abhilash Khuntia","Shubham Kale"],"pdf_url":"https://arxiv.org/pdf/2407.04560v1.pdf","comment":"8 pages, 23 figures"},{"id":"http://arxiv.org/abs/2407.04559v1","updated":"2024-07-05T14:48:15Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. 
Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04545v1","updated":"2024-07-05T14:30:24Z","published":"2024-07-05T14:30:24Z","title":"Gaussian Eigen Models for Human Heads","summary":" We present personalized Gaussian Eigen Models (GEMs) for human heads, a novel\nmethod that compresses dynamic 3D Gaussians into low-dimensional linear spaces.\nOur approach is inspired by the seminal work of Blanz and Vetter, where a\nmesh-based 3D morphable model (3DMM) is constructed from registered meshes.\nBased on dynamic 3D Gaussians, we create a lower-dimensional representation of\nprimitives that applies to most 3DGS head avatars. Specifically, we propose a\nuniversal method to distill the appearance of a mesh-controlled UNet Gaussian\navatar using an ensemble of linear eigenbasis. We replace heavy CNN-based\narchitectures with a single linear layer improving speed and enabling a range\nof real-time downstream applications. To create a particular facial expression,\none simply needs to perform a dot product between the eigen coefficients and\nthe distilled basis. This efficient method removes the requirement for an input\nmesh during testing, enhancing simplicity and speed in expression generation.\nThis process is highly efficient and supports real-time rendering on everyday\ndevices, leveraging the effectiveness of standard Gaussian Splatting. In\naddition, we demonstrate how the GEM can be controlled using a ResNet-based\nregression architecture. We show and compare self-reenactment and cross-person\nreenactment to state-of-the-art 3D avatar methods, demonstrating higher quality\nand better control. A real-time demo showcases the applicability of the GEM\nrepresentation.\n","authors":["Wojciech Zielonka","Timo Bolkart","Thabo Beeler","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2407.04545v1.pdf","comment":"https://zielon.github.io/gem/"},{"id":"http://arxiv.org/abs/2407.04542v1","updated":"2024-07-05T14:29:12Z","published":"2024-07-05T14:29:12Z","title":"Rethinking Image Compression on the Web with Generative AI","summary":" The rapid growth of the Internet, driven by social media, web browsing, and\nvideo streaming, has made images central to the Web experience, resulting in\nsignificant data transfer and increased webpage sizes. Traditional image\ncompression methods, while reducing bandwidth, often degrade image quality.\nThis paper explores a novel approach using generative AI to reconstruct images\nat the edge or client-side. 
We develop a framework that leverages text prompts\nand provides additional conditioning inputs like Canny edges and color palettes\nto a text-to-image model, achieving up to 99.8% bandwidth savings in the best\ncases and 92.6% on average, while maintaining high perceptual similarity.\nEmpirical analysis and a user study show that our method preserves image\nmeaning and structure more effectively than traditional compression methods,\noffering a promising solution for reducing bandwidth usage and improving\nInternet affordability with minimal degradation in image quality.\n","authors":["Shayan Ali Hassan","Danish Humair","Ihsan Ayyub Qazi","Zafar Ayyub Qazi"],"pdf_url":"https://arxiv.org/pdf/2407.04542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04538v1","updated":"2024-07-05T14:24:37Z","published":"2024-07-05T14:24:37Z","title":"PDiscoFormer: Relaxing Part Discovery Constraints with Vision\n Transformers","summary":" Computer vision methods that explicitly detect object parts and reason on\nthem are a step towards inherently interpretable models. Existing approaches\nthat perform part discovery driven by a fine-grained classification task make\nvery restrictive assumptions on the geometric properties of the discovered\nparts; they should be small and compact. Although this prior is useful in some\ncases, in this paper we show that pre-trained transformer-based vision models,\nsuch as self-supervised DINOv2 ViT, enable the relaxation of these constraints.\nIn particular, we find that a total variation (TV) prior, which allows for\nmultiple connected components of any size, substantially outperforms previous\nwork. We test our approach on three fine-grained classification benchmarks:\nCUB, PartImageNet and Oxford Flowers, and compare our results to previously\npublished methods as well as a re-implementation of the state-of-the-art method\nPDiscoNet with a transformer-based backbone. We consistently obtain substantial\nimprovements across the board, both on part discovery metrics and the\ndownstream classification task, showing that the strong inductive biases in\nself-supervised ViT models require to rethink the geometric priors that can be\nused for unsupervised part discovery.\n","authors":["Ananthu Aniraj","Cassio F. Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2407.04538v1.pdf","comment":"Accepted as a main conference paper at the European Conference of\n Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2407.04519v1","updated":"2024-07-05T14:04:25Z","published":"2024-07-05T14:04:25Z","title":"Success or Failure? Analyzing Segmentation Refinement with Few-Shot\n Segmentation","summary":" The purpose of segmentation refinement is to enhance the initial coarse masks\ngenerated by segmentation algorithms. The refined masks are expected to capture\nthe details and contours of the target objects. Research on segmentation\nrefinement has developed as a response to the need for high-quality initial\nmasks. However, to our knowledge, no method has been developed that can\ndetermine the success of segmentation refinement. Such a method could ensure\nthe reliability of segmentation in applications where the outcome of the\nsegmentation is important, and fosters innovation in image processing\ntechnologies. To address this research gap, we propose JFS~(Judging From\nSupport-set), a method to identify the success of segmentation refinement\nleveraging a few-shot segmentation (FSS) model. 
The traditional goal of the\nproblem in FSS is to find a target object in a query image utilizing target\ninformation given by a support set. However, in our proposed method, we use the\nFSS network in a novel way to assess the segmentation refinement. When there\nare two masks, a coarse mask and a refined mask from segmentation refinement,\nthese two masks become support masks. The existing support mask works as a\nground truth mask to judge whether the refined segmentation is\nmore accurate than the coarse mask. We first obtained a coarse mask and refined\nit using SEPL (SAM Enhanced Pseudo-Labels) to get the two masks. Then, these\nbecome inputs to the FSS model to judge whether the post-processing was successful.\nJFS is evaluated on the best and worst cases from SEPL to validate its\neffectiveness. The results showed that JFS can determine whether SEPL is a\nsuccess or not.\n","authors":["Seonghyeon Moon","Haein Kong","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2407.04519v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2407.04513v1","updated":"2024-07-05T13:54:15Z","published":"2024-07-05T13:54:15Z","title":"LayerShuffle: Enhancing Robustness in Vision Transformers by Randomizing\n Layer Execution Order","summary":" Due to their architecture and how they are trained, artificial neural\nnetworks are typically not robust toward pruning, replacing, or shuffling\nlayers at test time. However, such properties would be desirable for different\napplications, such as distributed neural network architectures where the order\nof execution cannot be guaranteed or parts of the network can fail during\ninference. In this work, we address these issues through a number of proposed\ntraining approaches for vision transformers whose most important component is\nrandomizing the execution order of attention modules at training time. We show\nthat with our proposed approaches, vision transformers are indeed capable of\nadapting to arbitrary layer execution orders at test time assuming one tolerates a\nreduction (about 20\\%) in accuracy at the same model size. We also find that\nour trained models can be randomly merged with each other resulting in\nfunctional (\"Frankenstein\") models without loss of performance compared to the\nsource models. Finally, we layer-prune our models at test time and find that\ntheir performance declines gracefully.\n","authors":["Matthias Freiberger","Peter Kun","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2407.04513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04507v1","updated":"2024-07-05T13:46:11Z","published":"2024-07-05T13:46:11Z","title":"Few-Shot Airway-Tree Modeling using Data-Driven Sparse Priors","summary":" The lack of large annotated datasets in medical imaging is an intrinsic\nburden for supervised Deep Learning (DL) segmentation models. Few-shot learning\napproaches are cost-effective solutions to transfer pre-trained models using\nonly limited annotated data. However, such methods can be prone to overfitting\ndue to limited data diversity, especially when segmenting complex, diverse, and\nsparse tubular structures like airways. Furthermore, crafting informative image\nrepresentations has played a crucial role in medical imaging, enabling\ndiscriminative enhancement of anatomical details. In this paper, we initially\ntrain a data-driven sparsification module to enhance airways efficiently in\nlung CT scans. 
We then incorporate these sparse representations in a standard\nsupervised segmentation pipeline as a pretraining step to enhance the\nperformance of the DL models. Results presented on the ATM public challenge\ncohort show the effectiveness of using sparse priors in pre-training, leading\nto segmentation Dice score increase by 1% to 10% in full-scale and few-shot\nlearning scenarios, respectively.\n","authors":["Ali Keshavarzi","Elsa Angelini"],"pdf_url":"https://arxiv.org/pdf/2407.04507v1.pdf","comment":"Accepted at 21st IEEE International Symposium on Biomedical Imaging\n (ISBI)"},{"id":"http://arxiv.org/abs/2407.04505v1","updated":"2024-07-05T13:45:11Z","published":"2024-07-05T13:45:11Z","title":"Hyperspectral Dataset and Deep Learning methods for Waste from Electric\n and Electronic Equipment Identification (WEEE)","summary":" Hyperspectral imaging, a rapidly evolving field, has witnessed the ascendancy\nof deep learning techniques, supplanting classical feature extraction and\nclassification methods in various applications. However, many researchers\nemploy arbitrary architectures for hyperspectral image processing, often\nwithout rigorous analysis of the interplay between spectral and spatial\ninformation. This oversight neglects the implications of combining these two\nmodalities on model performance.\n In this paper, we evaluate the performance of diverse deep learning\narchitectures for hyperspectral image segmentation. Our analysis disentangles\nthe impact of different architectures, spanning various spectral and spatial\ngranularities. Specifically, we investigate the effects of spectral resolution\n(capturing spectral information) and spatial texture (conveying spatial\ndetails) on segmentation outcomes. Additionally, we explore the transferability\nof knowledge from large pre-trained image foundation models, originally\ndesigned for RGB images, to the hyperspectral domain.\n Results show that incorporating spatial information alongside spectral data\nleads to improved segmentation results, and that it is essential to further\nwork on novel architectures comprising spectral and spatial information and on\nthe adaption of RGB foundation models into the hyperspectral domain.\n Furthermore, we contribute to the field by cleaning and publicly releasing\nthe Tecnalia WEEE Hyperspectral dataset. This dataset contains different\nnon-ferrous fractions of Waste Electrical and Electronic Equipment (WEEE),\nincluding Copper, Brass, Aluminum, Stainless Steel, and White Copper, spanning\nthe range of 400 to 1000 nm.\n We expect these conclusions can guide novel researchers in the field of\nhyperspectral imaging.\n","authors":["Artzai Picon","Pablo Galan","Arantza Bereciartua-Perez","Leire Benito-del-Valle"],"pdf_url":"https://arxiv.org/pdf/2407.04505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04504v1","updated":"2024-07-05T13:44:15Z","published":"2024-07-05T13:44:15Z","title":"Segment Any 4D Gaussians","summary":" Modeling, understanding, and reconstructing the real world are crucial in\nXR/VR. Recently, 3D Gaussian Splatting (3D-GS) methods have shown remarkable\nsuccess in modeling and understanding 3D scenes. Similarly, various 4D\nrepresentations have demonstrated the ability to capture the dynamics of the 4D\nworld. However, there is a dearth of research focusing on segmentation within\n4D representations. In this paper, we propose Segment Any 4D Gaussians (SA4D),\none of the first frameworks to segment anything in the 4D digital world based\non 4D Gaussians. 
In SA4D, an efficient temporal identity feature field is\nintroduced to handle Gaussian drifting, with the potential to learn precise\nidentity features from noisy and sparse input. Additionally, a 4D segmentation\nrefinement process is proposed to remove artifacts. Our SA4D achieves precise,\nhigh-quality segmentation within seconds in 4D Gaussians and shows the ability\nto remove, recolor, compose, and render high-quality anything masks. More demos\nare available at: https://jsxzs.github.io/sa4d/.\n","authors":["Shengxiang Ji","Guanjun Wu","Jiemin Fang","Jiazhong Cen","Taoran Yi","Wenyu Liu","Qi Tian","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04504v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2407.04490v1","updated":"2024-07-05T13:25:32Z","published":"2024-07-05T13:25:32Z","title":"Micro-gesture Online Recognition using Learnable Query Points","summary":" In this paper, we briefly introduce the solution developed by our team,\nHFUT-VUT, for the Micro-gesture Online Recognition track in the MiGA challenge\nat IJCAI 2024. The Micro-gesture Online Recognition task involves identifying\nthe category and locating the start and end times of micro-gestures in video\nclips. Compared to the typical Temporal Action Detection task, the\nMicro-gesture Online Recognition task focuses more on distinguishing between\nmicro-gestures and pinpointing the start and end times of actions. Our solution\nranks 2nd in the Micro-gesture Online Recognition track.\n","authors":["Pengyu Liu","Fei Wang","Kun Li","Guoliang Chen","Yanyan Wei","Shengeng Tang","Zhiliang Wu","Dan Guo"],"pdf_url":"https://arxiv.org/pdf/2407.04490v1.pdf","comment":"Technical Report of HFUT-VUT for the MiGA challenge at IJCAI 2024"},{"id":"http://arxiv.org/abs/2407.04489v1","updated":"2024-07-05T13:15:29Z","published":"2024-07-05T13:15:29Z","title":"Dude: Dual Distribution-Aware Context Prompt Learning For Large\n Vision-Language Model","summary":" Prompt learning methods are gaining increasing attention due to their ability\nto customize large vision-language models to new domains using pre-trained\ncontextual knowledge and minimal training data. However, existing works\ntypically rely on optimizing unified prompt inputs, often struggling with\nfine-grained classification tasks due to insufficient discriminative\nattributes. To tackle this, we consider a new framework based on a dual context\nof both domain-shared and class-specific contexts, where the latter is\ngenerated by Large Language Models (LLMs) such as GPTs. Such dual prompt\nmethods enhance the model's feature representation by joining implicit and\nexplicit factors encoded in LLM knowledge. Moreover, we formulate the\nUnbalanced Optimal Transport (UOT) theory to quantify the relationships between\nconstructed prompts and visual tokens. Through partial matching, UOT can\nproperly align discrete sets of visual tokens and prompt embeddings under\ndifferent mass distributions, which is particularly valuable for handling\nirrelevant or noisy elements, ensuring that the preservation of mass does not\nrestrict transport solutions. Furthermore, UOT's characteristics integrate\nseamlessly with image augmentation, expanding the training sample pool while\nmaintaining a reasonable distance between perturbed images and prompt inputs.\nExtensive experiments across few-shot classification and adapter settings\nsubstantiate the superiority of our model over current state-of-the-art\nbaselines.\n","authors":["Duy M. H. Nguyen","An T. Le","Trung Q. Nguyen","Nghiem T. 
Diep","Tai Nguyen","Duy Duong-Tran","Jan Peters","Li Shen","Mathias Niepert","Daniel Sonntag"],"pdf_url":"https://arxiv.org/pdf/2407.04489v1.pdf","comment":"Version 1"},{"id":"http://arxiv.org/abs/2406.11643v3","updated":"2024-07-05T13:10:51Z","published":"2024-06-17T15:26:22Z","title":"AnyMaker: Zero-shot General Object Customization via Decoupled\n Dual-Level ID Injection","summary":" Text-to-image based object customization, aiming to generate images with the\nsame identity (ID) as objects of interest in accordance with text prompts and\nreference images, has made significant progress. However, recent customizing\nresearch is dominated by specialized tasks, such as human customization or\nvirtual try-on, leaving a gap in general object customization. To this end, we\nintroduce AnyMaker, an innovative zero-shot object customization framework\ncapable of generating general objects with high ID fidelity and flexible text\neditability. The efficacy of AnyMaker stems from its novel general ID\nextraction, dual-level ID injection, and ID-aware decoupling. Specifically, the\ngeneral ID extraction module extracts sufficient ID information with an\nensemble of self-supervised models to tackle the diverse customization tasks\nfor general objects. Then, to provide the diffusion UNet with the extracted ID\nas much while not damaging the text editability in the generation process, we\ndesign a global-local dual-level ID injection module, in which the global-level\nsemantic ID is injected into text descriptions while the local-level ID details\nare injected directly into the model through newly added cross-attention\nmodules. In addition, we propose an ID-aware decoupling module to disentangle\nID-related information from non-ID elements in the extracted representations\nfor high-fidelity generation of both identity and text descriptions. To\nvalidate our approach and boost the research of general object customization,\nwe create the first large-scale general ID dataset, Multi-Category\nID-Consistent (MC-IDC) dataset, with 315k text-image samples and 10k\ncategories. Experiments show that AnyMaker presents remarkable performance in\ngeneral object customization and outperforms specialized methods in\ncorresponding tasks. Code and dataset will be released soon.\n","authors":["Lingjie Kong","Kai Wu","Xiaobin Hu","Wenhui Han","Jinlong Peng","Chengming Xu","Donghao Luo","Jiangning Zhang","Chengjie Wang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2406.11643v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06947v4","updated":"2024-07-05T13:08:10Z","published":"2023-12-12T03:04:08Z","title":"MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing","summary":" 3D-aware portrait editing has a wide range of applications in multiple\nfields. However, current approaches are limited due that they can only perform\nmask-guided or text-based editing. Even by fusing the two procedures into a\nmodel, the editing quality and stability cannot be ensured. To address this\nlimitation, we propose \\textbf{MaTe3D}: mask-guided text-based 3D-aware\nportrait editing. In this framework, first, we introduce a new SDF-based 3D\ngenerator which learns local and global representations with proposed SDF and\ndensity consistency losses. This enhances masked-based editing in local areas;\nsecond, we present a novel distillation strategy: Conditional Distillation on\nGeometry and Texture (CDGT). 
Compared to existing distillation strategies, it\nmitigates visual ambiguity and avoids mismatch between texture and geometry,\nthereby producing stable texture and convincing geometry while editing.\nAdditionally, we create the CatMask-HQ dataset, a large-scale high-resolution\ncat face annotation dataset for exploring model generalization and expansion. We\nperform extensive experiments on both the FFHQ and CatMask-HQ datasets to\ndemonstrate the editing quality and stability of the proposed method. Our\nmethod faithfully generates a 3D-aware edited face image based on a modified\nmask and a text prompt. Our code and models will be publicly released.\n","authors":["Kangneng Zhou","Daiheng Gao","Xuan Wang","Jie Zhang","Peng Zhang","Xusen Sun","Longhao Zhang","Shiqi Yang","Bang Zhang","Liefeng Bo","Yaxing Wang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.06947v4.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2407.04484v1","updated":"2024-07-05T13:07:40Z","published":"2024-07-05T13:07:40Z","title":"Optimizing the image correction pipeline for pedestrian detection in the\n thermal-infrared domain","summary":" Infrared imagery can help in low-visibility situations such as fog and\nlow-light scenarios, but it is prone to thermal noise and requires further\nprocessing and correction. This work studies the effect of different infrared\nprocessing pipelines on the performance of pedestrian detection in an urban\nenvironment, similar to autonomous driving scenarios. Detection on infrared\nimages is shown to outperform that on visible images, but the infrared\ncorrection pipeline is crucial since the models cannot extract information from\nraw infrared images. Two thermal correction pipelines are studied, the shutter\nand the shutterless pipes. Experiments show that some correction algorithms\nlike spatial denoising are detrimental to performance even if they increase\nvisual quality for a human observer. Other algorithms like destriping and, to a\nlesser extent, temporal denoising, increase computational time, but have some\nrole to play in increasing detection accuracy. As it stands, the optimal\ntrade-off for speed and accuracy is simply to use the shutterless pipe with a\ntonemapping algorithm only, for autonomous driving applications within varied\nenvironments.\n","authors":["Christophe Karam","Jessy Matias","Xavier Breniere","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2407.04484v1.pdf","comment":"9 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.18991v2","updated":"2024-07-05T13:01:07Z","published":"2024-05-29T11:11:07Z","title":"EasyAnimate: A High-Performance Long Video Generation Method based on\n Transformer Architecture","summary":" This paper presents EasyAnimate, an advanced method for video generation that\nleverages the power of transformer architecture for high-performance outcomes.\nWe have expanded the DiT framework originally designed for 2D image synthesis\nto accommodate the complexities of 3D video generation by incorporating a\nmotion module block. It is used to capture temporal dynamics, thereby ensuring\nthe production of consistent frames and seamless motion transitions. The motion\nmodule can be adapted to various DiT baseline methods to generate video with\ndifferent styles. It can also generate videos with different frame rates and\nresolutions during both training and inference phases, suitable for both images\nand videos. 
Moreover, we introduce slice VAE, a novel approach to condense the\ntemporal axis, facilitating the generation of long duration videos. Currently,\nEasyAnimate exhibits the proficiency to generate videos with 144 frames. We\nprovide a holistic ecosystem for video production based on DiT, encompassing\naspects such as data pre-processing, VAE training, DiT models training (both\nthe baseline model and LoRA model), and end-to-end video inference. Code is\navailable at: https://github.com/aigc-apps/EasyAnimate. We are continuously\nworking to enhance the performance of our method.\n","authors":["Jiaqi Xu","Xinyi Zou","Kunzhe Huang","Yunkuo Chen","Bo Liu","MengLi Cheng","Xing Shi","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2405.18991v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.04476v1","updated":"2024-07-05T12:53:34Z","published":"2024-07-05T12:53:34Z","title":"Rethinking Data Input for Point Cloud Upsampling","summary":" In recent years, point cloud upsampling has been widely applied in fields\nsuch as 3D reconstruction and surface generation. However, existing point cloud\nupsampling inputs are all patch based, and there is no research discussing the\ndifferences and principles between point cloud model full input and patch based\ninput. In order to compare with patch based point cloud input, this article\nproposes a new data input method, which divides the full point cloud model to\nensure shape integrity while training PU-GCN. This article was validated on the\nPU1K and ABC datasets, but the results showed that Patch based performance is\nbetter than model based full input i.e. Average Segment input. Therefore, this\narticle explores the data input factors and model modules that affect the\nupsampling results of point clouds.\n","authors":["Tongxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04476v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.02350v2","updated":"2024-07-05T12:45:32Z","published":"2024-07-02T15:16:06Z","title":"Conceptual Codebook Learning for Vision-Language Models","summary":" In this paper, we propose Conceptual Codebook Learning (CoCoLe), a novel\nfine-tuning method for vision-language models (VLMs) to address the challenge\nof improving the generalization capability of VLMs while fine-tuning them on\ndownstream tasks in a few-shot setting. We recognize that visual concepts, such\nas textures, shapes, and colors are naturally transferable across domains and\nplay a crucial role in generalization tasks. Motivated by this interesting\nfinding, we learn a conceptual codebook consisting of visual concepts as keys\nand conceptual prompts as values, which serves as a link between the image\nencoder's outputs and the text encoder's inputs. Specifically, for a given\nimage, we leverage the codebook to identify the most relevant conceptual\nprompts associated with the class embeddings to perform the classification.\nAdditionally, we incorporate a handcrafted concept cache as a regularization to\nalleviate the overfitting issues in low-shot scenarios. We observe that this\nconceptual codebook learning method is able to achieve enhanced alignment\nbetween visual and linguistic modalities. Extensive experimental results\ndemonstrate that our CoCoLe method remarkably outperforms the existing\nstate-of-the-art methods across various evaluation settings, including\nbase-to-new generalization, cross-dataset evaluation, and domain generalization\ntasks. 
Detailed ablation studies further confirm the efficacy of each component\nin CoCoLe.\n","authors":["Yi Zhang","Ke Yu","Siqi Wu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2407.02350v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.02844v2","updated":"2024-07-05T12:37:15Z","published":"2024-07-03T06:40:26Z","title":"Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast\n Cancer Segmentation and Identification","summary":" Breast cancer poses a profound threat to lives globally, claiming numerous\nlives each year. Therefore, timely detection is crucial for early intervention\nand improved chances of survival. Accurately diagnosing and classifying breast\ntumors using ultrasound images is a persistent challenge in medicine, demanding\ncutting-edge solutions for improved treatment strategies. This research\nintroduces multiattention-enhanced deep learning (DL) frameworks designed for\nthe classification and segmentation of breast cancer tumors from ultrasound\nimages. A spatial channel attention mechanism is proposed for segmenting tumors\nfrom ultrasound images, utilizing a novel LinkNet DL framework with an\nInceptionResNet backbone. Following this, the paper proposes a deep\nconvolutional neural network with an integrated multi-attention framework\n(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal.\nFrom experimental results, it is observed that the segmentation model has\nrecorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also\nachieved high Intersection over Union (IoU) and Dice Coefficient scores of\n96.9% and 97.2%, respectively. Similarly, the classification model has attained\nan accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification\nframework has achieved outstanding F1-Score, precision, and recall values of\n99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early\ndetection and accurate classification of breast cancer, this proposed work\nsignificantly advances the field of medical image analysis, potentially\nimproving diagnostic precision and patient outcomes.\n","authors":["Pandiyaraju V","Shravan Venkatraman","Pavan Kumar S","Santhosh Malarvannan","Kannan A"],"pdf_url":"https://arxiv.org/pdf/2407.02844v2.pdf","comment":"29 pages, 15 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.04461v1","updated":"2024-07-05T12:11:33Z","published":"2024-07-05T12:11:33Z","title":"VCD-Texture: Variance Alignment based 3D-2D Co-Denoising for Text-Guided\n Texturing","summary":" Recent research on texture synthesis for 3D shapes benefits a lot from\ndramatically developed 2D text-to-image diffusion models, including\ninpainting-based and optimization-based approaches. However, these methods\nignore the modal gap between the 2D diffusion model and 3D objects, which\nprimarily render 3D objects into 2D images and texture each image separately.\nIn this paper, we revisit the texture synthesis and propose a Variance\nalignment based 3D-2D Collaborative Denoising framework, dubbed VCD-Texture, to\naddress these issues. Formally, we first unify both 2D and 3D latent feature\nlearning in diffusion self-attention modules with re-projected 3D attention\nreceptive fields. Subsequently, the denoised multi-view 2D latent features are\naggregated into 3D space and then rasterized back to formulate more consistent\n2D predictions. 
However, the rasterization process suffers from an intractable\nvariance bias, which is theoretically addressed by the proposed variance\nalignment, achieving high-fidelity texture synthesis. Moreover, we present an\ninpainting refinement to further improve the details with conflicting regions.\nNotably, there is not a publicly available benchmark to evaluate texture\nsynthesis, which hinders its development. Thus we construct a new evaluation\nset built upon three open-source 3D datasets and propose to use four metrics to\nthoroughly validate the texturing performance. Comprehensive experiments\ndemonstrate that VCD-Texture achieves superior performance against other\ncounterparts.\n","authors":["Shang Liu","Chaohui Yu","Chenjie Cao","Wen Qian","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04461v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04458v1","updated":"2024-07-05T12:09:33Z","published":"2024-07-05T12:09:33Z","title":"Robust Multimodal Learning via Representation Decoupling","summary":" Multimodal learning robust to missing modality has attracted increasing\nattention due to its practicality. Existing methods tend to address it by\nlearning a common subspace representation for different modality combinations.\nHowever, we reveal that they are sub-optimal due to their implicit constraint\non intra-class representation. Specifically, the sample with different\nmodalities within the same class will be forced to learn representations in the\nsame direction. This hinders the model from capturing modality-specific\ninformation, resulting in insufficient learning. To this end, we propose a\nnovel Decoupled Multimodal Representation Network (DMRNet) to assist robust\nmultimodal learning. Specifically, DMRNet models the input from different\nmodality combinations as a probabilistic distribution instead of a fixed point\nin the latent space, and samples embeddings from the distribution for the\nprediction module to calculate the task loss. As a result, the direction\nconstraint from the loss minimization is blocked by the sampled representation.\nThis relaxes the constraint on the inference representation and enables the\nmodel to capture the specific information for different modality combinations.\nFurthermore, we introduce a hard combination regularizer to prevent DMRNet from\nunbalanced training by guiding it to pay more attention to hard modality\ncombinations. Finally, extensive experiments on multimodal classification and\nsegmentation tasks demonstrate that the proposed DMRNet outperforms the\nstate-of-the-art significantly.\n","authors":["Shicai Wei","Yang Luo","Yuji Wang","Chunbo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.04458v1.pdf","comment":"ECCV2024 17 pages"},{"id":"http://arxiv.org/abs/2407.04449v1","updated":"2024-07-05T12:04:12Z","published":"2024-07-05T12:04:12Z","title":"Multi-modal Masked Siamese Network Improves Chest X-Ray Representation\n Learning","summary":" Self-supervised learning methods for medical images primarily rely on the\nimaging modality during pretraining. While such approaches deliver promising\nresults, they do not leverage associated patient or scan information collected\nwithin Electronic Health Records (EHR). Here, we propose to incorporate EHR\ndata during self-supervised pretraining with a Masked Siamese Network (MSN) to\nenhance the quality of chest X-ray representations. We investigate three types\nof EHR data, including demographic, scan metadata, and inpatient stay\ninformation. 
We evaluate our approach on three publicly available chest X-ray\ndatasets, MIMIC-CXR, CheXpert, and NIH-14, using two vision transformer (ViT)\nbackbones, specifically ViT-Tiny and ViT-Small. In assessing the quality of the\nrepresentations via linear evaluation, our proposed method demonstrates\nsignificant improvement compared to vanilla MSN and state-of-the-art\nself-supervised learning baselines. Our work highlights the potential of\nEHR-enhanced self-supervised pre-training for medical imaging. The code is\npublicly available at: https://github.com/nyuad-cai/CXR-EHR-MSN\n","authors":["Saeed Shurrab","Alejandro Guerra-Manzanares","Farah E. Shamout"],"pdf_url":"https://arxiv.org/pdf/2407.04449v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2401.12761v2","updated":"2024-07-05T12:00:37Z","published":"2024-01-23T13:43:17Z","title":"MUSES: The Multi-Sensor Semantic Perception Dataset for Driving under\n Uncertainty","summary":" Achieving level-5 driving automation in autonomous vehicles necessitates a\nrobust semantic visual perception system capable of parsing data from different\nsensors across diverse conditions. However, existing semantic perception\ndatasets often lack important non-camera modalities typically used in\nautonomous vehicles, or they do not exploit such modalities to aid and improve\nsemantic annotations in challenging conditions. To address this, we introduce\nMUSES, the MUlti-SEnsor Semantic perception dataset for driving in adverse\nconditions under increased uncertainty. MUSES includes synchronized multimodal\nrecordings with 2D panoptic annotations for 2500 images captured under diverse\nweather and illumination. The dataset integrates a frame camera, a lidar, a\nradar, an event camera, and an IMU/GNSS sensor. Our new two-stage panoptic\nannotation protocol captures both class-level and instance-level uncertainty in\nthe ground truth and enables the novel task of uncertainty-aware panoptic\nsegmentation we introduce, along with standard semantic and panoptic\nsegmentation. MUSES proves both effective for training and challenging for\nevaluating models under diverse visual conditions, and it opens new avenues for\nresearch in multimodal and uncertainty-aware dense semantic perception. Our\ndataset and benchmark are publicly available at\nhttps://muses.vision.ee.ethz.ch.\n","authors":["Tim Brödermann","David Bruggemann","Christos Sakaridis","Kevin Ta","Odysseas Liagouris","Jason Corkill","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2401.12761v2.pdf","comment":"Dataset available at http://muses.vision.ee.ethz.ch"},{"id":"http://arxiv.org/abs/2403.11105v2","updated":"2024-07-05T11:16:54Z","published":"2024-03-17T06:19:30Z","title":"Source Prompt Disentangled Inversion for Boosting Image Editability with\n Diffusion Models","summary":" Text-driven diffusion models have significantly advanced the image editing\nperformance by using text prompts as inputs. One crucial step in text-driven\nimage editing is to invert the original image into a latent noise code\nconditioned on the source prompt. While previous methods have achieved\npromising results by refactoring the image synthesizing process, the inverted\nlatent noise code is tightly coupled with the source prompt, limiting the image\neditability by target text prompts. 
To address this issue, we propose a novel\nmethod called Source Prompt Disentangled Inversion (SPDInv), which aims at\nreducing the impact of source prompt, thereby enhancing the text-driven image\nediting performance by employing diffusion models. To make the inverted noise\ncode be independent of the given source prompt as much as possible, we indicate\nthat the iterative inversion process should satisfy a fixed-point constraint.\nConsequently, we transform the inversion problem into a searching problem to\nfind the fixed-point solution, and utilize the pre-trained diffusion models to\nfacilitate the searching process. The experimental results show that our\nproposed SPDInv method can effectively mitigate the conflicts between the\ntarget editing prompt and the source prompt, leading to a significant decrease\nin editing artifacts. In addition to text-driven image editing, with SPDInv we\ncan easily adapt customized image generation models to localized editing tasks\nand produce promising performance. The source code are available at\nhttps://github.com/leeruibin/SPDInv.\n","authors":["Ruibin Li","Ruihuang Li","Song Guo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.11105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09682v3","updated":"2024-07-05T10:53:07Z","published":"2024-05-15T19:53:52Z","title":"UDA4Inst: Unsupervised Domain Adaptation for Instance Segmentation","summary":" Unsupervised Domain Adaptation (UDA) aims to transfer knowledge learned from\na labeled source domain to an unlabeled target domain. While UDA methods for\nsynthetic to real-world domains (synth-to-real) show remarkable performance in\ntasks such as semantic segmentation and object detection, very few were\nproposed for instance segmentation in the field of vision-based autonomous\ndriving, and the existing ones are based on a suboptimal baseline, which\nseverely limits the performance. In this paper, we introduce UDA4Inst, a strong\nbaseline of synth-to-real UDA for instance segmentation. UDA4Inst adopts\ncross-domain bidirectional data mixing at the instance level to effectively\nutilize data from both source and target domains. Rare-class balancing and\ncategory module training are also employed to further improve the performance.\nIt is worth noting that we are the first to demonstrate results on two new\nsynth-to-real instance segmentation benchmarks, with 39.0 mAP on\nUrbanSyn->Cityscapes and 35.7 mAP on Synscapes->Cityscapes. Our method\noutperforms the source-only Mask2Former model by +7 mAP and +7.6 mAP,\nrespectively. On SYNTHIA->Cityscapes, our method improves the source-only\nMask2Former by +6.7 mAP, achieving state-of-the-art results.Our code will be\nreleased soon.\n","authors":["Yachan Guo","Yi Xiao","Danna Xue","Jose Luis Gomez Zurita","Antonio M. López"],"pdf_url":"https://arxiv.org/pdf/2405.09682v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18571v2","updated":"2024-07-05T10:42:03Z","published":"2024-06-03T11:48:17Z","title":"UltraCortex: Submillimeter Ultra-High Field 9.4 T1 Brain MR Image\n Collection and Manual Cortical Segmentations","summary":" The UltraCortex repository (https://www.ultracortex.org) houses magnetic\nresonance imaging data of the human brain obtained at an ultra-high field\nstrength of 9.4 T. It contains 86 structural MR images with spatial resolutions\nranging from 0.6 to 0.8 mm. Additionally, the repository includes segmentations\nof 12 brains into gray and white matter compartments. 
These segmentations have\nbeen independently validated by two expert neuroradiologists, thus establishing\nthem as a reliable gold standard. This resource provides researchers with\naccess to high-quality brain imaging data and validated segmentations,\nfacilitating neuroimaging studies and advancing our understanding of brain\nstructure and function. Existing repositories do not accommodate field\nstrengths beyond 7 T, nor do they offer validated segmentations, underscoring\nthe significance of this new resource.\n","authors":["Lucas Mahler","Julius Steiglechner","Benjamin Bender","Tobias Lindig","Dana Ramadan","Jonas Bause","Florian Birk","Rahel Heule","Edyta Charyasz","Michael Erb","Vinod Jangir Kumar","Gisela E Hagberg","Pascal Martin","Gabriele Lohmann","Klaus Scheffler"],"pdf_url":"https://arxiv.org/pdf/2406.18571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04400v1","updated":"2024-07-05T10:20:24Z","published":"2024-07-05T10:20:24Z","title":"Hard-Attention Gates with Gradient Routing for Endoscopic Image\n Computing","summary":" To address overfitting and enhance model generalization in\ngastroenterological polyp size assessment, our study introduces\nFeature-Selection Gates (FSG) or Hard-Attention Gates (HAG) alongside Gradient\nRouting (GR) for dynamic feature selection. This technique aims to boost\nConvolutional Neural Networks (CNNs) and Vision Transformers (ViTs) by\npromoting sparse connectivity, thereby reducing overfitting and enhancing\ngeneralization. HAG achieves this through sparsification with learnable\nweights, serving as a regularization strategy. GR further refines this process\nby optimizing HAG parameters via dual forward passes, independently from the\nmain model, to improve feature re-weighting. Our evaluation spanned multiple\ndatasets, including CIFAR-100 for a broad impact assessment and specialized\nendoscopic datasets (REAL-Colon, Misawa, and SUN) focusing on polyp size\nestimation, covering over 200 polyps in more than 370,000 frames. The findings\nindicate that our HAG-enhanced networks substantially enhance performance in\nboth binary and triclass classification tasks related to polyp sizing.\nSpecifically, CNNs experienced an F1 Score improvement to 87.8% in binary\nclassification, while in triclass classification, the ViT-T model reached an F1\nScore of 76.5%, outperforming traditional CNNs and ViT-T models. To facilitate\nfurther research, we are releasing our codebase, which includes implementations\nfor CNNs, multistream CNNs, ViT, and HAG-augmented variants. This resource aims\nto standardize the use of endoscopic datasets, providing public\ntraining-validation-testing splits for reliable and comparable research in\ngastroenterological polyp size estimation. The codebase is available at\ngithub.com/cosmoimd/feature-selection-gates.\n","authors":["Giorgio Roffo","Carlo Biffi","Pietro Salvagnini","Andrea Cherubini"],"pdf_url":"https://arxiv.org/pdf/2407.04400v1.pdf","comment":"Attention Gates, Hard-Attention Gates, Gradient Routing, Feature\n Selection Gates, Endoscopy, Medical Image Processing, Computer Vision"},{"id":"http://arxiv.org/abs/2407.04396v1","updated":"2024-07-05T10:06:55Z","published":"2024-07-05T10:06:55Z","title":"Graph-Guided Test-Time Adaptation for Glaucoma Diagnosis using Fundus\n Photography","summary":" Glaucoma is a leading cause of irreversible blindness worldwide. 
While deep\nlearning approaches using fundus images have largely improved early diagnosis\nof glaucoma, variations in images from different devices and locations (known\nas domain shifts) challenge the use of pre-trained models in real-world\nsettings. To address this, we propose a novel Graph-guided Test-Time Adaptation\n(GTTA) framework to generalize glaucoma diagnosis models to unseen test\nenvironments. GTTA integrates the topological information of fundus images into\nthe model training, enhancing the model's transferability and reducing the risk\nof learning spurious correlation. During inference, GTTA introduces a novel\ntest-time training objective to make the source-trained classifier\nprogressively adapt to target patterns with reliable class conditional\nestimation and consistency regularization. Experiments on cross-domain glaucoma\ndiagnosis benchmarks demonstrate the superiority of the overall framework and\nindividual components under different backbone networks.\n","authors":["Qian Zeng","Fan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04396v1.pdf","comment":"11 pages, 3 figures, 3 tables, submitted to MICCAI"},{"id":"http://arxiv.org/abs/2312.08888v3","updated":"2024-07-05T09:43:41Z","published":"2023-12-13T13:11:44Z","title":"Read Between the Layers: Leveraging Multi-Layer Representations for\n Rehearsal-Free Continual Learning with Pre-Trained Models","summary":" We address the Continual Learning (CL) problem, wherein a model must learn a\nsequence of tasks from non-stationary distributions while preserving prior\nknowledge upon encountering new experiences. With the advancement of foundation\nmodels, CL research has pivoted from the initial learning-from-scratch paradigm\ntowards utilizing generic features from large-scale pre-training. However,\nexisting approaches to CL with pre-trained models primarily focus on separating\nclass-specific features from the final representation layer and neglect the\npotential of intermediate representations to capture low- and mid-level\nfeatures, which are more invariant to domain shifts. In this work, we propose\nLayUP, a new prototype-based approach to CL that leverages second-order feature\nstatistics from multiple intermediate layers of a pre-trained network. Our\nmethod is conceptually simple, does not require access to prior data, and works\nout of the box with any foundation model. LayUP surpasses the state of the art\nin four of the seven class-incremental learning benchmarks, all three\ndomain-incremental learning benchmarks and in six of the seven online continual\nlearning benchmarks, while significantly reducing memory and computational\nrequirements compared to existing baselines. Our results demonstrate that fully\nexhausting the representational capacities of pre-trained models in CL goes\nwell beyond their final embeddings.\n","authors":["Kyra Ahrens","Hans Hergen Lehmann","Jae Hee Lee","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2312.08888v3.pdf","comment":"Accepted for publication in Transactions of Machine Learning Research\n (TMLR) journal"},{"id":"http://arxiv.org/abs/2407.04384v1","updated":"2024-07-05T09:43:05Z","published":"2024-07-05T09:43:05Z","title":"Unsupervised Learning of Category-Level 3D Pose from Object-Centric\n Videos","summary":" Category-level 3D pose estimation is a fundamentally important problem in\ncomputer vision and robotics, e.g. for embodied agents or to train 3D\ngenerative models. 
However, so far methods that estimate the category-level\nobject pose require either large amounts of human annotations, CAD models or\ninput from RGB-D sensors. In contrast, we tackle the problem of learning to\nestimate the category-level 3D pose only from casually taken object-centric\nvideos without human supervision. We propose a two-step pipeline: First, we\nintroduce a multi-view alignment procedure that determines canonical camera\nposes across videos with a novel and robust cyclic distance formulation for\ngeometric and appearance matching using reconstructed coarse meshes and DINOv2\nfeatures. In a second step, the canonical poses and reconstructed meshes enable\nus to train a model for 3D pose estimation from a single image. In particular,\nour model learns to estimate dense correspondences between images and a\nprototypical 3D template by predicting, for each pixel in a 2D image, a feature\nvector of the corresponding vertex in the template mesh. We demonstrate that\nour method outperforms all baselines at the unsupervised alignment of\nobject-centric videos by a large margin and provides faithful and robust\npredictions in-the-wild. Our code and data is available at\nhttps://github.com/GenIntel/uns-obj-pose3d.\n","authors":["Leonhard Sommer","Artur Jesslen","Eddy Ilg","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2407.04384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04382v1","updated":"2024-07-05T09:37:16Z","published":"2024-07-05T09:37:16Z","title":"Self-Supervised Representation Learning for Adversarial Attack Detection","summary":" Supervised learning-based adversarial attack detection methods rely on a\nlarge number of labeled data and suffer significant performance degradation\nwhen applying the trained model to new domains. In this paper, we propose a\nself-supervised representation learning framework for the adversarial attack\ndetection task to address this drawback. Firstly, we map the pixels of\naugmented input images into an embedding space. Then, we employ the\nprototype-wise contrastive estimation loss to cluster prototypes as latent\nvariables. Additionally, drawing inspiration from the concept of memory banks,\nwe introduce a discrimination bank to distinguish and learn representations for\neach individual instance that shares the same or a similar prototype,\nestablishing a connection between instances and their associated prototypes. We\npropose a parallel axial-attention (PAA)-based encoder to facilitate the\ntraining process by parallel training over height- and width-axis of attention\nmaps. Experimental results show that, compared to various benchmark\nself-supervised vision learning models and supervised adversarial attack\ndetection methods, the proposed model achieves state-of-the-art performance on\nthe adversarial attack detection task across a wide range of images.\n","authors":["Yi Li","Plamen Angelov","Neeraj Suri"],"pdf_url":"https://arxiv.org/pdf/2407.04382v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04381v1","updated":"2024-07-05T09:35:30Z","published":"2024-07-05T09:35:30Z","title":"Multi-Branch Auxiliary Fusion YOLO with Re-parameterization\n Heterogeneous Convolutional for accurate object detection","summary":" Due to the effective performance of multi-scale feature fusion, Path\nAggregation FPN (PAFPN) is widely employed in YOLO detectors. However, it\ncannot efficiently and adaptively integrate high-level semantic information\nwith low-level spatial information simultaneously. 
We propose a new model named\nMAF-YOLO in this paper, which is a novel object detection framework with a\nversatile neck named Multi-Branch Auxiliary FPN (MAFPN). Within MAFPN, the\nSuperficial Assisted Fusion (SAF) module is designed to combine the output of\nthe backbone with the neck, preserving an optimal level of shallow information\nto facilitate subsequent learning. Meanwhile, the Advanced Assisted Fusion\n(AAF) module deeply embedded within the neck conveys a more diverse range of\ngradient information to the output layer.\n Furthermore, our proposed Re-parameterized Heterogeneous Efficient Layer\nAggregation Network (RepHELAN) module ensures that both the overall model\narchitecture and convolutional design embrace the utilization of heterogeneous\nlarge convolution kernels. Therefore, this guarantees the preservation of\ninformation related to small targets while simultaneously achieving the\nmulti-scale receptive field. Finally, taking the nano version of MAF-YOLO for\nexample, it can achieve 42.4% AP on COCO with only 3.76M learnable parameters\nand 10.51G FLOPs, and approximately outperforms YOLOv8n by about 5.1%. The\nsource code of this work is available at:\nhttps://github.com/yang-0201/MAF-YOLO.\n","authors":["Zhiqiang Yang","Qiu Guan","Keer Zhao","Jianmin Yang","Xinli Xu","Haixia Long","Ying Tang"],"pdf_url":"https://arxiv.org/pdf/2407.04381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04369v1","updated":"2024-07-05T09:16:30Z","published":"2024-07-05T09:16:30Z","title":"ZARRIO @ Ego4D Short Term Object Interaction Anticipation Challenge:\n Leveraging Affordances and Attention-based models for STA","summary":" Short-Term object-interaction Anticipation (STA) consists of detecting the\nlocation of the next-active objects, the noun and verb categories of the\ninteraction, and the time to contact from the observation of egocentric video.\nWe propose STAformer, a novel attention-based architecture integrating\nframe-guided temporal pooling, dual image-video attention, and multi-scale\nfeature fusion to support STA predictions from an image-input video pair.\nMoreover, we introduce two novel modules to ground STA predictions on human\nbehavior by modeling affordances. First, we integrate an environment affordance\nmodel which acts as a persistent memory of interactions that can take place in\na given physical scene. Second, we predict interaction hotspots from the\nobservation of hands and object trajectories, increasing confidence in STA\npredictions localized around the hotspot. On the test set, our results obtain a\nfinal 33.5 N mAP, 17.25 N+V mAP, 11.77 N+{\\delta} mAP and 6.75 Overall top-5\nmAP metric when trained on the v2 training dataset.\n","authors":["Lorenzo Mur-Labadia","Ruben Martinez-Cantin","Josechu Guerrero-Campo","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2407.04369v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.01194"},{"id":"http://arxiv.org/abs/2407.04362v1","updated":"2024-07-05T09:03:52Z","published":"2024-07-05T09:03:52Z","title":"Towards Context-aware Support for Color Vision Deficiency: An Approach\n Integrating LLM and AR","summary":" People with color vision deficiency often face challenges in distinguishing\ncolors such as red and green, which can complicate daily tasks and require the\nuse of assistive tools or environmental adjustments. Current support tools\nmainly focus on presentation-based aids, like the color vision modes found in\niPhone accessibility settings. 
However, offering context-aware support, like\nindicating the doneness of meat, remains a challenge since task-specific\nsolutions are not cost-effective for all possible scenarios. To address this,\nour paper proposes an application that provides contextual and autonomous\nassistance. This application is mainly composed of: (i) an augmented reality\ninterface that efficiently captures context; and (ii) a multi-modal large\nlanguage model-based reasoner that serves to cognitize the context and then\nreason about the appropriate support contents. Preliminary user experiments\nwith two color vision deficient users across five different scenarios have\ndemonstrated the effectiveness and universality of our application.\n","authors":["Shogo Morita","Yan Zhang","Takuto Yamauchi","Sinan Chen","Jialong Li","Kenji Tei"],"pdf_url":"https://arxiv.org/pdf/2407.04362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04360v1","updated":"2024-07-05T08:59:34Z","published":"2024-07-05T08:59:34Z","title":"Shape Prior Segmentation Guided by Harmonic Beltrami Signature","summary":" This paper presents a novel shape prior segmentation method guided by the\nHarmonic Beltrami Signature (HBS). The HBS is a shape representation fully\ncapturing 2D simply connected shapes, exhibiting resilience against\nperturbations and invariance to translation, rotation, and scaling. The\nproposed method integrates the HBS within a quasi-conformal topology preserving\nsegmentation framework, leveraging shape prior knowledge to significantly\nenhance segmentation performance, especially for low-quality or occluded\nimages. The key innovation lies in the bifurcation of the optimization process\ninto two iterative stages: 1) The computation of a quasi-conformal deformation\nmap, which transforms the unit disk into the targeted segmentation area, driven\nby image data and other regularization terms; 2) The subsequent refinement of\nthis map is contingent upon minimizing the $L_2$ distance between its Beltrami\ncoefficient and the reference HBS. This shape-constrained refinement ensures\nthat the segmentation adheres to the reference shape(s) by exploiting the\ninherent invariance, robustness, and discerning shape discriminative\ncapabilities afforded by the HBS. Extensive experiments on synthetic and\nreal-world images validate the method's ability to improve segmentation\naccuracy over baselines, eliminate preprocessing requirements, resist noise\ncorruption, and flexibly acquire and apply shape priors. Overall, the HBS\nsegmentation framework offers an efficient strategy to robustly incorporate the\nshape prior knowledge, thereby advancing critical low-level vision tasks.\n","authors":["Chenran Lin","Lok Ming Lui"],"pdf_url":"https://arxiv.org/pdf/2407.04360v1.pdf","comment":"34 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.02218v2","updated":"2024-07-05T08:55:00Z","published":"2024-07-02T12:34:17Z","title":"Multi-Modal Video Dialog State Tracking in the Wild","summary":" We present MST-MIXER - a novel video dialog model operating over a generic\nmulti-modal state tracking scheme. Current models that claim to perform\nmulti-modal state tracking fall short of two major aspects: (1) They either\ntrack only one modality (mostly the visual input) or (2) they target synthetic\ndatasets that do not reflect the complexity of real-world in the wild\nscenarios. Our model addresses these two limitations in an attempt to close\nthis crucial research gap. 
Specifically, MST-MIXER first tracks the most\nimportant constituents of each input modality. Then, it predicts the missing\nunderlying structure of the selected constituents of each modality by learning\nlocal latent graphs using a novel multi-modal graph structure learning method.\nSubsequently, the learned local graphs and features are parsed together to form\na global graph operating on the mix of all modalities which further refines its\nstructure and node embeddings. Finally, the fine-grained graph node features\nare used to enhance the hidden states of the backbone Vision-Language Model\n(VLM). MST-MIXER achieves new state-of-the-art results on five challenging\nbenchmarks.\n","authors":["Adnen Abdessaied","Lei Shi","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2407.02218v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2205.09180v3","updated":"2024-07-05T08:51:16Z","published":"2022-05-18T18:57:36Z","title":"Learning Rate Curriculum","summary":" Most curriculum learning methods require an approach to sort the data samples\nby difficulty, which is often cumbersome to perform. In this work, we propose a\nnovel curriculum learning approach termed Learning Rate Curriculum (LeRaC),\nwhich leverages the use of a different learning rate for each layer of a neural\nnetwork to create a data-agnostic curriculum during the initial training\nepochs. More specifically, LeRaC assigns higher learning rates to neural layers\ncloser to the input, gradually decreasing the learning rates as the layers are\nplaced farther away from the input. The learning rates increase at various\npaces during the first training iterations, until they all reach the same\nvalue. From this point on, the neural model is trained as usual. This creates a\nmodel-level curriculum learning strategy that does not require sorting the\nexamples by difficulty and is compatible with any neural network, generating\nhigher performance levels regardless of the architecture. We conduct\ncomprehensive experiments on 12 data sets from the computer vision (CIFAR-10,\nCIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC),\nlanguage (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering\nvarious convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5),\nrecurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare\nour approach with the conventional training regime, as well as with Curriculum\nby Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning\napproach. Unlike CBS, our performance improvements over the standard training\nregime are consistent across all data sets and models. Furthermore, we\nsignificantly surpass CBS in terms of training time (there is no additional\ncost over the standard training regime for LeRaC). Our code is freely available\nat: https://github.com/CroitoruAlin/LeRaC.\n","authors":["Florinel-Alin Croitoru","Nicolae-Catalin Ristea","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2205.09180v3.pdf","comment":"Accepted at the International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2407.04355v1","updated":"2024-07-05T08:50:55Z","published":"2024-07-05T08:50:55Z","title":"Data-Driven Tissue- and Subject-Specific Elastic Regularization for\n Medical Image Registration","summary":" Physics-inspired regularization is desired for intra-patient image\nregistration since it can effectively capture the biomechanical characteristics\nof anatomical structures. 
However, a major challenge lies in the reliance on\nphysical parameters: Parameter estimations vary widely across the literature,\nand the physical properties themselves are inherently subject-specific. In this\nwork, we introduce a novel data-driven method that leverages hypernetworks to\nlearn the tissue-dependent elasticity parameters of an elastic regularizer.\nNotably, our approach facilitates the estimation of patient-specific parameters\nwithout the need to retrain the network. We evaluate our method on three\npublicly available 2D and 3D lung CT and cardiac MR datasets. We find that with\nour proposed subject-specific tissue-dependent regularization, a higher\nregistration quality is achieved across all datasets compared to using a global\nregularizer. The code is available at\nhttps://github.com/compai-lab/2024-miccai-reithmeir.\n","authors":["Anna Reithmeir","Lina Felsner","Rickmer Braren","Julia A. Schnabel","Veronika A. Zimmer"],"pdf_url":"https://arxiv.org/pdf/2407.04355v1.pdf","comment":"Accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.04353v1","updated":"2024-07-05T08:46:39Z","published":"2024-07-05T08:46:39Z","title":"Segmenting Medical Images: From UNet to Res-UNet and nnUNet","summary":" This study provides a comparative analysis of deep learning models including\nUNet, Res-UNet, Attention Res-UNet, and nnUNet, and evaluates their performance\nin brain tumour, polyp, and multi-class heart segmentation tasks. The analysis\nfocuses on precision, accuracy, recall, Dice Similarity Coefficient (DSC), and\nIntersection over Union (IoU) to assess their clinical applicability. In brain\ntumour segmentation, Res-UNet and nnUNet significantly outperformed UNet, with\nRes-UNet leading in DSC and IoU scores, indicating superior accuracy in tumour\ndelineation. Meanwhile, nnUNet excelled in recall and accuracy, which are\ncrucial for reliable tumour detection in clinical diagnosis and planning. In\npolyp detection, nnUNet was the most effective, achieving the highest metrics\nacross all categories and proving itself as a reliable diagnostic tool in\nendoscopy. In the complex task of heart segmentation, Res-UNet and Attention\nRes-UNet were outstanding in delineating the left ventricle, with Res-UNet also\nleading in right ventricle segmentation. nnUNet was unmatched in myocardium\nsegmentation, achieving top scores in precision, recall, DSC, and IoU. The\nconclusion notes that although Res-UNet occasionally outperforms nnUNet in\nspecific metrics, the differences are quite small. Moreover, nnUNet\nconsistently shows superior overall performance across the experiments.\nParticularly noted for its high recall and accuracy, which are crucial in\nclinical settings to minimize misdiagnosis and ensure timely treatment,\nnnUNet's robust performance in crucial metrics across all tested categories\nestablishes it as the most effective model for these varied and complex\nsegmentation tasks.\n","authors":["Lina Huang","Alina Miron","Kate Hone","Yongmin Li"],"pdf_url":"https://arxiv.org/pdf/2407.04353v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.04346v1","updated":"2024-07-05T08:37:10Z","published":"2024-07-05T08:37:10Z","title":"MobileFlow: A Multimodal LLM For Mobile GUI Agent","summary":" Currently, the integration of mobile Graphical User Interfaces (GUIs) is\nubiquitous in most people's daily lives. 
And the ongoing evolution of\nmultimodal large-scale models, such as GPT-4v, Qwen-VL-Max, has significantly\nbolstered the capabilities of GUI comprehension and user action analysis,\nshowcasing the potentiality of intelligent GUI assistants. However, current GUI\nAgents often need to access page layout information through calling system\nAPIs, which may pose privacy risks. Fixing GUI (such as mobile interfaces) to a\ncertain low resolution might result in the loss of fine-grained image details.\nAt the same time, the multimodal large models built for GUI Agents currently\nhave poor understanding and decision-making abilities for Chinese GUI\ninterfaces, making them difficult to apply to a large number of Chinese apps.\nThis paper introduces MobileFlow, a multimodal large language model\nmeticulously crafted for mobile GUI agents. Transforming from the open-source\nmodel Qwen-VL-Chat into GUI domain, MobileFlow contains approximately 21\nbillion parameters and is equipped with novel hybrid visual encoders, making it\npossible for variable resolutions of image inputs and good support for\nmultilingual GUI. By incorporating Mixture of Experts (MoE) expansions and\npioneering alignment training strategies, MobileFlow has the capacity to fully\ninterpret image data and comprehend user instructions for GUI interaction\ntasks. Finally, MobileFlow outperforms Qwen-VL-Max and GPT-4v in terms of task\nexecution by GUI agents on both public and our proposed evaluation metrics, and\nhas been successfully deployed in real-world business contexts, proving its\neffectiveness for practical applications.\n","authors":["Songqin Nong","Jiali Zhu","Rui Wu","Jiongchao Jin","Shuo Shan","Xiutian Huang","Wenhao Xu"],"pdf_url":"https://arxiv.org/pdf/2407.04346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04345v1","updated":"2024-07-05T08:36:26Z","published":"2024-07-05T08:36:26Z","title":"CanonicalFusion: Generating Drivable 3D Human Avatars from Multiple\n Images","summary":" We present a novel framework for reconstructing animatable human avatars from\nmultiple images, termed CanonicalFusion. Our central concept involves\nintegrating individual reconstruction results into the canonical space. To be\nspecific, we first predict Linear Blend Skinning (LBS) weight maps and depth\nmaps using a shared-encoder-dual-decoder network, enabling direct\ncanonicalization of the 3D mesh from the predicted depth maps. Here, instead of\npredicting high-dimensional skinning weights, we infer compressed skinning\nweights, i.e., 3-dimensional vector, with the aid of pre-trained MLP networks.\nWe also introduce a forward skinning-based differentiable rendering scheme to\nmerge the reconstructed results from multiple images. This scheme refines the\ninitial mesh by reposing the canonical mesh via the forward skinning and by\nminimizing photometric and geometric errors between the rendered and the\npredicted results. Our optimization scheme considers the position and color of\nvertices as well as the joint angles for each image, thereby mitigating the\nnegative effects of pose errors. We conduct extensive experiments to\ndemonstrate the effectiveness of our method and compare our CanonicalFusion\nwith state-of-the-art methods. 
Our source codes are available at\nhttps://github.com/jsshin98/CanonicalFusion.\n","authors":["Jisu Shin","Junmyeong Lee","Seongmin Lee","Min-Gyu Park","Ju-Mi Kang","Ju Hong Yoon","Hae-Gon Jeon"],"pdf_url":"https://arxiv.org/pdf/2407.04345v1.pdf","comment":"ECCV 2024 Accepted (18 pages, 9 figures)"},{"id":"http://arxiv.org/abs/2406.08282v2","updated":"2024-07-05T08:29:27Z","published":"2024-06-12T14:47:51Z","title":"Interpretable Representation Learning of Cardiac MRI via Attribute\n Regularization","summary":" Interpretability is essential in medical imaging to ensure that clinicians\ncan comprehend and trust artificial intelligence models. Several approaches\nhave been recently considered to encode attributes in the latent space to\nenhance its interpretability. Notably, attribute regularization aims to encode\na set of attributes along the dimensions of a latent representation. However,\nthis approach is based on Variational AutoEncoder and suffers from blurry\nreconstruction. In this paper, we propose an Attributed-regularized Soft\nIntrospective Variational Autoencoder that combines attribute regularization of\nthe latent space within the framework of an adversarially trained variational\nautoencoder. We demonstrate on short-axis cardiac Magnetic Resonance images of\nthe UK Biobank the ability of the proposed method to address blurry\nreconstruction issues of variational autoencoder methods while preserving the\nlatent space interpretability.\n","authors":["Maxime Di Folco","Cosmin I. Bercea","Emily Chan","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2406.08282v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.08915"},{"id":"http://arxiv.org/abs/2309.12325v2","updated":"2024-07-05T08:28:25Z","published":"2023-08-11T10:49:05Z","title":"FUTURE-AI: International consensus guideline for trustworthy and\n deployable artificial intelligence in healthcare","summary":" Despite major advances in artificial intelligence (AI) for medicine and\nhealthcare, the deployment and adoption of AI technologies remain limited in\nreal-world clinical practice. In recent years, concerns have been raised about\nthe technical, clinical, ethical and legal risks associated with medical AI. To\nincrease real world adoption, it is essential that medical AI tools are trusted\nand accepted by patients, clinicians, health organisations and authorities.\nThis work describes the FUTURE-AI guideline as the first international\nconsensus framework for guiding the development and deployment of trustworthy\nAI tools in healthcare. The FUTURE-AI consortium was founded in 2021 and\ncurrently comprises 118 inter-disciplinary experts from 51 countries\nrepresenting all continents, including AI scientists, clinicians, ethicists,\nand social scientists. Over a two-year period, the consortium defined guiding\nprinciples and best practices for trustworthy AI through an iterative process\ncomprising an in-depth literature review, a modified Delphi survey, and online\nconsensus meetings. The FUTURE-AI framework was established based on 6 guiding\nprinciples for trustworthy AI in healthcare, i.e. Fairness, Universality,\nTraceability, Usability, Robustness and Explainability. Through consensus, a\nset of 28 best practices were defined, addressing technical, clinical, legal\nand socio-ethical dimensions. The recommendations cover the entire lifecycle of\nmedical AI, from design, development and validation to regulation, deployment,\nand monitoring. 
FUTURE-AI is a risk-informed, assumption-free guideline which\nprovides a structured approach for constructing medical AI tools that will be\ntrusted, deployed and adopted in real-world practice. Researchers are\nencouraged to take the recommendations into account in proof-of-concept stages\nto facilitate future translation towards clinical practice of medical AI.\n","authors":["Karim Lekadir","Aasa Feragen","Abdul Joseph Fofanah","Alejandro F Frangi","Alena Buyx","Anais Emelie","Andrea Lara","Antonio R Porras","An-Wen Chan","Arcadi Navarro","Ben Glocker","Benard O Botwe","Bishesh Khanal","Brigit Beger","Carol C Wu","Celia Cintas","Curtis P Langlotz","Daniel Rueckert","Deogratias Mzurikwao","Dimitrios I Fotiadis","Doszhan Zhussupov","Enzo Ferrante","Erik Meijering","Eva Weicken","Fabio A González","Folkert W Asselbergs","Fred Prior","Gabriel P Krestin","Gary Collins","Geletaw S Tegenaw","Georgios Kaissis","Gianluca Misuraca","Gianna Tsakou","Girish Dwivedi","Haridimos Kondylakis","Harsha Jayakody","Henry C Woodruf","Hugo JWL Aerts","Ian Walsh","Ioanna Chouvarda","Irène Buvat","Islem Rekik","James Duncan","Jayashree Kalpathy-Cramer","Jihad Zahir","Jinah Park","John Mongan","Judy W Gichoya","Julia A Schnabel","Kaisar Kushibar","Katrine Riklund","Kensaku Mori","Kostas Marias","Lameck M Amugongo","Lauren A Fromont","Lena Maier-Hein","Leonor Cerdá Alberich","Leticia Rittner","Lighton Phiri","Linda Marrakchi-Kacem","Lluís Donoso-Bach","Luis Martí-Bonmatí","M Jorge Cardoso","Maciej Bobowicz","Mahsa Shabani","Manolis Tsiknakis","Maria A Zuluaga","Maria Bielikova","Marie-Christine Fritzsche","Marius George Linguraru","Markus Wenzel","Marleen De Bruijne","Martin G Tolsgaard","Marzyeh Ghassemi","Md Ashrafuzzaman","Melanie Goisauf","Mohammad Yaqub","Mohammed Ammar","Mónica Cano Abadía","Mukhtar M E Mahmoud","Mustafa Elattar","Nicola Rieke","Nikolaos Papanikolaou","Noussair Lazrak","Oliver Díaz","Olivier Salvado","Oriol Pujol","Ousmane Sall","Pamela Guevara","Peter Gordebeke","Philippe Lambin","Pieta Brown","Purang Abolmaesumi","Qi Dou","Qinghua Lu","Richard Osuala","Rose Nakasi","S Kevin Zhou","Sandy Napel","Sara Colantonio","Shadi Albarqouni","Smriti Joshi","Stacy Carter","Stefan Klein","Steffen E Petersen","Susanna Aussó","Suyash Awate","Tammy Riklin Raviv","Tessa Cook","Tinashe E M Mutsvangwa","Wendy A Rogers","Wiro J Niessen","Xènia Puig-Bosch","Yi Zeng","Yunusa G Mohammed","Yves Saint James Aquino","Zohaib Salahuddin","Martijn P A Starmans"],"pdf_url":"https://arxiv.org/pdf/2309.12325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04334v1","updated":"2024-07-05T08:19:36Z","published":"2024-07-05T08:19:36Z","title":"Learning Geometric Invariant Features for Classification of Vector\n Polygons with Graph Message-passing Neural Network","summary":" Geometric shape classification of vector polygons remains a non-trivial\nlearning task in spatial analysis. Previous studies mainly focus on devising\ndeep learning approaches for representation learning of rasterized vector\npolygons, whereas the study of discrete representations of polygons and\nsubsequent deep learning approaches have not been fully investigated. In this\nstudy, we investigate a graph representation of vector polygons and propose a\nnovel graph message-passing neural network (PolyMP) to learn the\ngeometric-invariant features for shape classification of polygons. 
Through\nextensive experiments, we show that the graph representation of polygons\ncombined with a permutation-invariant graph message-passing neural network\nachieves highly robust performances on benchmark datasets (i.e., synthetic\nglyph and real-world building footprint datasets) as compared to baseline\nmethods. We demonstrate that the proposed graph-based PolyMP network enables\nthe learning of expressive geometric features invariant to geometric\ntransformations of polygons (i.e., translation, rotation, scaling and shearing)\nand is robust to trivial vertex removals of polygons. We further show the\nstrong generalizability of PolyMP, which enables generalizing the learned\ngeometric features from the synthetic glyph polygons to the real-world building\nfootprints.\n","authors":["Zexian Huang","Kourosh Khoshelham","Martin Tomko"],"pdf_url":"https://arxiv.org/pdf/2407.04334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02846v2","updated":"2024-07-05T08:10:49Z","published":"2024-07-03T06:47:58Z","title":"Multi-Task Domain Adaptation for Language Grounding with 3D Objects","summary":" The existing works on object-level language grounding with 3D objects mostly\nfocus on improving performance by utilizing the off-the-shelf pre-trained\nmodels to capture features, such as viewpoint selection or geometric priors.\nHowever, they have failed to consider exploring the cross-modal representation\nof language-vision alignment in the cross-domain field. To answer this problem,\nwe propose a novel method called Domain Adaptation for Language Grounding\n(DA4LG) with 3D objects. Specifically, the proposed DA4LG consists of a visual\nadapter module with multi-task learning to realize vision-language alignment by\ncomprehensive multimodal feature representation. Experimental results\ndemonstrate that DA4LG competitively performs across visual and non-visual\nlanguage descriptions, independent of the completeness of observation. DA4LG\nachieves state-of-the-art performance in the single-view setting and multi-view\nsetting with the accuracy of 83.8% and 86.8% respectively in the language\ngrounding benchmark SNARE. The simulation experiments show the well-practical\nand generalized performance of DA4LG compared to the existing methods. Our\nproject is available at https://sites.google.com/view/da4lg.\n","authors":["Penglei Sun","Yaoxian Song","Xinglin Pan","Peijie Dong","Xiaofei Yang","Qiang Wang","Zhixu Li","Tiefeng Li","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2407.02846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01732v5","updated":"2024-07-05T08:07:34Z","published":"2023-01-04T18:02:59Z","title":"Explicit Abnormality Extraction for Unsupervised Motion Artifact\n Reduction in Magnetic Resonance Imaging","summary":" Motion artifacts compromise the quality of magnetic resonance imaging (MRI)\nand pose challenges to achieving diagnostic outcomes and image-guided\ntherapies. In recent years, supervised deep learning approaches have emerged as\nsuccessful solutions for motion artifact reduction (MAR). One disadvantage of\nthese methods is their dependency on acquiring paired sets of motion\nartifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images\nfor training purposes. Obtaining such image pairs is difficult and therefore\nlimits the application of supervised training. In this paper, we propose a\nnovel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this\nproblem. 
Our network is capable of working with unpaired MA-corrupted and\nMA-free images. It converts the MA-corrupted images to MA-reduced images by\nextracting abnormalities from the MA-corrupted images using a proposed artifact\nextractor, which intercepts the residual artifact maps from the MA-corrupted MR\nimages explicitly, and a reconstructor to restore the original input from the\nMA-reduced images. The performance of UNAEN was assessed by experimenting with\nvarious publicly available MRI datasets and comparing them with\nstate-of-the-art methods. The quantitative evaluation demonstrates the\nsuperiority of UNAEN over alternative MAR methods and visually exhibits fewer\nresidual artifacts. Our results substantiate the potential of UNAEN as a\npromising solution applicable in real-world clinical environments, with the\ncapability to enhance diagnostic accuracy and facilitate image-guided\ntherapies. Our codes are publicly available at\nhttps://github.com/YuSheng-Zhou/UNAEN.\n","authors":["Yusheng Zhou","Hao Li","Jianan Liu","Zhengmin Kong","Tao Huang","Euijoon Ahn","Zhihan Lv","Jinman Kim","David Dagan Feng"],"pdf_url":"https://arxiv.org/pdf/2301.01732v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04327v1","updated":"2024-07-05T07:55:19Z","published":"2024-07-05T07:55:19Z","title":"TF-SASM: Training-free Spatial-aware Sparse Memory for Multi-object\n Tracking","summary":" Multi-object tracking (MOT) in computer vision remains a significant\nchallenge, requiring precise localization and continuous tracking of multiple\nobjects in video sequences. This task is crucial for various applications,\nincluding action recognition and behavior analysis. Key challenges include\nocclusion, reidentification, tracking fast-moving objects, and handling camera\nmotion artifacts. Past research has explored tracking-by-detection methods and\nend-to-end models, with recent attention on tracking-by-attention approaches\nleveraging transformer architectures. The emergence of data sets that emphasize\nrobust reidentification, such as DanceTrack, has highlighted the need for\neffective solutions. While memory-based approaches have shown promise, they\noften suffer from high computational complexity and memory usage. We propose a\nnovel sparse memory approach that selectively stores critical features based on\nobject motion and overlapping awareness, aiming to enhance efficiency while\nminimizing redundancy. Building upon the MOTRv2 model, a hybrid of\ntracking-by-attention and tracking-by-detection, we introduce a training-free\nmemory designed to bolster reidentification capabilities and preserve the\nmodel's flexibility. Our memory approach achieves significant improvements over\nMOTRv2 in the DanceTrack test set, demonstrating a gain of 1.1\\% in HOTA\nmetrics and 2.1\\% in IDF1 score.\n","authors":["Thuc Nguyen-Quang","Minh-Triet Tran"],"pdf_url":"https://arxiv.org/pdf/2407.04327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04326v1","updated":"2024-07-05T07:55:06Z","published":"2024-07-05T07:55:06Z","title":"LMSeg: A deep graph message-passing network for efficient and accurate\n semantic segmentation of large-scale 3D landscape meshes","summary":" Semantic segmentation of large-scale 3D landscape meshes is pivotal for\nvarious geospatial applications, including spatial analysis, automatic mapping\nand localization of target objects, and urban planning and development. This\nrequires an efficient and accurate 3D perception system to understand and\nanalyze real-world environments. 
However, traditional mesh segmentation methods\nface challenges in accurately segmenting small objects and maintaining\ncomputational efficiency due to the complexity and large size of 3D landscape\nmesh datasets. This paper presents an end-to-end deep graph message-passing\nnetwork, LMSeg, designed to efficiently and accurately perform semantic\nsegmentation on large-scale 3D landscape meshes. The proposed approach takes\nthe barycentric dual graph of meshes as inputs and applies deep message-passing\nneural networks to hierarchically capture the geometric and spatial features\nfrom the barycentric graph structures and learn intricate semantic information\nfrom textured meshes. The hierarchical and local pooling of the barycentric\ngraph, along with the effective geometry aggregation modules of LMSeg, enable\nfast inference and accurate segmentation of small-sized and irregular mesh\nobjects in various complex landscapes. Extensive experiments on two benchmark\ndatasets (natural and urban landscapes) demonstrate that LMSeg significantly\noutperforms existing learning-based segmentation methods in terms of object\nsegmentation accuracy and computational efficiency. Furthermore, our method\nexhibits strong generalization capabilities across diverse landscapes and\ndemonstrates robust resilience against varying mesh densities and landscape\ntopologies.\n","authors":["Zexian Huang","Kourosh Khoshelham","Gunditj Mirring Traditional Owners Corporation","Martin Tomko"],"pdf_url":"https://arxiv.org/pdf/2407.04326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10609v2","updated":"2024-07-05T07:49:01Z","published":"2024-02-16T11:54:34Z","title":"MRPD: Undersampled MRI reconstruction by prompting a large latent\n diffusion model","summary":" Implicit visual knowledge in a large latent diffusion model (LLDM)\npre-trained on natural images is rich and hypothetically universal to natural\nand medical images. To test this hypothesis from a practical perspective, we\npropose a novel framework for undersampled MRI Reconstruction by Prompting a\nlarge latent Diffusion model (MRPD). While the existing methods trained on MRI\ndatasets are typically of limited generalizability toward diverse data\nacquisition scenarios, MRPD supports unsupervised and universally adaptive MRI\nreconstruction. For unsupervised reconstruction, MRSampler guides LLDM with a\nrandom-phase-modulated hard-to-soft control. With any single- or\nmultiple-source MRI dataset, MRPD's performance is boosted universally by a\nlightweight MRAdapter that only finetunes the LLDM's autoencoder. Experiments\non FastMRI and IXI show that MRPD is the only model that supports both MRI\ndatabase-free and database-available scenarios and attains the best\ngeneralizability towards out-of-domain (OOD) samplings, contrasts, and organs\namong compared unsupervised, supervised, and MRI diffusion methods. To our\nknowledge, MRPD is the first method that empirically shows the universal\nprowess of an LLDM pre-trained on vast natural images for MRI. Our official\nimplementation is at https://github.com/Z7Gao/MRPD.\n","authors":["Ziqi Gao","S. 
Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.10609v2.pdf","comment":"10 pages, 5 figures, 7 tables, 1 pseudocode"},{"id":"http://arxiv.org/abs/2405.01199v2","updated":"2024-07-05T07:34:51Z","published":"2024-05-02T11:40:44Z","title":"Latent Fingerprint Matching via Dense Minutia Descriptor","summary":" Latent fingerprint matching is a daunting task, primarily due to the poor\nquality of latent fingerprints. In this study, we propose a deep-learning based\ndense minutia descriptor (DMD) for latent fingerprint matching. A DMD is\nobtained by extracting the fingerprint patch aligned by its central minutia,\ncapturing detailed minutia information and texture information. Our dense\ndescriptor takes the form of a three-dimensional representation, with two\ndimensions associated with the original image plane and the other dimension\nrepresenting the abstract features. Additionally, the extraction process\noutputs the fingerprint segmentation map, ensuring that the descriptor is only\nvalid in the foreground region. The matching between two descriptors occurs in\ntheir overlapping regions, with a score normalization strategy to reduce the\nimpact brought by the differences outside the valid area. Our descriptor\nachieves state-of-the-art performance on several latent fingerprint datasets.\nOverall, our DMD is more representative and interpretable compared to previous\nmethods.\n","authors":["Zhiyu Pan","Yongjie Duan","Xiongjun Guan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.01199v2.pdf","comment":"accepted by IJCB 2024"},{"id":"http://arxiv.org/abs/2401.16520v2","updated":"2024-07-05T07:32:01Z","published":"2024-01-29T19:50:50Z","title":"MT-HCCAR: Multi-Task Deep Learning with Hierarchical Classification and\n Attention-based Regression for Cloud Property Retrieval","summary":" In the realm of Earth science, effective cloud property retrieval,\nencompassing cloud masking, cloud phase classification, and cloud optical\nthickness (COT) prediction, remains pivotal. Traditional methodologies\nnecessitate distinct models for each sensor instrument due to their unique\nspectral characteristics. Recent strides in Earth Science research have\nembraced machine learning and deep learning techniques to extract features from\nsatellite datasets' spectral observations. However, prevailing approaches lack\nnovel architectures accounting for hierarchical relationships among retrieval\ntasks. Moreover, considering the spectral diversity among existing sensors, the\ndevelopment of models with robust generalization capabilities over different\nsensor datasets is imperative. Surprisingly, there is a dearth of methodologies\naddressing the selection of an optimal model for diverse datasets. In response,\nthis paper introduces MT-HCCAR, an end-to-end deep learning model employing\nmulti-task learning to simultaneously tackle cloud masking, cloud phase\nretrieval (classification tasks), and COT prediction (a regression task). The\nMT-HCCAR integrates a hierarchical classification network (HC) and a\nclassification-assisted attention-based regression network (CAR), enhancing\nprecision and robustness in cloud labeling and COT prediction. Additionally, a\ncomprehensive model selection method rooted in K-fold cross-validation, one\nstandard error rule, and two introduced performance scores is proposed to\nselect the optimal model over three simulated satellite datasets OCI, VIIRS,\nand ABI. 
The experiments comparing MT-HCCAR with baseline methods, the ablation\nstudies, and the model selection affirm the superiority and the generalization\ncapabilities of MT-HCCAR.\n","authors":["Xingyan Li","Andrew M. Sayer","Ian T. Carroll","Xin Huang","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2401.16520v2.pdf","comment":"14 pages, 3 figures, accepted by ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2407.04308v1","updated":"2024-07-05T07:23:51Z","published":"2024-07-05T07:23:51Z","title":"SSP-GNN: Learning to Track via Bilevel Optimization","summary":" We propose a graph-based tracking formulation for multi-object tracking (MOT)\nwhere target detections contain kinematic information and re-identification\nfeatures (attributes). Our method applies a successive shortest paths (SSP)\nalgorithm to a tracking graph defined over a batch of frames. The edge costs in\nthis tracking graph are computed via a message-passing network, a graph neural\nnetwork (GNN) variant. The parameters of the GNN, and hence, the tracker, are\nlearned end-to-end on a training set of example ground-truth tracks and\ndetections. Specifically, learning takes the form of bilevel optimization\nguided by our novel loss function. We evaluate our algorithm on simulated\nscenarios to understand its sensitivity to scenario aspects and model\nhyperparameters. Across varied scenario complexities, our method compares\nfavorably to a strong baseline.\n","authors":["Griffin Golias","Masa Nakura-Fan","Vitaly Ablavsky"],"pdf_url":"https://arxiv.org/pdf/2407.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04305v1","updated":"2024-07-05T07:17:58Z","published":"2024-07-05T07:17:58Z","title":"Towards Stable 3D Object Detection","summary":" In autonomous driving, the temporal stability of 3D object detection greatly\nimpacts the driving safety. However, the detection stability cannot be accessed\nby existing metrics such as mAP and MOTA, and consequently is less explored by\nthe community. To bridge this gap, this work proposes Stability Index (SI), a\nnew metric that can comprehensively evaluate the stability of 3D detectors in\nterms of confidence, box localization, extent, and heading. By benchmarking\nstate-of-the-art object detectors on the Waymo Open Dataset, SI reveals\ninteresting properties of object stability that have not been previously\ndiscovered by other metrics. To help models improve their stability, we further\nintroduce a general and effective training strategy, called Prediction\nConsistency Learning (PCL). PCL essentially encourages the prediction\nconsistency of the same objects under different timestamps and augmentations,\nleading to enhanced detection stability. Furthermore, we examine the\neffectiveness of PCL with the widely-used CenterPoint, and achieve a remarkable\nSI of 86.00 for vehicle class, surpassing the baseline by 5.48. We hope our\nwork could serve as a reliable baseline and draw the community's attention to\nthis crucial issue in 3D object detection. Codes will be made publicly\navailable.\n","authors":["Jiabao Wang","Qiang Meng","Guochao Liu","Liujiang Yan","Ke Wang","Ming-Ming Cheng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2407.04305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04287v1","updated":"2024-07-05T06:44:43Z","published":"2024-07-05T06:44:43Z","title":"MARS: Paying more attention to visual attributes for text-based person\n search","summary":" Text-based person search (TBPS) is a problem that gained significant interest\nwithin the research community. 
The task is that of retrieving one or more\nimages of a specific individual based on a textual description. The multi-modal\nnature of the task requires learning representations that bridge text and image\ndata within a shared latent space. Existing TBPS systems face two major\nchallenges. One is defined as inter-identity noise that is due to the inherent\nvagueness and imprecision of text descriptions and it indicates how\ndescriptions of visual attributes can be generally associated with different\npeople; the other is the intra-identity variations, which are all those\nnuisances, e.g., pose and illumination, that can alter the visual appearance of the\nsame textual attributes for a given subject. To address these issues, this\npaper presents a novel TBPS architecture named MARS\n(Mae-Attribute-Relation-Sensitive), which enhances current state-of-the-art\nmodels by introducing two key components: a Visual Reconstruction Loss and an\nAttribute Loss. The former employs a Masked AutoEncoder trained to reconstruct\nrandomly masked image patches with the aid of the textual description. In doing\nso, the model is encouraged to learn more expressive representations and\ntextual-visual relations in the latent space. The Attribute Loss, instead,\nbalances the contribution of different types of attributes, defined as\nadjective-noun chunks of text. This loss ensures that every attribute is taken\ninto consideration in the person retrieval process. Extensive experiments on\nthree commonly used datasets, namely CUHK-PEDES, ICFG-PEDES, and RSTPReid,\nreport performance improvements, with significant gains in the mean Average\nPrecision (mAP) metric w.r.t. the current state of the art.\n","authors":["Alex Ergasti","Tomaso Fontanini","Claudio Ferrari","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2407.04287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12359v3","updated":"2024-07-05T06:26:45Z","published":"2023-11-21T05:27:16Z","title":"Shedding the Bits: Pushing the Boundaries of Quantization with\n Minifloats on FPGAs","summary":" Post-training quantization (PTQ) is a powerful technique for model\ncompression, reducing the numerical precision in neural networks without\nadditional training overhead. Recent works have investigated adopting 8-bit\nfloating-point formats (FP8) in the context of PTQ for model inference. However,\nfloating-point formats smaller than 8 bits and their relative comparison in\nterms of accuracy-hardware cost with integers remain unexplored on FPGAs. In\nthis work, we present minifloats, which are reduced-precision floating-point\nformats capable of further reducing the memory footprint, latency, and energy\ncost of a model while approaching full-precision model accuracy. We implement a\ncustom FPGA-based multiply-accumulate operator library and explore the vast\ndesign space, comparing minifloat and integer representations across 3 to 8\nbits for both weights and activations. We also examine the applicability of\nvarious integer-based quantization techniques to minifloats. Our experiments\nshow that minifloats offer a promising alternative for emerging workloads such\nas vision transformers.\n","authors":["Shivam Aggarwal","Hans Jakob Damsgaard","Alessandro Pappalardo","Giuseppe Franco","Thomas B. Preußer","Michaela Blott","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.12359v3.pdf","comment":"Accepted in FPL (International Conference on Field-Programmable Logic\n and Applications) 2024 conference. 
Revised with updated results"},{"id":"http://arxiv.org/abs/2407.04277v1","updated":"2024-07-05T06:17:00Z","published":"2024-07-05T06:17:00Z","title":"Research, Applications and Prospects of Event-Based Pedestrian\n Detection: A Survey","summary":" Event-based cameras, inspired by the biological retina, have evolved into\ncutting-edge sensors distinguished by their minimal power requirements,\nnegligible latency, superior temporal resolution, and expansive dynamic range.\nAt present, cameras used for pedestrian detection are mainly frame-based\nimaging sensors, which have suffered from lethargic response times and hefty\ndata redundancy. In contrast, event-based cameras address these limitations by\neschewing extraneous data transmissions and obviating motion blur in high-speed\nimaging scenarios. On pedestrian detection via event-based cameras, this paper\noffers an exhaustive review of research and applications, particularly in the\nautonomous driving context. Through methodically scrutinizing relevant\nliterature, the paper outlines the foundational principles, developmental\ntrajectory, and the comparative merits and demerits of event-based detection\nrelative to traditional frame-based methodologies. This review conducts\nthorough analyses of various event stream inputs and their corresponding\nnetwork models to evaluate their applicability across diverse operational\nenvironments. It also delves into pivotal elements such as crucial datasets and\ndata acquisition techniques essential for advancing this technology, as well as\nadvanced algorithms for processing event stream data. Culminating with a\nsynthesis of the extant landscape, the review accentuates the unique advantages\nand persistent challenges inherent in event-based pedestrian detection,\noffering a prognostic view on potential future developments in this\nfast-progressing field.\n","authors":["Han Wang","Yuman Nie","Yun Li","Hongjie Liu","Min Liu","Wen Cheng","Yaoxiong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04274v1","updated":"2024-07-05T06:02:46Z","published":"2024-07-05T06:02:46Z","title":"Fine-grained Dynamic Network for Generic Event Boundary Detection","summary":" Generic event boundary detection (GEBD) aims at pinpointing event boundaries\nnaturally perceived by humans, playing a crucial role in understanding\nlong-form videos. Given the diverse nature of generic boundaries, spanning\ndifferent video appearances, objects, and actions, this task remains\nchallenging. Existing methods usually detect various boundaries by the same\nprotocol, regardless of their distinctive characteristics and detection\ndifficulties, resulting in suboptimal performance. Intuitively, a more\nintelligent and reasonable way is to adaptively detect boundaries by\nconsidering their special properties. In light of this, we propose a novel\ndynamic pipeline for generic event boundaries named DyBDet. By introducing a\nmulti-exit network architecture, DyBDet automatically learns the subnet\nallocation to different video snippets, enabling fine-grained detection for\nvarious boundaries. Besides, a multi-order difference detector is also proposed\nto ensure generic boundaries can be effectively identified and adaptively\nprocessed. 
Extensive experiments on the challenging Kinetics-GEBD and TAPOS\ndatasets demonstrate that adopting the dynamic strategy significantly benefits\nGEBD tasks, leading to obvious improvements in both performance and efficiency\ncompared to the current state-of-the-art.\n","authors":["Ziwei Zheng","Lijun He","Le Yang","Fan Li"],"pdf_url":"https://arxiv.org/pdf/2407.04274v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04271v1","updated":"2024-07-05T05:52:51Z","published":"2024-07-05T05:52:51Z","title":"Variational Partial Group Convolutions for Input-Aware Partial\n Equivariance of Rotations and Color-Shifts","summary":" Group Equivariant CNNs (G-CNNs) have shown promising efficacy in various\ntasks, owing to their ability to capture hierarchical features in an\nequivariant manner. However, their equivariance is fixed to the symmetry of the\nwhole group, limiting adaptability to diverse partial symmetries in real-world\ndatasets, such as limited rotation symmetry of handwritten digit images and\nlimited color-shift symmetry of flower images. Recent efforts address this\nlimitation, one example being Partial G-CNN which restricts the output group\nspace of convolution layers to break full equivariance. However, such an\napproach still fails to adjust equivariance levels across data. In this paper,\nwe propose a novel approach, Variational Partial G-CNN (VP G-CNN), to capture\nvarying levels of partial equivariance specific to each data instance. VP G-CNN\nredesigns the distribution of the output group elements to be conditioned on\ninput data, leveraging variational inference to avoid overfitting. This enables\nthe model to adjust its equivariance levels according to the needs of\nindividual data points. Additionally, we address training instability inherent\nin discrete group equivariance models by redesigning the reparametrizable\ndistribution. We demonstrate the effectiveness of VP G-CNN on both toy and\nreal-world datasets, including MNIST67-180, CIFAR10, ColorMNIST, and\nFlowers102. Our results show robust performance, even in uncertainty metrics.\n","authors":["Hyunsu Kim","Yegon Kim","Hongseok Yang","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.04271v1.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2407.04265v1","updated":"2024-07-05T05:38:20Z","published":"2024-07-05T05:38:20Z","title":"Parametric Curve Segment Extraction by Support Regions","summary":" We introduce a method to extract curve segments in parametric form from the\nimage directly using the Laplacian of Gaussian (LoG) filter response. Our\nsegmentation gives convex and concave curves. To do so, we form curve support\nregions by grouping pixels of the thresholded filter response. Then, we model\neach support region boundary by Fourier series and extract the corresponding\nparametric curve segment.\n","authors":["Cem Ünsalan"],"pdf_url":"https://arxiv.org/pdf/2407.04265v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.04260v1","updated":"2024-07-05T05:19:53Z","published":"2024-07-05T05:19:53Z","title":"Efficient Detection of Long Consistent Cycles and its Application to\n Distributed Synchronization","summary":" Group synchronization plays a crucial role in global pipelines for Structure\nfrom Motion (SfM). Its formulation is nonconvex and it is faced with highly\ncorrupted measurements. Cycle consistency has been effective in addressing\nthese challenges. 
However, computationally efficient solutions are needed for\ncycles longer than three, especially in practical scenarios where 3-cycles are\nunavailable. To overcome this computational bottleneck, we propose an algorithm\nfor group synchronization that leverages information from cycles of lengths\nranging from three to six with a time complexity of order $O(n^3)$ (or\n$O(n^{2.373})$ when using a faster matrix multiplication algorithm). We\nestablish non-trivial theory for this and related methods that achieves\ncompetitive sample complexity, assuming the uniform corruption model. To\nadvocate the practical need for our method, we consider distributed group\nsynchronization, which requires at least 4-cycles, and we illustrate\nstate-of-the-art performance by our method in this context.\n","authors":["Shaohan Li","Yunpeng Shi","Gilad Lerman"],"pdf_url":"https://arxiv.org/pdf/2407.04260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04258v1","updated":"2024-07-05T05:08:06Z","published":"2024-07-05T05:08:06Z","title":"Unsupervised Video Summarization via Reinforcement Learning and a\n Trained Evaluator","summary":" This paper presents a novel approach for unsupervised video summarization\nusing reinforcement learning. It aims to address the existing limitations of\ncurrent unsupervised methods, including unstable training of adversarial\ngenerator-discriminator architectures and reliance on hand-crafted reward\nfunctions for quality evaluation. The proposed method is based on the concept\nthat a concise and informative summary should result in a reconstructed video\nthat closely resembles the original. The summarizer model assigns an importance\nscore to each frame and generates a video summary. In the proposed scheme,\nreinforcement learning, coupled with a unique reward generation pipeline, is\nemployed to train the summarizer model. The reward generation pipeline trains\nthe summarizer to create summaries that lead to improved reconstructions. It\ncomprises a generator model capable of reconstructing masked frames from a\npartially masked video, along with a reward mechanism that compares the\nreconstructed video from the summary against the original. The video generator\nis trained in a self-supervised manner to reconstruct randomly masked frames,\nenhancing its ability to generate accurate summaries. This training pipeline\nresults in a summarizer model that better mimics human-generated video\nsummaries compared to methods relying on hand-crafted rewards. The training\nprocess consists of two stable and isolated training steps, unlike adversarial\narchitectures. Experimental results demonstrate promising performance, with\nF-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively.\nAdditionally, the inference stage is 300 times faster than our previously\nreported state-of-the-art method.\n","authors":["Mehryar Abbasi","Hadi Hadizadeh","Parvaneh Saeedi"],"pdf_url":"https://arxiv.org/pdf/2407.04258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02147v2","updated":"2024-07-05T05:07:08Z","published":"2023-12-04T18:59:20Z","title":"Rejuvenating image-GPT as Strong Visual Representation Learners","summary":" This paper enhances image-GPT (iGPT), one of the pioneering works that\nintroduce autoregressive pretraining to predict the next pixels for visual\nrepresentation learning. Two simple yet essential changes are made. First, we\nshift the prediction target from raw pixels to semantic tokens, enabling a\nhigher-level understanding of visual content. 
Second, we supplement the\nautoregressive modeling by instructing the model to predict not only the next\ntokens but also the visible tokens. This pipeline is particularly effective\nwhen semantic tokens are encoded by discriminatively trained models, such as\nCLIP. We introduce this novel approach as D-iGPT. Extensive experiments\nshowcase that D-iGPT excels as a strong learner of visual representations: A\nnotable achievement is its compelling performance on the ImageNet-1K dataset --\nby training on publicly available datasets, D-iGPT unprecedentedly achieves\n\\textbf{90.0\\%} top-1 accuracy with a vanilla ViT-H. Additionally, D-iGPT shows\nstrong generalization on the downstream task. Code is available at\nhttps://github.com/OliverRensu/D-iGPT.\n","authors":["Sucheng Ren","Zeyu Wang","Hongru Zhu","Junfei Xiao","Alan Yuille","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.02147v2.pdf","comment":"This paper is accepted by ICML2024"},{"id":"http://arxiv.org/abs/2407.04255v1","updated":"2024-07-05T04:56:05Z","published":"2024-07-05T04:56:05Z","title":"Second Place Solution of WSDM2023 Toloka Visual Question Answering\n Challenge","summary":" In this paper, we present our solution for the WSDM2023 Toloka Visual\nQuestion Answering Challenge. Inspired by the application of multimodal\npre-trained models to various downstream tasks (e.g., visual question answering,\nvisual grounding, and cross-modal retrieval), we approached this competition as\na visual grounding task, where the input is an image and a question, guiding\nthe model to answer the question and display the answer as a bounding box on\nthe image. We designed a three-stage solution for this task. Specifically, we\nused the visual-language pre-trained model OFA as the foundation. In the first\nstage, we constructed a large-scale synthetic dataset similar to the\ncompetition dataset and coarse-tuned the model to learn generalized semantic\ninformation. In the second stage, we treated the competition task as a visual\ngrounding task, loaded the weights from the previous stage, and continued to\nfine-tune the model on the competition dataset, transferring the semantic\ninformation learned in the first stage to the competition task. Finally, we\ndesigned a bounding box matching and replacing post-processing strategy to\ncorrect the model's prediction results. Our team achieved a score of 76.342 on\nthe final leaderboard, ranking second.\n","authors":["Xiangyu Wu","Zhouyang Chi","Yang Yang","Jianfeng Lu"],"pdf_url":"https://arxiv.org/pdf/2407.04255v1.pdf","comment":"Second Place of WSDM2023 Toloka Visual Question Answering Challenge"},{"id":"http://arxiv.org/abs/2407.04249v1","updated":"2024-07-05T04:37:39Z","published":"2024-07-05T04:37:39Z","title":"FeatureSORT: Essential Features for Effective Tracking","summary":" In this work, we introduce a novel tracker designed for online multiple\nobject tracking with a focus on being simple, while being effective. We provide\nmultiple feature modules, each of which represents a particular type of appearance\ninformation. By integrating distinct appearance features, including clothing\ncolor, style, and target direction, alongside a ReID network for robust\nembedding extraction, our tracker significantly enhances online tracking\naccuracy. Additionally, we propose the incorporation of a stronger detector and\nalso provide advanced post-processing methods that further elevate the\ntracker's performance. 
During real-time operation, we establish a measurement-to-track association\ndistance function that includes IoU, direction, color,\nstyle, and ReID feature similarity information, where each metric is\ncalculated separately. With the design of our feature-related distance\nfunction, it is possible to track objects through longer periods of occlusion,\nwhile keeping the number of identity switches comparatively low. Extensive\nexperimental evaluation demonstrates notable improvement in tracking accuracy\nand reliability, as evidenced by reduced identity switches and enhanced\nocclusion handling. These advancements not only contribute to the state of the\nart in object tracking but also open new avenues for future research and\npractical applications demanding high precision and reliability.\n","authors":["Hamidreza Hashempoor","Rosemary Koikara","Yu Dong Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.04249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04247v1","updated":"2024-07-05T04:28:46Z","published":"2024-07-05T04:28:46Z","title":"ArAIEval Shared Task: Propagandistic Techniques Detection in Unimodal\n and Multimodal Arabic Content","summary":" We present an overview of the second edition of the ArAIEval shared task,\norganized as part of the ArabicNLP 2024 conference co-located with ACL 2024. In\nthis edition, ArAIEval offers two tasks: (i) detection of propagandistic\ntextual spans with persuasion techniques identification in tweets and news\narticles, and (ii) distinguishing between propagandistic and non-propagandistic\nmemes. A total of 14 teams participated in the final evaluation phase, with 6\nand 9 teams participating in Tasks 1 and 2, respectively. Finally, 11 teams\nsubmitted system description papers. Across both tasks, we observed that\nfine-tuning transformer models such as AraBERT was at the core of the majority\nof the participating systems. We provide a description of the task setup,\nincluding a description of the dataset construction and the evaluation setup.\nWe further provide a brief overview of the participating systems. All datasets\nand evaluation scripts are released to the research community\n(https://araieval.gitlab.io/). We hope this will enable further research on\nthese important tasks in Arabic.\n","authors":["Maram Hasanain","Md. Arid Hasan","Fatema Ahmed","Reem Suwaileh","Md. Rafiul Biswas","Wajdi Zaghouani","Firoj Alam"],"pdf_url":"https://arxiv.org/pdf/2407.04247v1.pdf","comment":"propaganda, span detection, disinformation, misinformation, fake\n news, LLMs, GPT-4, multimodality, multimodal LLMs"},{"id":"http://arxiv.org/abs/2407.04245v1","updated":"2024-07-05T04:14:50Z","published":"2024-07-05T04:14:50Z","title":"Every Pixel Has its Moments: Ultra-High-Resolution Unpaired\n Image-to-Image Translation via Dense Normalization","summary":" Recent advancements in ultra-high-resolution unpaired image-to-image\ntranslation have aimed to mitigate the constraints imposed by limited GPU\nmemory through patch-wise inference. Nonetheless, existing methods often\ncompromise between the reduction of noticeable tiling artifacts and the\npreservation of color and hue contrast, attributed to the reliance on global\nimage- or patch-level statistics in the instance normalization layers. In this\nstudy, we introduce a Dense Normalization (DN) layer designed to estimate\npixel-level statistical moments. This approach effectively diminishes tiling\nartifacts while concurrently preserving local color and hue contrasts. 
To\naddress the computational demands of pixel-level estimation, we further propose\nan efficient interpolation algorithm. Moreover, we invent a parallelism\nstrategy that enables the DN layer to operate in a single pass. Through\nextensive experiments, we demonstrate that our method surpasses all existing\napproaches in performance. Notably, our DN layer is hyperparameter-free and can\nbe seamlessly integrated into most unpaired image-to-image translation\nframeworks without necessitating retraining. Overall, our work paves the way\nfor future exploration in handling images of arbitrary resolutions within the\nrealm of unpaired image-to-image translation. Code is available at:\nhttps://github.com/Kaminyou/Dense-Normalization.\n","authors":["Ming-Yang Ho","Che-Ming Wu","Min-Sheng Wu","Yufeng Jane Tseng"],"pdf_url":"https://arxiv.org/pdf/2407.04245v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04243v1","updated":"2024-07-05T04:11:09Z","published":"2024-07-05T04:11:09Z","title":"Exploration of Class Center for Fine-Grained Visual Classification","summary":" Different from large-scale classification tasks, fine-grained visual\nclassification is a challenging task due to two critical problems: 1) evident\nintra-class variances and subtle inter-class differences, and 2) overfitting\nowing to fewer training samples in datasets. Most existing methods extract key\nfeatures to reduce intra-class variances, but pay no attention to subtle\ninter-class differences in fine-grained visual classification. To address this\nissue, we propose a loss function named exploration of class center, which\nconsists of a multiple class-center constraint and a class-center label\ngeneration. This loss function fully utilizes the information of the class\ncenter from the perspective of features and labels. From the feature\nperspective, the multiple class-center constraint pulls samples closer to the\ntarget class center, and pushes samples away from the most similar non-target\nclass center. Thus, the constraint reduces intra-class variances and enlarges\ninter-class differences. From the label perspective, the class-center label\ngeneration utilizes class-center distributions to generate soft labels to\nalleviate overfitting. Our method can be easily integrated with existing\nfine-grained visual classification approaches as a loss function, to further\nboost performance with only slight training costs. Extensive\nexperiments are conducted to demonstrate consistent improvements achieved by\nour method on four widely-used fine-grained visual classification datasets. In\nparticular, our method achieves state-of-the-art performance on the\nFGVC-Aircraft and CUB-200-2011 datasets.\n","authors":["Hang Yao","Qiguang Miao","Peipei Zhao","Chaoneng Li","Xin Li","Guanwen Feng","Ruyi Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04243v1.pdf","comment":"Accepted by TCSVT. Code and trained models are\n here: https://github.com/hyao1/ECC"},{"id":"http://arxiv.org/abs/2403.05023v2","updated":"2024-07-05T04:10:06Z","published":"2024-03-08T03:55:27Z","title":"Towards Multimodal Sentiment Analysis Debiasing via Bias Purification","summary":" Multimodal Sentiment Analysis (MSA) aims to understand human intentions by\nintegrating emotion-related clues from diverse modalities, such as visual,\nlanguage, and audio. Unfortunately, the current MSA task invariably suffers\nfrom unplanned dataset biases, particularly multimodal utterance-level label\nbias and word-level context bias. 
These harmful biases potentially mislead\nmodels to focus on statistical shortcuts and spurious correlations, causing\nsevere performance bottlenecks. To alleviate these issues, we present a\nMultimodal Counterfactual Inference Sentiment (MCIS) analysis framework based\non causality rather than conventional likelihood. Concretely, we first\nformulate a causal graph to discover harmful biases from already-trained\nvanilla models. In the inference phase, given a factual multimodal input, MCIS\nimagines two counterfactual scenarios to purify and mitigate these biases.\nThen, MCIS can make unbiased decisions from biased observations by comparing\nfactual and counterfactual outcomes. We conduct extensive experiments on\nseveral standard MSA benchmarks. Qualitative and quantitative results show the\neffectiveness of the proposed framework.\n","authors":["Dingkang Yang","Mingcheng Li","Dongling Xiao","Yang Liu","Kun Yang","Zhaoyu Chen","Yuzheng Wang","Peng Zhai","Ke Li","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05023v2.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2404.15264v2","updated":"2024-07-05T04:09:46Z","published":"2024-04-23T17:55:07Z","title":"TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via\n Gaussian Splatting","summary":" Radiance fields have demonstrated impressive performance in synthesizing\nlifelike 3D talking heads. However, due to the difficulty in fitting steep\nappearance changes, the prevailing paradigm that presents facial motions by\ndirectly modifying point appearance may lead to distortions in dynamic regions.\nTo tackle this challenge, we introduce TalkingGaussian, a deformation-based\nradiance fields framework for high-fidelity talking head synthesis. Leveraging\nthe point-based Gaussian Splatting, facial motions can be represented in our\nmethod by applying smooth and continuous deformations to persistent Gaussian\nprimitives, without needing to learn the difficult appearance changes as previous\nmethods do. Due to this simplification, precise facial motions can be\nsynthesized while keeping a highly intact facial feature. Under such a\ndeformation paradigm, we further identify a face-mouth motion inconsistency\nthat would affect the learning of detailed speaking motions. To address this\nconflict, we decompose the model into two branches separately for the face and\ninside mouth areas, therefore simplifying the learning tasks to help\nreconstruct more accurate motion and structure of the mouth region. Extensive\nexperiments demonstrate that our method renders high-quality lip-synchronized\ntalking head videos, with better facial fidelity and higher efficiency compared\nwith previous methods.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jin Zheng","Xin Ning","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2404.15264v2.pdf","comment":"Accepted at ECCV 2024. Project page:\n https://fictionarry.github.io/TalkingGaussian/"},{"id":"http://arxiv.org/abs/2407.04242v1","updated":"2024-07-05T04:09:30Z","published":"2024-07-05T04:09:30Z","title":"Fine-grained Context and Multi-modal Alignment for Freehand 3D\n Ultrasound Reconstruction","summary":" Fine-grained spatio-temporal learning is crucial for freehand 3D ultrasound\nreconstruction. Previous works mainly resorted to coarse-grained spatial\nfeatures and separated temporal dependency learning, and struggle with\nfine-grained spatio-temporal learning. 
Mining spatio-temporal information in\nfine-grained scales is extremely challenging due to learning difficulties in\nlong-range dependencies. In this context, we propose a novel method to exploit\nthe long-range dependency management capabilities of the state space model\n(SSM) to address the above challenge. Our contribution is three-fold. First, we\npropose ReMamba, which mines multi-scale spatio-temporal information by\ndevising a multi-directional SSM. Second, we propose an adaptive fusion\nstrategy that introduces multiple inertial measurement units as auxiliary\ntemporal information to enhance spatio-temporal perception. Last, we design an\nonline alignment strategy that encodes the temporal information as pseudo\nlabels for multi-modal alignment to further improve reconstruction performance.\nExtensive experimental validations on two large-scale datasets show remarkable\nimprovement from our method over competitors.\n","authors":["Zhongnuo Yan","Xin Yang","Mingyuan Luo","Jiongquan Chen","Rusi Chen","Lian Liu","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2407.04242v1.pdf","comment":"Accepted at MICCAI 2024. This is the submitted manuscript and the\n preprint has not undergone peer review (when applicable) or any\n post-submission improvements or corrections"},{"id":"http://arxiv.org/abs/2407.04241v1","updated":"2024-07-05T04:00:14Z","published":"2024-07-05T04:00:14Z","title":"AnySR: Realizing Image Super-Resolution as Any-Scale, Any-Resource","summary":" In an effort to improve the efficiency and scalability of single-image\nsuper-resolution (SISR) applications, we introduce AnySR, to rebuild existing\narbitrary-scale SR methods into any-scale, any-resource implementation. As a\ncontrast to off-the-shelf methods that solve SR tasks across various scales\nwith the same computing costs, our AnySR innovates in: 1) building\narbitrary-scale tasks as any-resource implementation, reducing resource\nrequirements for smaller scales without additional parameters; 2) enhancing\nany-scale performance in a feature-interweaving fashion, inserting scale pairs\ninto features at regular intervals and ensuring correct feature/scale\nprocessing. The efficacy of our AnySR is fully demonstrated by rebuilding most\nexisting arbitrary-scale SISR methods and validating on five popular SISR test\ndatasets. The results show that our AnySR implements SISR tasks in a\ncomputing-more-efficient fashion, and performs on par with existing\narbitrary-scale SISR methods. For the first time, we realize SISR tasks as not\nonly any-scale in literature, but also as any-resource. Code is available at\nhttps://github.com/CrispyFeSo4/AnySR.\n","authors":["Wengyi Zhan","Mingbao Lin","Chia-Wen Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.04241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04237v1","updated":"2024-07-05T03:43:08Z","published":"2024-07-05T03:43:08Z","title":"GSD: View-Guided Gaussian Splatting Diffusion for 3D Reconstruction","summary":" We present GSD, a diffusion model approach based on Gaussian Splatting (GS)\nrepresentation for 3D object reconstruction from a single view. Prior works\nsuffer from inconsistent 3D geometry or mediocre rendering quality due to\nimproper representations. We take a step towards resolving these shortcomings\nby utilizing the recent state-of-the-art 3D explicit representation, Gaussian\nSplatting, and an unconditional diffusion model. This model learns to generate\n3D objects represented by sets of GS ellipsoids. 
With these strong generative\n3D priors, though learning unconditionally, the diffusion model is ready for\nview-guided reconstruction without further model fine-tuning. This is achieved\nby propagating fine-grained 2D features through the efficient yet flexible\nsplatting function and the guided denoising sampling process. In addition, a 2D\ndiffusion model is further employed to enhance rendering fidelity, and improve\nreconstructed GS quality by polishing and re-using the rendered images. The\nfinal reconstructed objects explicitly come with high-quality 3D structure and\ntexture, and can be efficiently rendered in arbitrary views. Experiments on the\nchallenging real-world CO3D dataset demonstrate the superiority of our\napproach.\n","authors":["Yuxuan Mu","Xinxin Zuo","Chuan Guo","Yilin Wang","Juwei Lu","Xiaofeng Wu","Songcen Xu","Peng Dai","Youliang Yan","Li Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.04237v1.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2404.17245v2","updated":"2024-07-05T03:28:16Z","published":"2024-04-26T08:35:46Z","title":"Parameter Efficient Fine-tuning of Self-supervised ViTs without\n Catastrophic Forgetting","summary":" Artificial neural networks often suffer from catastrophic forgetting, where\nlearning new concepts leads to a complete loss of previously acquired\nknowledge. We observe that this issue is particularly magnified in vision\ntransformers (ViTs), where post-pre-training and fine-tuning on new tasks can\nsignificantly degrade the model's original general abilities. For instance, a\nDINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on\nImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming\nthis stability-plasticity dilemma is crucial for enabling ViTs to continuously\nlearn and adapt to new domains while preserving their initial knowledge. In\nthis work, we study two new parameter-efficient fine-tuning strategies:\n(1) Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal\nthat using either Block Expansion or LoRA on self-supervised pre-trained ViTs\nsurpasses fully fine-tuned ViTs in new domains while offering significantly\ngreater parameter efficiency. Notably, we find that Block Expansion experiences\nonly a minimal performance drop in the pre-training domain, thereby effectively\nmitigating catastrophic forgetting in pre-trained ViTs.\n","authors":["Reza Akbarian Bafghi","Nidhin Harilal","Claire Monteleoni","Maziar Raissi"],"pdf_url":"https://arxiv.org/pdf/2404.17245v2.pdf","comment":"Accepted at eLVM Workshop, CVPR, 2024"},{"id":"http://arxiv.org/abs/2407.04231v1","updated":"2024-07-05T03:19:32Z","published":"2024-07-05T03:19:32Z","title":"Efficient GANs for Document Image Binarization Based on DWT and\n Normalization","summary":" For the document image binarization task, generative adversarial networks (GANs)\ncan generate images where shadows and noise are effectively removed, which\nallow for text information extraction. The current state-of-the-art (SOTA)\nmethod proposes a three-stage network architecture that utilizes six GANs.\nDespite its excellent model performance, the SOTA network architecture requires\nlong training and inference times. To overcome this problem, this work\nintroduces an efficient GAN method based on the three-stage network\narchitecture that incorporates the Discrete Wavelet Transformation and\nnormalization to reduce the input image size, which, in turn, decreases both\ntraining and inference times. 
In addition, this work presents novel generators,\ndiscriminators, and loss functions to improve the model's performance.\nExperimental results show that the proposed method reduces the training time by\n10% and the inference time by 26% when compared to the SOTA method while\nmaintaining the model performance at 73.79 of Avg-Score. Our implementation\ncode is available on GitHub at\nhttps://github.com/RuiyangJu/Efficient_Document_Image_Binarization.\n","authors":["Rui-Yang Ju","KokSheik Wong","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.04231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00347v3","updated":"2024-07-05T03:16:22Z","published":"2023-07-01T13:53:14Z","title":"Spatial-Temporal Graph Enhanced DETR Towards Multi-Frame 3D Object\n Detection","summary":" The Detection Transformer (DETR) has revolutionized the design of CNN-based\nobject detection systems, showcasing impressive performance. However, its\npotential in the domain of multi-frame 3D object detection remains largely\nunexplored. In this paper, we present STEMD, a novel end-to-end framework that\nenhances the DETR-like paradigm for multi-frame 3D object detection by\naddressing three key aspects specifically tailored for this task. First, to\nmodel the inter-object spatial interaction and complex temporal dependencies,\nwe introduce the spatial-temporal graph attention network, which represents\nqueries as nodes in a graph and enables effective modeling of object\ninteractions within a social context. To solve the problem of missing hard\ncases in the proposed output of the encoder in the current frame, we\nincorporate the output of the previous frame to initialize the query input of\nthe decoder. Finally, it poses a challenge for the network to distinguish\nbetween the positive query and other highly similar queries that are not the\nbest match. And similar queries are insufficiently suppressed and turn into\nredundant prediction boxes. To address this issue, our proposed IoU\nregularization term encourages similar queries to be distinct during the\nrefinement. Through extensive experiments, we demonstrate the effectiveness of\nour approach in handling challenging scenarios, while incurring only a minor\nadditional computational overhead. The code is publicly available at\nhttps://github.com/Eaphan/STEMD.\n","authors":["Yifan Zhang","Zhiyu Zhu","Junhui Hou","Dapeng Wu"],"pdf_url":"https://arxiv.org/pdf/2307.00347v3.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.02700v2","updated":"2024-07-05T03:11:17Z","published":"2024-05-04T16:06:50Z","title":"Identification of Novel Modes in Generative Models via Fourier-based\n Differential Clustering","summary":" An interpretable comparison of generative models requires the identification\nof sample types produced more frequently by each of the involved models. While\nseveral quantitative scores have been proposed in the literature to rank\ndifferent generative models, such score-based evaluations do not reveal the\nnuanced differences between the generative models in capturing various sample\ntypes. In this work, we attempt to solve a differential clustering problem to\ndetect sample types expressed differently by two generative models. To solve\nthe differential clustering problem, we propose a method called Fourier-based\nIdentification of Novel Clusters (FINC) to identify modes produced by a\ngenerative model with a higher frequency in comparison to a reference\ndistribution. 
FINC provides a scalable stochastic algorithm based on random\nFourier features to estimate the eigenspace of kernel covariance matrices of\ntwo generative models and utilize the principal eigendirections to detect the\nsample types present more dominantly in each model. We demonstrate the\napplication of the FINC method to large-scale computer vision datasets and\ngenerative model frameworks. Our numerical results suggest the scalability of\nthe developed Fourier-based method in highlighting the sample types produced\nwith different frequencies by widely-used generative models. Code is available\nat \\url{https://github.com/buyeah1109/FINC}\n","authors":["Jingwei Zhang","Mohammad Jalali","Cheuk Ting Li","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2405.02700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04230v1","updated":"2024-07-05T03:10:13Z","published":"2024-07-05T03:10:13Z","title":"A Physical Model-Guided Framework for Underwater Image Enhancement and\n Depth Estimation","summary":" Due to the selective absorption and scattering of light by diverse aquatic\nmedia, underwater images usually suffer from various visual degradations.\nExisting underwater image enhancement (UIE) approaches that combine underwater\nphysical imaging models with neural networks often fail to accurately estimate\nimaging model parameters such as depth and veiling light, resulting in poor\nperformance in certain scenarios. To address this issue, we propose a physical\nmodel-guided framework for jointly training a Deep Degradation Model (DDM) with\nany advanced UIE model. DDM includes three well-designed sub-networks to\naccurately estimate various imaging parameters: a veiling light estimation\nsub-network, a factors estimation sub-network, and a depth estimation\nsub-network. Based on the estimated parameters and the underwater physical\nimaging model, we impose physical constraints on the enhancement process by\nmodeling the relationship between underwater images and desired clean images,\ni.e., outputs of the UIE model. Moreover, while our framework is compatible\nwith any UIE model, we design a simple yet effective fully convolutional UIE\nmodel, termed UIEConv. UIEConv utilizes both global and local features for\nimage enhancement through a dual-branch structure. UIEConv trained within our\nframework achieves remarkable enhancement results across diverse underwater\nscenes. Furthermore, as a byproduct of UIE, the trained depth estimation\nsub-network enables accurate underwater scene depth estimation. Extensive\nexperiments conducted in various real underwater imaging scenarios, including\ndeep-sea environments with artificial light sources, validate the effectiveness\nof our framework and the UIEConv model.\n","authors":["Dazhao Du","Enhan Li","Lingyu Si","Fanjiang Xu","Jianwei Niu","Fuchun Sun"],"pdf_url":"https://arxiv.org/pdf/2407.04230v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2406.18898v2","updated":"2024-07-05T02:56:10Z","published":"2024-06-27T05:26:38Z","title":"360 in the Wild: Dataset for Depth Prediction and View Synthesis","summary":" The large abundance of perspective camera datasets facilitated the emergence\nof novel learning-based strategies for various tasks, such as camera\nlocalization, single image depth estimation, or view synthesis. 
However,\npanoramic or omnidirectional image datasets, including essential information,\nsuch as pose and depth, are mostly made with synthetic scenes. In this work, we\nintroduce a large-scale 360$^{\\circ}$ video dataset in the wild. This dataset\nhas been carefully scraped from the Internet and has been captured from various\nlocations worldwide. Hence, this dataset exhibits very diversified environments\n(e.g., indoor and outdoor) and contexts (e.g., with and without moving\nobjects). Each of the 25K images constituting our dataset is provided with its\nrespective camera's pose and depth map. We illustrate the relevance of our\ndataset for two main tasks, namely, single image depth estimation and view\nsynthesis.\n","authors":["Kibaek Park","Francois Rameau","Jaesik Park","In So Kweon"],"pdf_url":"https://arxiv.org/pdf/2406.18898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04218v1","updated":"2024-07-05T02:13:47Z","published":"2024-07-05T02:13:47Z","title":"Batch Transformer: Look for Attention in Batch","summary":" Facial expression recognition (FER) has received considerable attention in\ncomputer vision, with \"in-the-wild\" environments such as human-computer\ninteraction. However, FER images contain uncertainties such as occlusion, low\nresolution, pose variation, illumination variation, and subjectivity, which\nincludes some expressions that do not match the target label. Consequently,\nlittle information is obtained from a noisy single image and it is not trusted.\nThis could significantly degrade the performance of the FER task. To address\nthis issue, we propose a batch transformer (BT), which consists of the proposed\nclass batch attention (CBA) module, to prevent overfitting in noisy data and\nextract trustworthy information by training on features reflected from several\nimages in a batch, rather than information from a single image. We also propose\nmulti-level attention (MLA) to prevent overfitting the specific features by\ncapturing correlations between each level. In this paper, we present a batch\ntransformer network (BTN) that combines the above proposals. Experimental\nresults on various FER benchmark datasets show that the proposed BTN\nconsistently outperforms the state-of-the-art on FER datasets. Representative\nresults demonstrate the promise of the proposed BTN for FER.\n","authors":["Myung Beom Her","Jisu Jeong","Hojoon Song","Ji-Hyeong Han"],"pdf_url":"https://arxiv.org/pdf/2407.04218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07125v4","updated":"2024-07-05T01:59:23Z","published":"2023-11-13T07:34:53Z","title":"Attention-Challenging Multiple Instance Learning for Whole Slide Image\n Classification","summary":" In the application of Multiple Instance Learning (MIL) methods for Whole\nSlide Image (WSI) classification, attention mechanisms often focus on a subset\nof discriminative instances, which are closely linked to overfitting. To\nmitigate overfitting, we present Attention-Challenging MIL (ACMIL). ACMIL\ncombines two techniques based on separate analyses for attention value\nconcentration. Firstly, UMAP of instance features reveals various patterns\namong discriminative instances, with existing attention mechanisms capturing\nonly some of them. 
To remedy this, we introduce Multiple Branch Attention (MBA)\nto capture more discriminative instances using multiple attention branches.\nSecondly, the examination of the cumulative value of Top-K attention scores\nindicates that a tiny number of instances dominate the majority of attention.\nIn response, we present Stochastic Top-K Instance Masking (STKIM), which masks\nout a portion of instances with Top-K attention values and allocates their\nattention values to the remaining instances. The extensive experimental results\non three WSI datasets with two pre-trained backbones reveal that our ACMIL\noutperforms state-of-the-art methods. Additionally, through heatmap\nvisualization and UMAP visualization, this paper extensively illustrates\nACMIL's effectiveness in suppressing attention value concentration and\novercoming the overfitting challenge. The source code is available at\n\\url{https://github.com/dazhangyu123/ACMIL}.\n","authors":["Yunlong Zhang","Honglin Li","Yuxuan Sun","Sunyi Zheng","Chenglu Zhu","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.07125v4.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2406.14098v2","updated":"2024-07-05T01:56:29Z","published":"2024-06-20T08:24:28Z","title":"HeartBeat: Towards Controllable Echocardiography Video Synthesis with\n Multimodal Conditions-Guided Diffusion Models","summary":" Echocardiography (ECHO) video is widely used for cardiac examination. In\nclinical practice, this procedure heavily relies on operator experience, which requires\nyears of training and may need the assistance of deep learning-based systems for\nenhanced accuracy and efficiency. However, it is challenging since acquiring\nsufficient customized data (e.g., abnormal cases) for novice training and deep\nmodel development is clinically unrealistic. Hence, controllable ECHO video\nsynthesis is highly desirable. In this paper, we propose a novel\ndiffusion-based framework named HeartBeat towards controllable and\nhigh-fidelity ECHO video synthesis. Our highlight is three-fold. First,\nHeartBeat serves as a unified framework that enables perceiving multimodal\nconditions simultaneously to guide controllable generation. Second, we\nfactorize the multimodal conditions into local and global ones, with two\ninsertion strategies separately providing fine- and coarse-grained controls in a\ncomposable and flexible manner. In this way, users can synthesize ECHO videos\nthat conform to their mental imagery by combining multimodal control signals.\nThird, we propose to decouple the visual concepts and temporal dynamics\nlearning using a two-stage training scheme for simplifying the model training.\nOne more interesting thing is that HeartBeat can easily generalize to\nmask-guided cardiac MRI synthesis in a few shots, showcasing its scalability to\nbroader applications. Extensive experiments on two public datasets show the\nefficacy of the proposed HeartBeat.\n","authors":["Xinrui Zhou","Yuhao Huang","Wufeng Xue","Haoran Dou","Jun Cheng","Han Zhou","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2406.14098v2.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.04215v1","updated":"2024-07-05T01:53:21Z","published":"2024-07-05T01:53:21Z","title":"T2IShield: Defending Against Backdoors on Text-to-Image Diffusion Models","summary":" While text-to-image diffusion models demonstrate impressive generation\ncapabilities, they also exhibit vulnerability to backdoor attacks, which\ninvolve the manipulation of model outputs through malicious triggers. 
In this\npaper, for the first time, we propose a comprehensive defense method named\nT2IShield to detect, localize, and mitigate such attacks. Specifically, we find\nthe \"Assimilation Phenomenon\" on the cross-attention maps caused by the\nbackdoor trigger. Based on this key insight, we propose two effective backdoor\ndetection methods: Frobenius Norm Threshold Truncation and Covariance\nDiscriminant Analysis. Besides, we introduce a binary-search approach to\nlocalize the trigger within a backdoor sample and assess the efficacy of\nexisting concept editing methods in mitigating backdoor attacks. Empirical\nevaluations on two advanced backdoor attack scenarios show the effectiveness of\nour proposed defense method. For backdoor sample detection, T2IShield achieves\na detection F1 score of 88.9$\\%$ with low computational cost. Furthermore,\nT2IShield achieves a localization F1 score of 86.4$\\%$ and invalidates 99$\\%$\npoisoned samples. Codes are released at https://github.com/Robin-WZQ/T2IShield.\n","authors":["Zhongqi Wang","Jie Zhang","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04215v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2308.10287v2","updated":"2024-07-05T01:37:27Z","published":"2023-08-20T14:53:27Z","title":"ASY-VRNet: Waterway Panoptic Driving Perception Model based on\n Asymmetric Fair Fusion of Vision and 4D mmWave Radar","summary":" Panoptic Driving Perception (PDP) is critical for the autonomous navigation\nof Unmanned Surface Vehicles (USVs). A PDP model typically integrates multiple\ntasks, necessitating the simultaneous and robust execution of various\nperception tasks to facilitate downstream path planning. The fusion of visual\nand radar sensors is currently acknowledged as a robust and cost-effective\napproach. However, most existing research has primarily focused on fusing\nvisual and radar features dedicated to object detection or utilizing a shared\nfeature space for multiple tasks, neglecting the individual representation\ndifferences between various tasks. To address this gap, we propose a pair of\nAsymmetric Fair Fusion (AFF) modules with favorable explainability designed to\nefficiently interact with independent features from both visual and radar\nmodalities, tailored to the specific requirements of object detection and\nsemantic segmentation tasks. The AFF modules treat image and radar maps as\nirregular point sets and transform these features into a crossed-shared feature\nspace for multitasking, ensuring equitable treatment of vision and radar point\ncloud features. Leveraging AFF modules, we propose a novel and efficient PDP\nmodel, ASY-VRNet, which processes image and radar features based on irregular\nsuper-pixel point sets. Additionally, we propose an effective multitask\nlearning method specifically designed for PDP models. Compared to other\nlightweight models, ASY-VRNet achieves state-of-the-art performance in object\ndetection, semantic segmentation, and drivable-area segmentation on the\nWaterScenes benchmark. 
Our project is publicly available at\nhttps://github.com/GuanRunwei/ASY-VRNet.\n","authors":["Runwei Guan","Shanliang Yao","Xiaohui Zhu","Ka Lok Man","Yong Yue","Jeremy Smith","Eng Gee Lim","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2308.10287v2.pdf","comment":"Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2407.04208v1","updated":"2024-07-05T01:35:42Z","published":"2024-07-05T01:35:42Z","title":"AMD: Automatic Multi-step Distillation of Large-scale Vision Models","summary":" Transformer-based architectures have become the de-facto standard models for\ndiverse vision tasks owing to their superior performance. As the size of the\nmodels continues to scale up, model distillation becomes extremely important in\nvarious real applications, particularly on devices limited by computational\nresources. However, prevailing knowledge distillation methods exhibit\ndiminished efficacy when confronted with a large capacity gap between the\nteacher and the student, e.g., a 10x compression rate. In this paper, we present a\nnovel approach named Automatic Multi-step Distillation (AMD) for large-scale\nvision model compression. In particular, our distillation process unfolds\nacross multiple steps. Initially, the teacher undergoes distillation to form an\nintermediate teacher-assistant model, which is subsequently distilled further\nto the student. An efficient and effective optimization framework is introduced\nto automatically identify the optimal teacher-assistant that leads to the\nmaximal student performance. We conduct extensive experiments on multiple image\nclassification datasets, including CIFAR-10, CIFAR-100, and ImageNet. The\nfindings consistently reveal that our approach outperforms several established\nbaselines, paving a path for future knowledge distillation methods on\nlarge-scale vision models.\n","authors":["Cheng Han","Qifan Wang","Sohail A. Dianat","Majid Rabbani","Raghuveer M. Rao","Yi Fang","Qiang Guan","Lifu Huang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04208v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.04207v1","updated":"2024-07-05T01:30:42Z","published":"2024-07-05T01:30:42Z","title":"Elevating All Zero-Shot Sketch-Based Image Retrieval Through Multimodal\n Prompt Learning","summary":" We address the challenges inherent in sketch-based image retrieval (SBIR)\nacross various settings, including zero-shot SBIR, generalized zero-shot SBIR,\nand fine-grained zero-shot SBIR, by leveraging the vision-language foundation\nmodel, CLIP. While recent endeavors have employed CLIP to enhance SBIR, these\napproaches predominantly follow uni-modal prompt processing and fail to\nfully exploit CLIP's integrated visual and textual capabilities. To bridge this\ngap, we introduce SpLIP, a novel multi-modal prompt learning scheme designed to\noperate effectively with frozen CLIP backbones. We diverge from existing\nmulti-modal prompting methods that either treat visual and textual prompts\nindependently or integrate them in a limited fashion, leading to suboptimal\ngeneralization. SpLIP implements a bi-directional prompt-sharing strategy that\nenables mutual knowledge exchange between CLIP's visual and textual encoders,\nfostering a more cohesive and synergistic prompt processing mechanism that\nsignificantly reduces the semantic gap between the sketch and photo embeddings.\nIn addition to pioneering multi-modal prompt learning, we propose two\ninnovative strategies for further refining the embedding space. 
The first is an\nadaptive margin generation for the sketch-photo triplet loss, regulated by\nCLIP's class textual embeddings. The second introduces a novel task, termed\nconditional cross-modal jigsaw, aimed at enhancing fine-grained sketch-photo\nalignment, by focusing on implicitly modelling the viable patch arrangement of\nsketches using knowledge of unshuffled photos. Our comprehensive experimental\nevaluations across multiple benchmarks demonstrate the superior performance of\nSpLIP in all three SBIR scenarios. Code is available at\nhttps://github.com/mainaksingha01/SpLIP.\n","authors":["Mainak Singha","Ankit Jha","Divyam Gupta","Pranav Singla","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2407.04207v1.pdf","comment":"Accepted in ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04203v1","updated":"2024-07-05T01:02:12Z","published":"2024-07-05T01:02:12Z","title":"HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for\n Ultrasound Image Segmentation","summary":" Accurate ultrasound segmentation is pursued because it aids clinicians in\nachieving a comprehensive diagnosis. Due to the presence of low image quality\nand high costs associated with annotation, two primary concerns arise: (1)\nenhancing the understanding of multi-scale features, and (2) improving the\nresistance to data dependency. To mitigate these concerns, we propose HCS-TNAS,\na novel neural architecture search (NAS) method that automatically designs the\nnetwork. For the first concern, we employ multi-level searching encompassing\ncellular, layer, and module levels. Specifically, we design an Efficient\nNAS-ViT module that searches for multi-scale tokens in the vision Transformer\n(ViT) to capture context and local information, rather than relying solely on\nsimple combinations of operations. For the second concern, we propose a hybrid\nconstraint-driven semi-supervised learning method that considers additional\nnetwork independence and incorporates contrastive loss in a NAS formulation. By\nfurther developing a stage-wise optimization strategy, a rational network\nstructure can be identified. Extensive experiments on three publicly available\nultrasound image datasets demonstrate that HCS-TNAS effectively improves\nsegmentation accuracy and outperforms state-of-the-art methods.\n","authors":["Renqi Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04190v1","updated":"2024-07-05T00:18:40Z","published":"2024-07-05T00:18:40Z","title":"Computer Vision for Clinical Gait Analysis: A Gait Abnormality Video\n Dataset","summary":" Clinical gait analysis (CGA) using computer vision is an emerging field in\nartificial intelligence that faces barriers of accessible, real-world data, and\nclear task objectives. This paper lays the foundation for current developments\nin CGA as well as vision-based methods and datasets suitable for gait analysis.\nWe introduce The Gait Abnormality in Video Dataset (GAVD) in response to our\nreview of over 150 current gait-related computer vision datasets, which\nhighlighted the need for a large and accessible gait dataset clinically\nannotated for CGA. GAVD stands out as the largest video gait dataset,\ncomprising 1874 sequences of normal, abnormal and pathological gaits.\nAdditionally, GAVD includes clinically annotated RGB data sourced from publicly\navailable content on online platforms. 
It also encompasses over 400 subjects\nwho have undergone clinical grade visual screening to represent a diverse range\nof abnormal gait patterns, captured in various settings, including hospital\nclinics and urban uncontrolled outdoor environments. We demonstrate the\nvalidity of the dataset and utility of action recognition models for CGA using\npretrained models Temporal Segment Networks(TSN) and SlowFast network to\nachieve video abnormality detection of 94% and 92% respectively when tested on\nGAVD dataset. A GitHub repository https://github.com/Rahmyyy/GAVD consisting of\nconvenient URL links, and clinically relevant annotation for CGA is provided\nfor over 450 online videos, featuring diverse subjects performing a range of\nnormal, pathological, and abnormal gait patterns.\n","authors":["Rahm Ranjan","David Ahmedt-Aristizabal","Mohammad Ali Armin","Juno Kim"],"pdf_url":"https://arxiv.org/pdf/2407.04190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19486v3","updated":"2024-07-05T00:15:02Z","published":"2023-05-31T01:46:14Z","title":"Instance-dependent Noisy-label Learning with Graphical Model Based\n Noise-rate Estimation","summary":" Deep learning faces a formidable challenge when handling noisy labels, as\nmodels tend to overfit samples affected by label noise. This challenge is\nfurther compounded by the presence of instance-dependent noise (IDN), a\nrealistic form of label noise arising from ambiguous sample information. To\naddress IDN, Label Noise Learning (LNL) incorporates a sample selection stage\nto differentiate clean and noisy-label samples. This stage uses an arbitrary\ncriterion and a pre-defined curriculum that initially selects most samples as\nnoisy and gradually decreases this selection rate during training. Such\ncurriculum is sub-optimal since it does not consider the actual label noise\nrate in the training set. This paper addresses this issue with a new noise-rate\nestimation method that is easily integrated with most state-of-the-art (SOTA)\nLNL methods to produce a more effective curriculum. Synthetic and real-world\nbenchmark results demonstrate that integrating our approach with SOTA LNL\nmethods improves accuracy in most cases.\n","authors":["Arpit Garg","Cuong Nguyen","Rafael Felix","Thanh-Toan Do","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2305.19486v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2306.12992v2","updated":"2024-07-05T00:03:05Z","published":"2023-06-22T15:47:58Z","title":"Minimalist and High-Quality Panoramic Imaging with PSF-aware\n Transformers","summary":" High-quality panoramic images with a Field of View (FoV) of 360{\\deg} are\nessential for contemporary panoramic computer vision tasks. However,\nconventional imaging systems come with sophisticated lens designs and heavy\noptical components. This disqualifies their usage in many mobile and wearable\napplications where thin and portable, minimalist imaging systems are desired.\nIn this paper, we propose a Panoramic Computational Imaging Engine (PCIE) to\nachieve minimalist and high-quality panoramic imaging. With less than three\nspherical lenses, a Minimalist Panoramic Imaging Prototype (MPIP) is\nconstructed based on the design of the Panoramic Annular Lens (PAL), but with\nlow-quality imaging results due to aberrations and small image plane size. We\npropose two pipelines, i.e. 
Aberration Correction (AC) and Super-Resolution and\nAberration Correction (SR&AC), to solve the image quality problems of MPIP,\nwith imaging sensors of small and large pixel size, respectively. To leverage\nthe prior information of the optical system, we propose a Point Spread Function\n(PSF) representation method to produce a PSF map as an additional modality. A\nPSF-aware Aberration-image Recovery Transformer (PART) is designed as a\nuniversal network for the two pipelines, in which the self-attention\ncalculation and feature extraction are guided by the PSF map. We train PART on\nsynthetic image pairs from simulation and put forward the PALHQ dataset to fill\nthe gap of real-world high-quality PAL images for low-level vision. A\ncomprehensive variety of experiments on synthetic and real-world benchmarks\ndemonstrates the impressive imaging results of PCIE and the effectiveness of\nthe PSF representation. We further deliver heuristic experimental findings for\nminimalist and high-quality panoramic imaging. Our dataset and code will be\navailable at https://github.com/zju-jiangqi/PCIE-PART.\n","authors":["Qi Jiang","Shaohua Gao","Yao Gao","Kailun Yang","Zhonghua Yi","Hao Shi","Lei Sun","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2306.12992v2.pdf","comment":"Accepted to IEEE Transactions on Image Processing (TIP). The dataset\n and code will be available at https://github.com/zju-jiangqi/PCIE-PART"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.11509v2","updated":"2024-07-05T16:28:47Z","published":"2024-01-21T14:35:54Z","title":"Simple Domain Adaptation for Sparse Retrievers","summary":" In Information Retrieval, and more generally in Natural Language Processing,\nadapting models to specific domains is conducted through fine-tuning. Despite\nthe successes achieved by this method and its versatility, the need for\nhuman-curated and labeled data makes it impractical to transfer to new tasks,\ndomains, and/or languages when training data doesn't exist. Using the model\nwithout training (zero-shot) is another option that however suffers an\neffectiveness cost, especially in the case of first-stage retrievers. Numerous\nresearch directions have emerged to tackle these issues, most of them in the\ncontext of adapting to a task or a language. However, the literature is scarcer\nfor domain (or topic) adaptation. In this paper, we address this issue of\ncross-topic discrepancy for a sparse first-stage retriever by transposing a\nmethod initially designed for language adaptation. By leveraging pre-training\non the target data to learn domain-specific knowledge, this technique\nalleviates the need for annotated data and expands the scope of domain\nadaptation. Despite their relatively good generalization ability, we show that\neven sparse retrievers can benefit from our simple domain adaptation method.\n","authors":["Mathias Vast","Yuxuan Zong","Basile Van Cooten","Benjamin Piwowarski","Laure Soulier"],"pdf_url":"https://arxiv.org/pdf/2401.11509v2.pdf","comment":"Accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2208.06263v3","updated":"2024-07-05T16:05:41Z","published":"2022-08-10T13:18:00Z","title":"Probabilistic Rank and Reward: A Scalable Model for Slate Recommendation","summary":" We introduce Probabilistic Rank and Reward (PRR), a scalable probabilistic\nmodel for personalized slate recommendation. Our approach allows off-policy\nestimation of the reward in the scenario where the user interacts with at most\none item from a slate of K items. 
We show that the probability of a slate being\nsuccessful can be learned efficiently by combining the reward, whether the user\nsuccessfully interacted with the slate, and the rank, the item that was\nselected within the slate. PRR outperforms existing off-policy reward\noptimizing methods and is far more scalable to large action spaces. Moreover,\nPRR allows fast delivery of recommendations powered by maximum inner product\nsearch (MIPS), making it suitable in low latency domains such as computational\nadvertising.\n","authors":["Imad Aouali","Achraf Ait Sidi Hammou","Otmane Sakhi","David Rohde","Flavian Vasile"],"pdf_url":"https://arxiv.org/pdf/2208.06263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19309v2","updated":"2024-07-05T15:48:55Z","published":"2024-06-27T16:33:40Z","title":"Which Neurons Matter in IR? Applying Integrated Gradients-based Methods\n to Understand Cross-Encoders","summary":" With the recent addition of Retrieval-Augmented Generation (RAG), the scope\nand importance of Information Retrieval (IR) has expanded. As a result, the\nimportance of a deeper understanding of IR models also increases. However,\ninterpretability in IR remains under-explored, especially when it comes to the\nmodels' inner mechanisms. In this paper, we explore the possibility of adapting\nIntegrated Gradient-based methods in an IR context to identify the role of\nindividual neurons within the model. In particular, we provide new insights\ninto the role of what we call \"relevance\" neurons, as well as how they deal\nwith unseen data. Finally, we carry out an in-depth pruning study to validate\nour findings.\n","authors":["Mathias Vast","Basile Van Cooten","Laure Soulier","Benjamin Piwowarski"],"pdf_url":"https://arxiv.org/pdf/2406.19309v2.pdf","comment":"Accepted at ICTIR 2024"},{"id":"http://arxiv.org/abs/2407.04577v1","updated":"2024-07-05T15:12:14Z","published":"2024-07-05T15:12:14Z","title":"Optimizing Nepali PDF Extraction: A Comparative Study of Parser and OCR\n Technologies","summary":" This research compares PDF parsing and Optical Character Recognition (OCR)\nmethods for extracting Nepali content from PDFs. PDF parsing offers fast and\naccurate extraction but faces challenges with non-Unicode Nepali fonts. OCR,\nspecifically PyTesseract, overcomes these challenges, providing versatility for\nboth digital and scanned PDFs. The study reveals that while PDF parsers are\nfaster, their accuracy fluctuates based on PDF types. In contrast, OCRs, with a\nfocus on PyTesseract, demonstrate consistent accuracy at the expense of\nslightly longer extraction times. Considering the project's emphasis on Nepali\nPDFs, PyTesseract emerges as the most suitable library, balancing extraction\nspeed and accuracy.\n","authors":["Prabin Paudel","Supriya Khadka","Ranju G. C.","Rahul Shah"],"pdf_url":"https://arxiv.org/pdf/2407.04577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04573v1","updated":"2024-07-05T15:08:44Z","published":"2024-07-05T15:08:44Z","title":"VRSD: Rethinking Similarity and Diversity for Retrieval in Large\n Language Models","summary":" Vector retrieval algorithms are vital for semantic queries in the evolving\nlandscape of Large Language Models (LLMs). Retrieving vectors that\nsimultaneously meet criteria for both similarity and diversity significantly\nenhances the capabilities of LLM-based agents. 
Despite the widespread use of\nthe Maximal Marginal Relevance (MMR) in retrieval scenarios with relevance and\ndiversity requirements, fluctuations caused by variations in the parameter $\n\\lambda $ within the MMR complicate the determination of the optimization\ntrajectory in vector spaces, thus obscuring the direction of enhancement.\nMoreover, there is a lack of a robust theoretical analysis for the constraints\nof similarity and diversity in retrieval processes. This paper introduces a\nnovel approach to characterizing both constraints through the relationship\nbetween the sum vector and the query vector. The proximity of these vectors\naddresses the similarity constraint, while necessitating that individual\nvectors within the sum vector divergently align with the query vector to\nsatisfy the diversity constraint. We also formulate a new combinatorial\noptimization challenge: selecting $k$ vectors from a set of\ncandidates such that their sum vector maximally aligns with the query vector, a\nproblem we demonstrate to be NP-complete. This establishes the profound\ndifficulty of pursuing similarity and diversity simultaneously in vector\nretrieval and lays a theoretical groundwork for further research. Additionally,\nwe present the heuristic algorithm Vectors Retrieval with Similarity and\nDiversity (VRSD), which not only has a definitive optimization goal and eschews\nthe need for preset parameters but also offers a modest reduction in time\ncomplexity compared to MMR. Empirical validation further confirms that VRSD\nsignificantly surpasses MMR across various datasets.\n","authors":["Hang Gao","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16807v2","updated":"2024-07-05T14:19:36Z","published":"2024-01-30T08:07:28Z","title":"Detecting LLM-Assisted Writing in Scientific Communication: Are We There\n Yet?","summary":" Large Language Models (LLMs), exemplified by ChatGPT, have significantly\nreshaped text generation, particularly in the realm of writing assistance.\nWhile ethical considerations underscore the importance of transparently\nacknowledging LLM use, especially in scientific communication, genuine\nacknowledgment remains infrequent. A potential avenue to encourage accurate\nacknowledgment of LLM-assisted writing involves employing automated detectors.\nOur evaluation of four cutting-edge LLM-generated text detectors reveals their\nsuboptimal performance compared to a simple ad-hoc detector designed to\nidentify abrupt writing style changes around the time of LLM proliferation. We\ncontend that the development of specialized detectors exclusively dedicated to\nLLM-assisted writing detection is necessary. Such detectors could play a\ncrucial role in fostering more authentic recognition of LLM involvement in\nscientific communication, addressing the current challenges in acknowledgment\npractices.\n","authors":["Teddy Lazebnik","Ariel Rosenfeld"],"pdf_url":"https://arxiv.org/pdf/2401.16807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04528v1","updated":"2024-07-05T14:16:47Z","published":"2024-07-05T14:16:47Z","title":"GPT vs RETRO: Exploring the Intersection of Retrieval and\n Parameter-Efficient Fine-Tuning","summary":" Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation\n(RAG) have become popular methods for adapting large language models while\nminimizing compute requirements. 
In this paper, we apply PEFT methods\n(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer\n(RETRO) and a baseline GPT model across several sizes, ranging from 823 million\nto 48 billion parameters. We show that RETRO models outperform GPT models in\nzero-shot settings due to their unique pre-training process but GPT models have\nhigher performance potential with PEFT. Additionally, our study indicates that\n8B parameter models strike an optimal balance between cost and performance and\nP-tuning lags behind other PEFT techniques. We further provide a comparative\nanalysis between applying PEFT to an Instruction-tuned RETRO model and a base\nRETRO model. This work presents the first comprehensive comparison of various\nPEFT methods integrated with RAG, applied to both GPT and RETRO models,\nhighlighting their relative performance.\n","authors":["Aleksander Ficek","Jiaqi Zeng","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2407.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04472v1","updated":"2024-07-05T12:42:31Z","published":"2024-07-05T12:42:31Z","title":"EventChat: Implementation and user-centric evaluation of a large\n language model-driven conversational recommender system for exploring leisure\n events in an SME context","summary":" Large language models (LLMs) represent an enormous evolution in the strategic\npotential of conversational recommender systems (CRS). Yet to date, research\nhas predominantly focused upon technical frameworks to implement LLM-driven\nCRS, rather than end-user evaluations or strategic implications for firms,\nparticularly from the perspective of small to medium enterprises (SMEs) that\nmake up the bedrock of the global economy. In the current paper, we detail the\ndesign of an LLM-driven CRS in an SME setting, and its subsequent performance\nin the field using both objective system metrics and subjective user\nevaluations. While doing so, we additionally outline a short-form revised\nResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly\nevolving field. Our results reveal good system performance from a user\nexperience perspective (85.5% recommendation accuracy) but underscore latency,\ncost, and quality issues challenging business viability. Notably, with a median\ncost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and\nresponse time emerge as crucial areas for achieving a more user-friendly and\neconomically viable LLM-driven CRS for SME settings. One major driver of these\ncosts is the use of an advanced LLM as a ranker within the retrieval-augmented\ngeneration (RAG) technique. Our results additionally indicate that relying\nsolely on approaches such as Prompt-based learning with ChatGPT as the\nunderlying LLM makes it challenging to achieve satisfactory quality in a\nproduction environment. 
Strategic considerations for SMEs deploying an\nLLM-driven CRS are outlined, particularly considering trade-offs in the current\ntechnical landscape.\n","authors":["Hannes Kunstmann","Joseph Ollier","Joel Persson","Florian von Wangenheim"],"pdf_url":"https://arxiv.org/pdf/2407.04472v1.pdf","comment":"27 pages, 3 tables, 5 figures, pre-print manuscript"},{"id":"http://arxiv.org/abs/2401.10690v2","updated":"2024-07-05T10:19:36Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04217v1","updated":"2024-07-05T02:01:49Z","published":"2024-07-05T02:01:49Z","title":"An Interactive Multi-modal Query Answering System with\n Retrieval-Augmented Large Language Models","summary":" Retrieval-augmented Large Language Models (LLMs) have reshaped traditional\nquery-answering systems, offering unparalleled user experiences. However,\nexisting retrieval techniques often struggle to handle multi-modal query\ncontexts. In this paper, we present an interactive Multi-modal Query Answering\n(MQA) system, empowered by our newly developed multi-modal retrieval framework\nand navigation graph index, integrated with cutting-edge LLMs. It comprises\nfive core components: Data Preprocessing, Vector Representation, Index\nConstruction, Query Execution, and Answer Generation, all orchestrated by a\ndedicated coordinator to ensure smooth data flow from input to answer\ngeneration. One notable aspect of MQA is its utilization of contrastive\nlearning to assess the significance of different modalities, facilitating\nprecise measurement of multi-modal information similarity. Furthermore, the\nsystem achieves efficient retrieval through our advanced navigation graph\nindex, refined using computational pruning techniques. 
Another highlight of our\nsystem is its pluggable processing framework, allowing seamless integration of\nembedding models, graph indexes, and LLMs. This flexibility provides users\ndiverse options for gaining insights from their multi-modal knowledge base. A\npreliminary video introduction of MQA is available at\nhttps://youtu.be/xvUuo2ZIqWk.\n","authors":["Mengzhao Wang","Haotian Wu","Xiangyu Ke","Yunjun Gao","Xiaoliang Xu","Lu Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04217v1.pdf","comment":"This demo paper has been accepted by VLDB 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2405.14868v2","updated":"2024-07-05T17:59:57Z","published":"2024-05-23T17:59:52Z","title":"Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis","summary":" Accurate reconstruction of complex dynamic scenes from just a single\nviewpoint continues to be a challenging task in computer vision. Current\ndynamic novel view synthesis methods typically require videos from many\ndifferent camera viewpoints, necessitating careful recording setups, and\nsignificantly restricting their utility in the wild as well as in terms of\nembodied AI applications. In this paper, we propose $\\textbf{GCD}$, a\ncontrollable monocular dynamic view synthesis pipeline that leverages\nlarge-scale diffusion priors to, given a video of any scene, generate a\nsynchronous video from any other chosen perspective, conditioned on a set of\nrelative camera pose parameters. Our model does not require depth as input, and\ndoes not explicitly model 3D scene geometry, instead performing end-to-end\nvideo-to-video translation in order to achieve its goal efficiently. Despite\nbeing trained on synthetic multi-view video data only, zero-shot real-world\ngeneralization experiments show promising results in multiple domains,\nincluding robotics, object permanence, and driving environments. We believe our\nframework can potentially unlock powerful applications in rich dynamic scene\nunderstanding, perception for robotics, and interactive 3D video viewing\nexperiences for virtual reality.\n","authors":["Basile Van Hoorick","Rundi Wu","Ege Ozguroglu","Kyle Sargent","Ruoshi Liu","Pavel Tokmakov","Achal Dave","Changxi Zheng","Carl Vondrick"],"pdf_url":"https://arxiv.org/pdf/2405.14868v2.pdf","comment":"Accepted to ECCV 2024. Project webpage is available at:\n https://gcd.cs.columbia.edu/"},{"id":"http://arxiv.org/abs/2407.04694v1","updated":"2024-07-05T17:57:02Z","published":"2024-07-05T17:57:02Z","title":"Me, Myself, and AI: The Situational Awareness Dataset (SAD) for LLMs","summary":" AI assistants such as ChatGPT are trained to respond to users by saying, \"I\nam a large language model\". This raises questions. Do such models know that\nthey are LLMs and reliably act on this knowledge? Are they aware of their\ncurrent circumstances, such as being deployed to the public? We refer to a\nmodel's knowledge of itself and its circumstances as situational awareness. To\nquantify situational awareness in LLMs, we introduce a range of behavioral\ntests, based on question answering and instruction following. These tests form\nthe $\\textbf{Situational Awareness Dataset (SAD)}$, a benchmark comprising 7\ntask categories and over 13,000 questions. 
The benchmark tests numerous\nabilities, including the capacity of LLMs to (i) recognize their own generated\ntext, (ii) predict their own behavior, (iii) determine whether a prompt is from\ninternal evaluation or real-world deployment, and (iv) follow instructions that\ndepend on self-knowledge.\n We evaluate 16 LLMs on SAD, including both base (pretrained) and chat models.\nWhile all models perform better than chance, even the highest-scoring model\n(Claude 3 Opus) is far from a human baseline on certain tasks. We also observe\nthat performance on SAD is only partially predicted by metrics of general\nknowledge (e.g. MMLU). Chat models, which are finetuned to serve as AI\nassistants, outperform their corresponding base models on SAD but not on\ngeneral knowledge tasks. The purpose of SAD is to facilitate scientific\nunderstanding of situational awareness in LLMs by breaking it down into\nquantitative abilities. Situational awareness is important because it enhances\na model's capacity for autonomous planning and action. While this has potential\nbenefits for automation, it also introduces novel risks related to AI safety\nand control. Code and latest results available at\nhttps://situational-awareness-dataset.org .\n","authors":["Rudolf Laine","Bilal Chughtai","Jan Betley","Kaivalya Hariharan","Jeremy Scheurer","Mikita Balesni","Marius Hobbhahn","Alexander Meinke","Owain Evans"],"pdf_url":"https://arxiv.org/pdf/2407.04694v1.pdf","comment":"11 page main body, 98 page appendix, 58 figures"},{"id":"http://arxiv.org/abs/2407.04690v1","updated":"2024-07-05T17:53:03Z","published":"2024-07-05T17:53:03Z","title":"Missed Causes and Ambiguous Effects: Counterfactuals Pose Challenges for\n Interpreting Neural Networks","summary":" Interpretability research takes counterfactual theories of causality for\ngranted. Most causal methods rely on counterfactual interventions to inputs or\nthe activations of particular model components, followed by observations of the\nchange in models' output logits or behaviors. While this yields more faithful\nevidence than correlational methods, counterfactuals nonetheless have key\nproblems that bias our findings in specific and predictable ways. Specifically,\n(i) counterfactual theories do not effectively capture multiple independently\nsufficient causes of the same effect, which leads us to miss certain causes\nentirely; and (ii) counterfactual dependencies in neural networks are generally\nnot transitive, which complicates methods for extracting and interpreting\ncausal graphs from neural networks. We discuss the implications of these\nchallenges for interpretability researchers and propose concrete suggestions\nfor future work.\n","authors":["Aaron Mueller"],"pdf_url":"https://arxiv.org/pdf/2407.04690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04681v1","updated":"2024-07-05T17:43:30Z","published":"2024-07-05T17:43:30Z","title":"Rethinking Visual Prompting for Multimodal Large Language Models with\n External Knowledge","summary":" In recent years, multimodal large language models (MLLMs) have made\nsignificant strides by training on vast high-quality image-text datasets,\nenabling them to generally understand images well. However, the inherent\ndifficulty in explicitly conveying fine-grained or spatially dense information\nin text, such as masks, poses a challenge for MLLMs, limiting their ability to\nanswer questions requiring an understanding of detailed or localized visual\nelements. 
Drawing inspiration from the Retrieval-Augmented Generation (RAG)\nconcept, this paper proposes a new visual prompt approach to integrate\nfine-grained external knowledge, gleaned from specialized vision models (e.g.,\ninstance segmentation/OCR models), into MLLMs. This is a promising yet\nunderexplored direction for enhancing MLLMs' performance. Our approach diverges\nfrom concurrent works, which transform external knowledge into additional text\nprompts, necessitating the model to indirectly learn the correspondence between\nvisual content and text coordinates. Instead, we propose embedding fine-grained\nknowledge information directly into a spatial embedding map as a visual prompt.\nThis design can be effortlessly incorporated into various MLLMs, such as LLaVA\nand Mipha, considerably improving their visual understanding performance.\nThrough rigorous experiments, we demonstrate that our method can enhance MLLM\nperformance across nine benchmarks, amplifying their fine-grained context-aware\ncapabilities.\n","authors":["Yuanze Lin","Yunsheng Li","Dongdong Chen","Weijian Xu","Ronald Clark","Philip Torr","Lu Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.04681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04667v1","updated":"2024-07-05T17:22:12Z","published":"2024-07-05T17:22:12Z","title":"The diameter of a stochastic matrix: A new measure for sensitivity\n analysis in Bayesian networks","summary":" Bayesian networks are one of the most widely used classes of probabilistic\nmodels for risk management and decision support because of their\ninterpretability and flexibility in including heterogeneous pieces of\ninformation. In any applied modelling, it is critical to assess how robust the\ninferences on certain target variables are to changes in the model. In Bayesian\nnetworks, these analyses fall under the umbrella of sensitivity analysis, which\nis most commonly carried out by quantifying dissimilarities using\nKullback-Leibler information measures. In this paper, we argue that robustness\nmethods based instead on the familiar total variation distance provide simple\nand more valuable bounds on robustness to misspecification, which are both\nformally justifiable and transparent. We introduce a novel measure of\ndependence in conditional probability tables called the diameter to derive such\nbounds. This measure quantifies the strength of dependence between a variable\nand its parents. We demonstrate how such formal robustness considerations can\nbe embedded in building a Bayesian network.\n","authors":["Manuele Leonelli","Jim Q. Smith","Sophia K. Wright"],"pdf_url":"https://arxiv.org/pdf/2407.04667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04663v1","updated":"2024-07-05T17:18:46Z","published":"2024-07-05T17:18:46Z","title":"Unsupervised 4D Cardiac Motion Tracking with Spatiotemporal Optical Flow\n Networks","summary":" Cardiac motion tracking from echocardiography can be used to estimate and\nquantify myocardial motion within a cardiac cycle. It is a cost-efficient and\neffective approach for assessing myocardial function. However, ultrasound\nimaging has the inherent characteristics of spatially low resolution and\ntemporally random noise, which leads to difficulties in obtaining reliable\nannotation. Thus it is difficult to perform supervised learning for motion\ntracking. In addition, there is no end-to-end unsupervised method currently in\nthe literature. 
This paper presents a motion tracking method where unsupervised\noptical flow networks are designed with spatial reconstruction loss and\ntemporal-consistency loss. Our proposed loss functions make use of\npair-wise and temporal correlations to estimate cardiac motion from a noisy\nbackground. Experiments using a synthetic 4D echocardiography dataset have shown\nthe effectiveness of our approach, and its superiority over existing methods on\nboth accuracy and running speed. To the best of our knowledge, this is the\nfirst work to use an unsupervised end-to-end deep learning optical\nflow network for 4D cardiac motion tracking.\n","authors":["Long Teng","Wei Feng","Menglong Zhu","Xinchao Li"],"pdf_url":"https://arxiv.org/pdf/2407.04663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04662v1","updated":"2024-07-05T17:18:25Z","published":"2024-07-05T17:18:25Z","title":"Multitaper mel-spectrograms for keyword spotting","summary":" Keyword spotting (KWS) is one of the speech recognition tasks most sensitive\nto the quality of the feature representation. However, the research on KWS has\ntraditionally focused on new model topologies, putting little emphasis on other\naspects like feature extraction. This paper investigates the use of the\nmultitaper technique to create improved features for KWS. The experimental\nstudy is carried out for different test scenarios, windows and parameters,\ndatasets, and neural networks commonly used in embedded KWS applications.\nExperimental results confirm the advantages of using the proposed improved\nfeatures.\n","authors":["Douglas Baptista de Souza","Khaled Jamal Bakri","Fernanda Ferreira","Juliana Inacio"],"pdf_url":"https://arxiv.org/pdf/2407.04662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01864v2","updated":"2024-07-05T17:17:48Z","published":"2024-07-02T00:43:41Z","title":"Research on target detection method of distracted driving behavior based\n on improved YOLOv8","summary":" With the development of deep learning technology, the detection and\nclassification of distracted driving behaviour require higher accuracy.\nExisting deep learning-based methods are computationally intensive and\nparameter-redundant, limiting the efficiency and accuracy in practical\napplications. To solve this problem, this study proposes an improved YOLOv8\ndetection method based on the original YOLOv8 model by integrating the BoTNet\nmodule, GAM attention mechanism and EIoU loss function. By optimising the\nfeature extraction and multi-scale feature fusion strategies, the training and\ninference processes are simplified, and the detection accuracy and efficiency\nare significantly improved. 
Experimental results show that the improved model\nperforms well in both detection speed and accuracy, with an accuracy rate of\n99.4%, and the model is smaller and easy to deploy, which is able to identify\nand classify distracted driving behaviours in real time, provide timely\nwarnings, and enhance driving safety.\n","authors":["Shiquan Shen","Zhizhong Wu","Pan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.01864v2.pdf","comment":"Major revision on content, no replacement available soon"},{"id":"http://arxiv.org/abs/2407.04656v1","updated":"2024-07-05T17:13:41Z","published":"2024-07-05T17:13:41Z","title":"Lazarus: Resilient and Elastic Training of Mixture-of-Experts Models\n with Adaptive Expert Placement","summary":" Sparsely-activated Mixture-of-Experts (MoE) architecture has increasingly\nbeen adopted to further scale large language models (LLMs) due to its\nsub-linear scaling for computation costs. However, frequent failures still pose\nsignificant challenges as training scales. The cost of even a single failure is\nsignificant, as all GPUs need to wait idle until the failure is resolved,\npotentially losing considerable training progress as training has to restart\nfrom checkpoints. Existing solutions for efficient fault-tolerant training\neither lack elasticity or rely on building resiliency into pipeline\nparallelism, which cannot be applied to MoE models due to the expert\nparallelism strategy adopted by the MoE architecture.\n We present Lazarus, a system for resilient and elastic training of MoE\nmodels. Lazarus adaptively allocates expert replicas to address the inherent\nimbalance in expert workload and speeds-up training, while a provably optimal\nexpert placement algorithm is developed to maximize the probability of recovery\nupon failures. Through adaptive expert placement and a flexible token\ndispatcher, Lazarus can also fully utilize all available nodes after failures,\nleaving no GPU idle. Our evaluation shows that Lazarus outperforms existing MoE\ntraining systems by up to 5.7x under frequent node failures and 3.4x on a real\nspot instance trace.\n","authors":["Yongji Wu","Wenjie Qu","Tianyang Tao","Zhuang Wang","Wei Bai","Zhuohao Li","Yuan Tian","Jiaheng Zhang","Matthew Lentz","Danyang Zhuo"],"pdf_url":"https://arxiv.org/pdf/2407.04656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04687v2","updated":"2024-07-05T16:58:15Z","published":"2024-04-06T17:23:43Z","title":"Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion","summary":" Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent\ntechnique in computer vision and graphics for reconstructing 3D scenes. GS\nrepresents a scene as a set of 3D Gaussians with varying opacities and employs\na computationally efficient splatting operation along with analytical\nderivatives to compute the 3D Gaussian parameters given scene images captured\nfrom various viewpoints. Unfortunately, capturing surround view ($360^{\\circ}$\nviewpoint) images is impossible or impractical in many real-world imaging\nscenarios, including underwater imaging, rooms inside a building, and\nautonomous navigation. In these restricted baseline imaging scenarios, the GS\nalgorithm suffers from a well-known 'missing cone' problem, which results in\npoor reconstruction along the depth axis. In this manuscript, we demonstrate\nthat using transient data (from sonars) allows us to address the missing cone\nproblem by sampling high-frequency data along the depth axis. 
We extend the\nGaussian splatting algorithms for two commonly used sonars and propose fusion\nalgorithms that simultaneously utilize RGB camera data and sonar data. Through\nsimulations, emulations, and hardware experiments across various imaging\nscenarios, we show that the proposed fusion algorithms lead to significantly\nbetter novel view synthesis (5 dB improvement in PSNR) and 3D geometry\nreconstruction (60% lower Chamfer distance).\n","authors":["Ziyuan Qu","Omkar Vengurlekar","Mohamad Qadri","Kevin Zhang","Michael Kaess","Christopher Metzler","Suren Jayasuriya","Adithya Pediredla"],"pdf_url":"https://arxiv.org/pdf/2404.04687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14657v2","updated":"2024-07-05T16:51:15Z","published":"2024-06-20T18:22:59Z","title":"OpenDebateEvidence: A Massive-Scale Argument Mining and Summarization\n Dataset","summary":" We introduce OpenDebateEvidence, a comprehensive dataset for argument mining\nand summarization sourced from the American Competitive Debate community. This\ndataset includes over 3.5 million documents with rich metadata, making it one\nof the most extensive collections of debate evidence. OpenDebateEvidence\ncaptures the complexity of arguments in high school and college debates,\nproviding valuable resources for training and evaluation. Our extensive\nexperiments demonstrate the efficacy of fine-tuning state-of-the-art large\nlanguage models for argumentative abstractive summarization across various\nmethods, models, and datasets. By providing this comprehensive resource, we aim\nto advance computational argumentation and support practical applications for\ndebaters, educators, and researchers. OpenDebateEvidence is publicly available\nto support further research and innovation in computational argumentation.\nAccess it here: https://huggingface.co/datasets/Yusuf5/OpenCaselist\n","authors":["Allen Roush","Yusuf Shabazz","Arvind Balaji","Peter Zhang","Stefano Mezza","Markus Zhang","Sanjay Basu","Sriram Vishwanath","Mehdi Fatemi","Ravid Shwartz-Ziv"],"pdf_url":"https://arxiv.org/pdf/2406.14657v2.pdf","comment":"Accepted for Publication to ARGMIN 2024 at ACL2024"},{"id":"http://arxiv.org/abs/2407.04631v1","updated":"2024-07-05T16:41:49Z","published":"2024-07-05T16:41:49Z","title":"An autoencoder for compressing angle-resolved photoemission spectroscopy\n data","summary":" Angle-resolved photoemission spectroscopy (ARPES) is a powerful experimental\ntechnique to determine the electronic structure of solids. Advances in light\nsources for ARPES experiments are currently leading to a vast increase of data\nacquisition rates and data quantity. On the other hand, access time to the most\nadvanced ARPES instruments remains strictly limited, calling for fast,\neffective, and on-the-fly data analysis tools to exploit this time. In response\nto this need, we introduce ARPESNet, a versatile autoencoder network that\nefficiently summarises and compresses ARPES datasets. We train ARPESNet on a\nlarge and varied dataset of 2-dimensional ARPES data extracted by cutting\nstandard 3-dimensional ARPES datasets along random directions in $\\mathbf{k}$.\nTo test the data representation capacity of ARPESNet, we compare $k$-means\nclustering quality between data compressed by ARPESNet, data compressed by\ndiscrete cosine transform, and raw data, at different noise levels. 
ARPESNet\ndata excels in clustering quality despite its high compression ratio.\n","authors":["Steinn Ymir Agustsson","Mohammad Ahsanul Haque","Thi Tam Truong","Marco Bianchi","Nikita Klyuchnikov","Davide Mottin","Panagiotis Karras","Philip Hofmann"],"pdf_url":"https://arxiv.org/pdf/2407.04631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04622v1","updated":"2024-07-05T16:29:15Z","published":"2024-07-05T16:29:15Z","title":"On scalable oversight with weak LLMs judging strong LLMs","summary":" Scalable oversight protocols aim to enable humans to accurately supervise\nsuperhuman AI. In this paper we study debate, where two AI's compete to\nconvince a judge; consultancy, where a single AI tries to convince a judge that\nasks questions; and compare to a baseline of direct question-answering, where\nthe judge just answers outright without the AI. We use large language models\n(LLMs) as both AI agents and as stand-ins for human judges, taking the judge\nmodels to be weaker than agent models. We benchmark on a diverse range of\nasymmetries between judges and agents, extending previous work on a single\nextractive QA task with information asymmetry, to also include mathematics,\ncoding, logic and multimodal reasoning asymmetries. We find that debate\noutperforms consultancy across all tasks when the consultant is randomly\nassigned to argue for the correct/incorrect answer. Comparing debate to direct\nquestion answering, the results depend on the type of task: in extractive QA\ntasks with information asymmetry debate outperforms direct question answering,\nbut in other tasks without information asymmetry the results are mixed.\nPrevious work assigned debaters/consultants an answer to argue for. When we\nallow them to instead choose which answer to argue for, we find judges are less\nfrequently convinced by the wrong answer in debate than in consultancy.\nFurther, we find that stronger debater models increase judge accuracy, though\nmore modestly than in previous studies.\n","authors":["Zachary Kenton","Noah Y. Siegel","János Kramár","Jonah Brown-Cohen","Samuel Albanie","Jannis Bulian","Rishabh Agarwal","David Lindner","Yunhao Tang","Noah D. Goodman","Rohin Shah"],"pdf_url":"https://arxiv.org/pdf/2407.04622v1.pdf","comment":"15 pages (53 including appendices)"},{"id":"http://arxiv.org/abs/2403.06725v3","updated":"2024-07-05T16:24:29Z","published":"2024-03-11T13:44:43Z","title":"Improving Low-Resource Knowledge Tracing Tasks by Supervised\n Pre-training and Importance Mechanism Fine-tuning","summary":" Knowledge tracing (KT) aims to estimate student's knowledge mastery based on\ntheir historical interactions. Recently, the deep learning based KT (DLKT)\napproaches have achieved impressive performance in the KT task. These DLKT\nmodels heavily rely on the large number of available student interactions.\nHowever, due to various reasons such as budget constraints and privacy\nconcerns, observed interactions are very limited in many real-world scenarios,\na.k.a, low-resource KT datasets. Directly training a DLKT model on a\nlow-resource KT dataset may lead to overfitting and it is difficult to choose\nthe appropriate deep neural architecture. Therefore, in this paper, we propose\na low-resource KT framework called LoReKT to address above challenges. 
Inspired\nby the prevalent \"pre-training and fine-tuning\" paradigm, we aim to learn\ntransferable parameters and representations from rich-resource KT datasets\nduring the pre-training stage and subsequently facilitate effective adaptation\nto low-resource KT datasets. Specifically, we simplify existing sophisticated\nDLKT model architectures with purely a stack of transformer decoders. We design\nan encoding mechanism to incorporate student interactions from multiple KT data\nsources and develop an importance mechanism to prioritize updating parameters\nwith high importance while constraining less important ones during the\nfine-tuning stage. We evaluate LoReKT on six public KT datasets and\nexperimental results demonstrate the superiority of our approach in terms of\nAUC and Accuracy. To encourage reproducible research, we make our data and code\npublicly available at https://anonymous.4open.science/r/LoReKT-C619.\n","authors":["Hengyuan Zhang","Zitao Liu","Shuyan Huang","Chenming Shang","Bojun Zhan","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.06725v3.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.04620v1","updated":"2024-07-05T16:23:20Z","published":"2024-07-05T16:23:20Z","title":"Learning to (Learn at Test Time): RNNs with Expressive Hidden States","summary":" Self-attention performs well in long context but has quadratic complexity.\nExisting RNN layers have linear complexity, but their performance in long\ncontext is limited by the expressive power of their hidden state. We propose a\nnew class of sequence modeling layers with linear complexity and an expressive\nhidden state. The key idea is to make the hidden state a machine learning model\nitself, and the update rule a step of self-supervised learning. Since the\nhidden state is updated by training even on test sequences, our layers are\ncalled Test-Time Training (TTT) layers. We consider two instantiations:\nTTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer\nMLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B\nparameters, comparing with a strong Transformer and Mamba, a modern RNN. Both\nTTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer,\nthey can keep reducing perplexity by conditioning on more tokens, while Mamba\ncannot after 16k context. With preliminary systems optimization, TTT-Linear is\nalready faster than Transformer at 8k context and matches Mamba in wall-clock\ntime. TTT-MLP still faces challenges in memory I/O, but shows larger potential\nin long context, pointing to a promising direction for future research.\n","authors":["Yu Sun","Xinhao Li","Karan Dalal","Jiarui Xu","Arjun Vikram","Genghan Zhang","Yann Dubois","Xinlei Chen","Xiaolong Wang","Sanmi Koyejo","Tatsunori Hashimoto","Carlos Guestrin"],"pdf_url":"https://arxiv.org/pdf/2407.04620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04617v1","updated":"2024-07-05T16:16:47Z","published":"2024-07-05T16:16:47Z","title":"Randomized Physics-Informed Neural Networks for Bayesian Data\n Assimilation","summary":" We propose a randomized physics-informed neural network (PINN) or rPINN\nmethod for uncertainty quantification in inverse partial differential equation\n(PDE) problems with noisy data. This method is used to quantify uncertainty in\nthe inverse PDE PINN solutions. 
Recently, the Bayesian PINN (BPINN) method was\nproposed, where the posterior distribution of the PINN parameters was\nformulated using Bayes' theorem and sampled using approximate inference\nmethods such as the Hamiltonian Monte Carlo (HMC) and variational inference\n(VI) methods. In this work, we demonstrate that HMC fails to converge for\nnon-linear inverse PDE problems. As an alternative to HMC, we sample the\ndistribution by solving the stochastic optimization problem obtained by\nrandomizing the PINN loss function. The effectiveness of the rPINN method is\ntested for linear and non-linear Poisson equations, and the diffusion equation\nwith a high-dimensional space-dependent diffusion coefficient. The rPINN method\nprovides informative distributions for all considered problems. For the linear\nPoisson equation, HMC and rPINN produce similar distributions, but rPINN is on\naverage 27 times faster than HMC. For the non-linear Poisson and diffusion\nequations, the HMC method fails to converge because a single HMC chain cannot\nsample multiple modes of the posterior distribution of the PINN parameters in a\nreasonable amount of time.\n","authors":["Yifei Zong","David Barajas-Solano","Alexandre M. Tartakovsky"],"pdf_url":"https://arxiv.org/pdf/2407.04617v1.pdf","comment":"38 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00666v2","updated":"2024-07-05T16:15:53Z","published":"2024-03-31T12:21:57Z","title":"Accelerated Parameter-Free Stochastic Optimization","summary":" We propose a method that achieves near-optimal rates for smooth stochastic\nconvex optimization and requires essentially no prior knowledge of problem\nparameters. This improves on prior work which requires knowing at least the\ninitial distance to optimality d0. Our method, U-DoG, combines UniXGrad (Kavis\net al., 2019) and DoG (Ivgi et al., 2023) with novel iterate stabilization\ntechniques. It requires only loose bounds on d0 and the noise magnitude,\nprovides high probability guarantees under sub-Gaussian noise, and is also\nnear-optimal in the non-smooth case. Our experiments show consistent, strong\nperformance on convex problems and mixed results on neural network training.\n","authors":["Itai Kreisler","Maor Ivgi","Oliver Hinder","Yair Carmon"],"pdf_url":"https://arxiv.org/pdf/2404.00666v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04616v1","updated":"2024-07-05T16:14:53Z","published":"2024-07-05T16:14:53Z","title":"Isomorphic Pruning for Vision Models","summary":" Structured pruning reduces the computational overhead of deep neural networks\nby removing redundant sub-structures. However, assessing the relative\nimportance of different sub-structures remains a significant challenge,\nparticularly in advanced vision models featuring novel mechanisms and\narchitectures like self-attention, depth-wise convolutions, or residual\nconnections. These heterogeneous substructures usually exhibit diverged\nparameter scales, weight distributions, and computational topology, introducing\nconsiderable difficulty to importance comparison. To overcome this, we present\nIsomorphic Pruning, a simple approach that demonstrates effectiveness across a\nrange of network architectures such as Vision Transformers and CNNs, and\ndelivers competitive performance across different model sizes. 
Isomorphic\nPruning originates from an observation that, when evaluated under a pre-defined\nimportance criterion, heterogeneous sub-structures demonstrate significant\ndivergence in their importance distribution, as opposed to isomorphic\nstructures that present similar importance patterns. This inspires us to\nperform isolated ranking and comparison on different types of sub-structures\nfor more reliable pruning. Our empirical results on ImageNet-1K demonstrate\nthat Isomorphic Pruning surpasses several pruning baselines dedicatedly\ndesigned for Transformers or CNNs. For instance, we improve the accuracy of\nDeiT-Tiny from 74.52% to 77.50% by pruning an off-the-shelf DeiT-Base model.\nAnd for ConvNext-Tiny, we enhanced performance from 82.06% to 82.18%, while\nreducing the number of parameters and memory usage. Code is available at\n\\url{https://github.com/VainF/Isomorphic-Pruning}.\n","authors":["Gongfan Fang","Xinyin Ma","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07322v3","updated":"2024-07-05T16:11:43Z","published":"2024-03-12T05:15:42Z","title":"A Question-centric Multi-experts Contrastive Learning Framework for\n Improving the Accuracy and Interpretability of Deep Sequential Knowledge\n Tracing Models","summary":" Knowledge tracing (KT) plays a crucial role in predicting students' future\nperformance by analyzing their historical learning processes. Deep neural\nnetworks (DNNs) have shown great potential in solving the KT problem. However,\nthere still exist some important challenges when applying deep learning\ntechniques to model the KT process. The first challenge lies in taking the\nindividual information of the question into modeling. This is crucial because,\ndespite questions sharing the same knowledge component (KC), students'\nknowledge acquisition on homogeneous questions can vary significantly. The\nsecond challenge lies in interpreting the prediction results from existing deep\nlearning-based KT models. In real-world applications, while it may not be\nnecessary to have complete transparency and interpretability of the model\nparameters, it is crucial to present the model's prediction results in a manner\nthat teachers find interpretable. This makes teachers accept the rationale\nbehind the prediction results and utilize them to design teaching activities\nand tailored learning strategies for students. However, the inherent black-box\nnature of deep learning techniques often poses a hurdle for teachers to fully\nembrace the model's prediction results. To address these challenges, we propose\na Question-centric Multi-experts Contrastive Learning framework for KT called\nQ-MCKT. We have provided all the datasets and code on our website at\nhttps://github.com/rattlesnakey/Q-MCKT.\n","authors":["Hengyuan Zhang","Zitao Liu","Chenming Shang","Dawei Li","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.07322v3.pdf","comment":"25 pages, 9 figures, Accepted by TKDD"},{"id":"http://arxiv.org/abs/2208.06263v3","updated":"2024-07-05T16:05:41Z","published":"2022-08-10T13:18:00Z","title":"Probabilistic Rank and Reward: A Scalable Model for Slate Recommendation","summary":" We introduce Probabilistic Rank and Reward (PRR), a scalable probabilistic\nmodel for personalized slate recommendation. Our approach allows off-policy\nestimation of the reward in the scenario where the user interacts with at most\none item from a slate of K items. 
We show that the probability of a slate being\nsuccessful can be learned efficiently by combining the reward, whether the user\nsuccessfully interacted with the slate, and the rank, the item that was\nselected within the slate. PRR outperforms existing off-policy reward\noptimizing methods and is far more scalable to large action spaces. Moreover,\nPRR allows fast delivery of recommendations powered by maximum inner product\nsearch (MIPS), making it suitable in low latency domains such as computational\nadvertising.\n","authors":["Imad Aouali","Achraf Ait Sidi Hammou","Otmane Sakhi","David Rohde","Flavian Vasile"],"pdf_url":"https://arxiv.org/pdf/2208.06263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02989v2","updated":"2024-07-05T16:04:15Z","published":"2024-02-05T13:27:41Z","title":"DexDiffuser: Generating Dexterous Grasps with Diffusion Models","summary":" We introduce DexDiffuser, a novel dexterous grasping method that generates,\nevaluates, and refines grasps on partial object point clouds. DexDiffuser\nincludes the conditional diffusion-based grasp sampler DexSampler and the\ndexterous grasp evaluator DexEvaluator. DexSampler generates high-quality\ngrasps conditioned on object point clouds by iterative denoising of randomly\nsampled grasps. We also introduce two grasp refinement strategies:\nEvaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR).\nThe experiment results demonstrate that DexDiffuser consistently outperforms\nthe state-of-the-art multi-finger grasp generation method FFHNet with an, on\naverage, 9.12% and 19.44% higher grasp success rate in simulation and real\nrobot experiments, respectively. Supplementary materials are available at\nhttps://yulihn.github.io/DexDiffuser_page/\n","authors":["Zehang Weng","Haofei Lu","Danica Kragic","Jens Lundell"],"pdf_url":"https://arxiv.org/pdf/2402.02989v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2407.04605v1","updated":"2024-07-05T15:53:16Z","published":"2024-07-05T15:53:16Z","title":"Linear causal disentanglement via higher-order cumulants","summary":" Linear causal disentanglement is a recent method in causal representation\nlearning to describe a collection of observed variables via latent variables\nwith causal dependencies between them. It can be viewed as a generalization of\nboth independent component analysis and linear structural equation models. We\nstudy the identifiability of linear causal disentanglement, assuming access to\ndata under multiple contexts, each given by an intervention on a latent\nvariable. We show that one perfect intervention on each latent variable is\nsufficient and in the worst case necessary to recover parameters under perfect\ninterventions, generalizing previous work to allow more latent than observed\nvariables. We give a constructive proof that computes parameters via a coupled\ntensor decomposition. For soft interventions, we find the equivalence class of\nlatent graphs and parameters that are consistent with observed data, via the\nstudy of a system of polynomial equations. 
Our results hold assuming the\nexistence of non-zero higher-order cumulants, which implies non-Gaussianity of\nvariables.\n","authors":["Paula Leyes Carreno","Chiara Meroni","Anna Seigal"],"pdf_url":"https://arxiv.org/pdf/2407.04605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11325v2","updated":"2024-07-05T15:51:16Z","published":"2024-06-17T08:38:29Z","title":"Deep-Learning-Based Channel Estimation for Distributed MIMO with 1-bit\n Radio-Over-Fiber Fronthaul","summary":" We consider the problem of pilot-aided, uplink channel estimation in a\ndistributed massive multiple-input multiple-output (MIMO) architecture, in\nwhich the access points are connected to a central processing unit via\nfiber-optical fronthaul links, carrying a two-level-quantized version of the\nreceived analog radio-frequency signal. We adapt to this architecture the\ndeep-learning-based channel-estimation algorithm recently proposed by Nguyen et\nal. (2023), and explore its robustness to the additional signal distortions\n(beyond 1-bit quantization) introduced in the considered architecture by the\nautomatic gain controllers (AGCs) and by the comparators. These components are\nused at the access points to generate the two-level analog waveform from the\nreceived signal. Via simulation results, we illustrate that the proposed\nchannel-estimation method outperforms significantly the Bussgang linear minimum\nmean-square error channel estimator, and it is robust against the additional\nimpairments introduced by the AGCs and the comparators.\n","authors":["Alireza Bordbar","Lise Aabel","Christian Häger","Christian Fager","Giuseppe Durisi"],"pdf_url":"https://arxiv.org/pdf/2406.11325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04600v1","updated":"2024-07-05T15:48:34Z","published":"2024-07-05T15:48:34Z","title":"Understanding the Gains from Repeated Self-Distillation","summary":" Self-Distillation is a special type of knowledge distillation where the\nstudent model has the same architecture as the teacher model. Despite using the\nsame architecture and the same training data, self-distillation has been\nempirically observed to improve performance, especially when applied\nrepeatedly. For such a process, there is a fundamental question of interest:\nHow much gain is possible by applying multiple steps of self-distillation? To\ninvestigate this relative gain, we propose studying the simple but canonical\ntask of linear regression. Our analysis shows that the excess risk achieved by\nmulti-step self-distillation can significantly improve upon a single step of\nself-distillation, reducing the excess risk by a factor as large as $d$, where\n$d$ is the input dimension. Empirical results on regression tasks from the UCI\nrepository show a reduction in the learnt model's risk (MSE) by up to 47%.\n","authors":["Divyansh Pareek","Simon S. Du","Sewoong Oh"],"pdf_url":"https://arxiv.org/pdf/2407.04600v1.pdf","comment":"31 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.02011v3","updated":"2024-07-05T15:43:02Z","published":"2023-04-04T17:59:09Z","title":"FakET: Simulating Cryo-Electron Tomograms with Neural Style Transfer","summary":" In cryo-electron microscopy, accurate particle localization and\nclassification are imperative. Recent deep learning solutions, though\nsuccessful, require extensive training data sets. The protracted generation\ntime of physics-based models, often employed to produce these data sets, limits\ntheir broad applicability. 
We introduce FakET, a method based on Neural Style\nTransfer, capable of simulating the forward operator of any cryo transmission\nelectron microscope. It can be used to adapt a synthetic training data set\naccording to reference data producing high-quality simulated micrographs or\ntilt-series. To assess the quality of our generated data, we used it to train a\nstate-of-the-art localization and classification architecture and compared its\nperformance with a counterpart trained on benchmark data. Remarkably, our\ntechnique matches the performance, boosts data generation speed 750 times, uses\n33 times less memory, and scales well to typical transmission electron\nmicroscope detector sizes. It leverages GPU acceleration and parallel\nprocessing. The source code is available at https://github.com/paloha/faket.\n","authors":["Pavol Harar","Lukas Herrmann","Philipp Grohs","David Haselbach"],"pdf_url":"https://arxiv.org/pdf/2304.02011v3.pdf","comment":"25 pages, 3 tables, 19 figures including supplement. Added Key\n findings section, CPU-profiling appendix, and Supplementary information"},{"id":"http://arxiv.org/abs/2403.13658v3","updated":"2024-07-05T15:42:25Z","published":"2024-03-20T15:06:49Z","title":"Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics\n Instability Detection","summary":" Recent advancements in non-invasive detection of cardiac hemodynamic\ninstability (CHDI) primarily focus on applying machine learning techniques to a\nsingle data modality, e.g. cardiac magnetic resonance imaging (MRI). Despite\ntheir potential, these approaches often fall short especially when the size of\nlabeled patient data is limited, a common challenge in the medical domain.\nFurthermore, only a few studies have explored multimodal methods to study CHDI,\nwhich mostly rely on costly modalities such as cardiac MRI and echocardiogram.\nIn response to these limitations, we propose a novel multimodal variational\nautoencoder ($\\text{CardioVAE}_\\text{X,G}$) to integrate low-cost chest X-ray\n(CXR) and electrocardiogram (ECG) modalities with pre-training on a large\nunlabeled dataset. Specifically, $\\text{CardioVAE}_\\text{X,G}$ introduces a\nnovel tri-stream pre-training strategy to learn both shared and\nmodality-specific features, thus enabling fine-tuning with both unimodal and\nmultimodal datasets. We pre-train $\\text{CardioVAE}_\\text{X,G}$ on a large,\nunlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then\nfine-tune the pre-trained model on a labeled dataset of $795$ subjects from the\nASPIRE registry. Comprehensive evaluations against existing methods show that\n$\\text{CardioVAE}_\\text{X,G}$ offers promising performance (AUROC $=0.79$ and\nAccuracy $=0.77$), representing a significant step forward in non-invasive\nprediction of CHDI. Our model also excels in producing fine interpretations of\npredictions directly associated with clinical features, thereby supporting\nclinical decision-making.\n","authors":["Mohammod N. I. Suvon","Prasun C. Tripathi","Wenrui Fan","Shuo Zhou","Xianyuan Liu","Samer Alabed","Venet Osmani","Andrew J. Swift","Chen Chen","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2403.13658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04591v1","updated":"2024-07-05T15:40:15Z","published":"2024-07-05T15:40:15Z","title":"Proximal Point Method for Online Saddle Point Problem","summary":" This paper focuses on the online saddle point problem, which involves a\nsequence of two-player time-varying convex-concave games. 
Considering the\nnonstationarity of the environment, we adopt the duality gap and the dynamic\nNash equilibrium regret as performance metrics for algorithm design. We present\nthree variants of the proximal point method: the Online Proximal Point\nMethod~(OPPM), the Optimistic OPPM~(OptOPPM), and the OptOPPM with multiple\npredictors. Each algorithm guarantees upper bounds for both the duality gap and\ndynamic Nash equilibrium regret, achieving near-optimality when measured\nagainst the duality gap. Specifically, in certain benign environments, such as\nsequences of stationary payoff functions, these algorithms maintain a nearly\nconstant metric bound. Experimental results further validate the effectiveness\nof these algorithms. Lastly, this paper discusses potential reliability\nconcerns associated with using dynamic Nash equilibrium regret as a performance\nmetric.\n","authors":["Qing-xin Meng","Jian-wei Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13213v4","updated":"2024-07-05T15:40:13Z","published":"2024-03-20T00:22:38Z","title":"From Representational Harms to Quality-of-Service Harms: A Case Study on\n Llama 2 Safety Safeguards","summary":" Recent progress in large language models (LLMs) has led to their widespread\nadoption in various domains. However, these advancements have also introduced\nadditional safety risks and raised concerns regarding their detrimental impact\non already marginalized populations. Despite growing mitigation efforts to\ndevelop safety safeguards, such as supervised safety-oriented fine-tuning and\nleveraging safe reinforcement learning from human feedback, multiple concerns\nregarding the safety and ingrained biases in these models remain. Furthermore,\nprevious work has demonstrated that models optimized for safety often display\nexaggerated safety behaviors, such as a tendency to refrain from responding to\ncertain requests as a precautionary measure. As such, a clear trade-off between\nthe helpfulness and safety of these models has been documented in the\nliterature. In this paper, we further investigate the effectiveness of safety\nmeasures by evaluating models on already mitigated biases. Using the case of\nLlama 2 as an example, we illustrate how LLMs' safety responses can still\nencode harmful assumptions. To do so, we create a set of non-toxic prompts,\nwhich we then use to evaluate Llama models. Through our new taxonomy of LLMs\nresponses to users, we observe that the safety/helpfulness trade-offs are more\npronounced for certain demographic groups which can lead to quality-of-service\nharms for marginalized populations.\n","authors":["Khaoula Chehbouni","Megha Roshan","Emmanuel Ma","Futian Andrew Wei","Afaf Taik","Jackie CK Cheung","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2403.13213v4.pdf","comment":"9 pages, 4 figures. Accepted to Findings of the Association for\n Computational Linguistics: ACL 2024"},{"id":"http://arxiv.org/abs/2407.04589v1","updated":"2024-07-05T15:38:36Z","published":"2024-07-05T15:38:36Z","title":"Remembering Everything Makes You Vulnerable: A Limelight on Machine\n Unlearning for Personalized Healthcare Sector","summary":" As the prevalence of data-driven technologies in healthcare continues to\nrise, concerns regarding data privacy and security become increasingly\nparamount. 
This thesis aims to address the vulnerability of personalized\nhealthcare models, particularly in the context of ECG monitoring, to\nadversarial attacks that compromise patient privacy. We propose an approach\ntermed \"Machine Unlearning\" to mitigate the impact of exposed data points on\nmachine learning models, thereby enhancing model robustness against adversarial\nattacks while preserving individual privacy. Specifically, we investigate the\nefficacy of Machine Unlearning in the context of personalized ECG monitoring,\nutilizing a dataset of clinical ECG recordings. Our methodology involves\ntraining a deep neural classifier on ECG data and fine-tuning the model for\nindividual patients. We demonstrate the susceptibility of fine-tuned models to\nadversarial attacks, such as the Fast Gradient Sign Method (FGSM), which can\nexploit additional data points in personalized models. To address this\nvulnerability, we propose a Machine Unlearning algorithm that selectively\nremoves sensitive data points from fine-tuned models, effectively enhancing\nmodel resilience against adversarial manipulation. Experimental results\ndemonstrate the effectiveness of our approach in mitigating the impact of\nadversarial attacks while maintaining the pre-trained model accuracy.\n","authors":["Ahan Chatterjee","Sai Anirudh Aryasomayajula","Rajat Chaudhari","Subhajit Paul","Vishwa Mohan Singh"],"pdf_url":"https://arxiv.org/pdf/2407.04589v1.pdf","comment":"15 Pages, Exploring unlearning techniques on ECG Classifier"},{"id":"http://arxiv.org/abs/2406.02584v2","updated":"2024-07-05T15:37:15Z","published":"2024-05-30T20:48:10Z","title":"Planetary Causal Inference: Implications for the Geography of Poverty","summary":" Earth observation data such as satellite imagery can, when combined with\nmachine learning, can have far-reaching impacts on our understanding of the\ngeography of poverty through the prediction of living conditions, especially\nwhere government-derived economic indicators are either unavailable or\npotentially untrustworthy. Recent work has progressed in using Earth\nObservation (EO) data not only to predict spatial economic outcomes but also to\nexplore cause and effect, an understanding which is critical for downstream\npolicy analysis. In this review, we first document the growth of interest in\nusing satellite images together with EO data in causal analysis. We then trace\nthe relationship between spatial statistics and machine learning methods before\ndiscussing four ways in which EO data has been used in causal machine learning\npipelines -- (1.) poverty outcome imputation for downstream causal analysis,\n(2.) EO image deconfounding, (3.) EO-based treatment effect heterogeneity, and\n(4.) EO-based transportability analysis. We conclude by providing a\nstep-by-step workflow for how researchers can incorporate EO data in causal ML\nanalysis going forward, outlining major choices of data, models, and evaluation\nmetrics.\n","authors":["Kazuki Sakamoto","Connor T. 
Jerzak","Adel Daoud"],"pdf_url":"https://arxiv.org/pdf/2406.02584v2.pdf","comment":"For a full list of the papers found in the quantitative literature\n search, see https://github.com/AIandGlobalDevelopmentLab/eo-poverty-review"},{"id":"http://arxiv.org/abs/2407.04587v1","updated":"2024-07-05T15:32:07Z","published":"2024-07-05T15:32:07Z","title":"Multimodal Classification via Modal-Aware Interactive Enhancement","summary":" Due to the notorious modality imbalance problem, multimodal learning (MML)\nleads to the phenomenon of optimization imbalance, thus struggling to achieve\nsatisfactory performance. Recently, some representative methods have been\nproposed to boost the performance, mainly focusing on adaptive adjusting the\noptimization of each modality to rebalance the learning speed of dominant and\nnon-dominant modalities. To better facilitate the interaction of model\ninformation in multimodal learning, in this paper, we propose a novel\nmultimodal learning method, called modal-aware interactive enhancement (MIE).\nSpecifically, we first utilize an optimization strategy based on sharpness\naware minimization (SAM) to smooth the learning objective during the forward\nphase. Then, with the help of the geometry property of SAM, we propose a\ngradient modification strategy to impose the influence between different\nmodalities during the backward phase. Therefore, we can improve the\ngeneralization ability and alleviate the modality forgetting phenomenon\nsimultaneously for multimodal learning. Extensive experiments on widely used\ndatasets demonstrate that our proposed method can outperform various\nstate-of-the-art baselines to achieve the best performance.\n","authors":["Qing-Yuan Jiang","Zhouyang Chi","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2407.04587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06681v4","updated":"2024-07-05T15:30:45Z","published":"2023-12-09T04:40:46Z","title":"Steering Llama 2 via Contrastive Activation Addition","summary":" We introduce Contrastive Activation Addition (CAA), an innovative method for\nsteering language models by modifying their activations during forward passes.\nCAA computes \"steering vectors\" by averaging the difference in residual stream\nactivations between pairs of positive and negative examples of a particular\nbehavior, such as factual versus hallucinatory responses. During inference,\nthese steering vectors are added at all token positions after the user's prompt\nwith either a positive or negative coefficient, allowing precise control over\nthe degree of the targeted behavior. We evaluate CAA's effectiveness on Llama 2\nChat using multiple-choice behavioral question datasets and open-ended\ngeneration tasks. We demonstrate that CAA significantly alters model behavior,\nis effective over and on top of traditional methods like finetuning and system\nprompt design, and minimally reduces capabilities. Moreover, we gain deeper\ninsights into CAA's mechanisms by employing various activation space\ninterpretation methods. 
CAA accurately steers model outputs and sheds light on\nhow high-level concepts are represented in Large Language Models (LLMs).\n","authors":["Nina Panickssery","Nick Gabrieli","Julian Schulz","Meg Tong","Evan Hubinger","Alexander Matt Turner"],"pdf_url":"https://arxiv.org/pdf/2312.06681v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04343v2","updated":"2024-07-05T15:27:01Z","published":"2023-04-10T01:12:09Z","title":"Certifiable Black-Box Attacks with Randomized Adversarial Examples:\n Breaking Defenses with Provable Confidence","summary":" Black-box adversarial attacks have shown strong potential to subvert machine\nlearning models. Existing black-box attacks craft adversarial examples by\niteratively querying the target model and/or leveraging the transferability of\na local surrogate model. Recently, such attacks can be effectively mitigated by\nstate-of-the-art (SOTA) defenses, e.g., detection via the pattern of sequential\nqueries, or injecting noise into the model. To our best knowledge, we take the\nfirst step to study a new paradigm of black-box attacks with provable\nguarantees -- certifiable black-box attacks that can guarantee the attack\nsuccess probability (ASP) of adversarial examples before querying over the\ntarget model. This new black-box attack unveils significant vulnerabilities of\nmachine learning models, compared to traditional empirical black-box attacks,\ne.g., breaking strong SOTA defenses with provable confidence, constructing a\nspace of (infinite) adversarial examples with high ASP, and the ASP of the\ngenerated adversarial examples is theoretically guaranteed without\nverification/queries over the target model. Specifically, we establish a novel\ntheoretical foundation for ensuring the ASP of the black-box attack with\nrandomized adversarial examples (AEs). Then, we propose several novel\ntechniques to craft the randomized AEs while reducing the perturbation size for\nbetter imperceptibility. Finally, we have comprehensively evaluated the\ncertifiable black-box attacks on the CIFAR10/100, ImageNet, and LibriSpeech\ndatasets, while benchmarking with 16 SOTA empirical black-box attacks, against\nvarious SOTA defenses in the domains of computer vision and speech recognition.\nBoth theoretical and experimental results have validated the significance of\nthe proposed attack.\n","authors":["Hanbin Hong","Xinyu Zhang","Binghui Wang","Zhongjie Ba","Yuan Hong"],"pdf_url":"https://arxiv.org/pdf/2304.04343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18119v2","updated":"2024-07-05T15:23:58Z","published":"2024-05-28T12:28:12Z","title":"Low-Resource Crop Classification from Multi-Spectral Time Series Using\n Lossless Compressors","summary":" Deep learning has significantly improved the accuracy of crop classification\nusing multispectral temporal data. However, these models have complex\nstructures with numerous parameters, requiring large amounts of data and costly\ntraining. In low-resource situations with fewer labeled samples, deep learning\nmodels perform poorly due to insufficient data. Conversely, compressors are\ndata-type agnostic, and non-parametric methods do not bring underlying\nassumptions. Inspired by this insight, we propose a non-training alternative to\ndeep learning models, aiming to address these situations. Specifically, the\nSymbolic Representation Module is proposed to convert the reflectivity into\nsymbolic representations. 
The symbolic representations are then\ncross-transformed in both the channel and time dimensions to generate symbolic\nembeddings. Next, the Multi-scale Normalised Compression Distance (MNCD) is\ndesigned to measure the correlation between any two symbolic embeddings.\nFinally, based on the MNCDs, high quality crop classification can be achieved\nusing only a k-nearest-neighbor classifier kNN. The entire framework is\nready-to-use and lightweight. Without any training, it outperformed, on\naverage, 7 advanced deep learning models trained at scale on three benchmark\ndatasets. It also outperforms more than half of these models in the few-shot\nsetting with sparse crop labels. Therefore, the high performance and robustness\nof our non-training framework makes it truly applicable to real-world crop\nmapping. Codes are available at:\nhttps://github.com/qinfengsama/Compressor-Based-Crop-Mapping.\n","authors":["Wei Cheng","Hongrui Ye","Xiao Wen","Jiachen Zhang","Jiping Xu","Feifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.18119v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.04581v1","updated":"2024-07-05T15:23:43Z","published":"2024-07-05T15:23:43Z","title":"Leveraging Large Language Models for Integrated\n Satellite-Aerial-Terrestrial Networks: Recent Advances and Future Directions","summary":" Integrated satellite, aerial, and terrestrial networks (ISATNs) represent a\nsophisticated convergence of diverse communication technologies to ensure\nseamless connectivity across different altitudes and platforms. This paper\nexplores the transformative potential of integrating Large Language Models\n(LLMs) into ISATNs, leveraging advanced Artificial Intelligence (AI) and\nMachine Learning (ML) capabilities to enhance these networks. We outline the\ncurrent architecture of ISATNs and highlight the significant role LLMs can play\nin optimizing data flow, signal processing, and network management to advance\n5G/6G communication technologies through advanced predictive algorithms and\nreal-time decision-making. A comprehensive analysis of ISATN components is\nconducted, assessing how LLMs can effectively address traditional data\ntransmission and processing bottlenecks. The paper delves into the network\nmanagement challenges within ISATNs, emphasizing the necessity for\nsophisticated resource allocation strategies, traffic routing, and security\nmanagement to ensure seamless connectivity and optimal performance under\nvarying conditions. Furthermore, we examine the technical challenges and\nlimitations associated with integrating LLMs into ISATNs, such as data\nintegration for LLM processing, scalability issues, latency in decision-making\nprocesses, and the design of robust, fault-tolerant systems. The study also\nidentifies key future research directions for fully harnessing LLM capabilities\nin ISATNs, which is crucial for enhancing network reliability, optimizing\nperformance, and achieving a truly interconnected and intelligent global\nnetwork system.\n","authors":["Shumaila Javaid","Ruhul Amin Khalil","Nasir Saeed","Bin He","Mohamed-Slim Alouini"],"pdf_url":"https://arxiv.org/pdf/2407.04581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04579v1","updated":"2024-07-05T15:16:25Z","published":"2024-07-05T15:16:25Z","title":"GOALPlace: Begin with the End in Mind","summary":" Co-optimizing placement with congestion is integral to achieving high-quality\ndesigns. 
This paper presents GOALPlace, a new learning-based general approach\nto improving placement congestion by controlling cell density. Our method\nefficiently learns from an EDA tool's post-route optimized results and uses an\nempirical Bayes technique to adapt this goal/target to a specific placer's\nsolutions, effectively beginning with the end in mind. It enhances correlation\nwith the long-running heuristics of the tool's router and timing-opt engine --\nwhile solving placement globally without expensive incremental congestion\nestimation and mitigation methods. A statistical analysis with a new\nhierarchical netlist clustering establishes the importance of density and the\npotential for an adequate cell density target across placements. Our\nexperiments show that our method, integrated as a demonstration inside an\nacademic GPU-accelerated global placer, consistently produces macro and\nstandard cell placements of superior or comparable quality to commercial tools.\nOur empirical Bayes methodology also allows a substantial quality improvement\nover state-of-the-art academic mixed-size placers, achieving up to 10x fewer\ndesign rule check (DRC) violations, a 5% decrease in wirelength, and a 30% and\n60% reduction in worst and total negative slack (WNS/TNS).\n","authors":["Anthony Agnesina","Rongjian Liang","Geraldo Pradipta","Anand Rajaram","Haoxing Ren"],"pdf_url":"https://arxiv.org/pdf/2407.04579v1.pdf","comment":"10 pages, 7 figures, preprint"},{"id":"http://arxiv.org/abs/2402.07355v4","updated":"2024-07-05T15:15:36Z","published":"2024-02-12T01:04:39Z","title":"Sampling from the Mean-Field Stationary Distribution","summary":" We study the complexity of sampling from the stationary distribution of a\nmean-field SDE, or equivalently, the complexity of minimizing a functional over\nthe space of probability measures which includes an interaction term. Our main\ninsight is to decouple the two key aspects of this problem: (1) approximation\nof the mean-field SDE via a finite-particle system, via uniform-in-time\npropagation of chaos, and (2) sampling from the finite-particle stationary\ndistribution, via standard log-concave samplers. Our approach is conceptually\nsimpler and its flexibility allows for incorporating the state-of-the-art for\nboth algorithms and theory. This leads to improved guarantees in numerous\nsettings, including better guarantees for optimizing certain two-layer neural\nnetworks in the mean-field regime. A key technical contribution is to establish\na new uniform-in-$N$ log-Sobolev inequality for the stationary distribution of\nthe mean-field Langevin dynamics.\n","authors":["Yunbum Kook","Matthew S. Zhang","Sinho Chewi","Murat A. Erdogdu","Mufan Bill Li"],"pdf_url":"https://arxiv.org/pdf/2402.07355v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09673v3","updated":"2024-07-05T14:51:55Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) generates new images by combining the style of\none image with the content of another. However, unauthorized NST can exploit\nartwork, raising concerns about artists' rights and motivating the development\nof proactive protection methods. We propose Locally Adaptive Adversarial Color\nAttack (LAACA), empowering artists to protect their artwork from unauthorized\nstyle transfer by processing before public release. 
By delving into the\nintricacies of human visual perception and the role of different frequency\ncomponents, our method strategically introduces frequency-adaptive\nperturbations in the image. These perturbations significantly degrade the\ngeneration quality of NST while maintaining an acceptable level of visual\nchange in the original image, ensuring that potential infringers are\ndiscouraged from using the protected artworks, because of its bad NST\ngeneration quality. Additionally, existing metrics often overlook the\nimportance of color fidelity in evaluating color-mattered tasks, such as the\nquality of NST-generated images, which is crucial in the context of artistic\nworks. To comprehensively assess the color-mattered tasks, we propose the\nAdversarial Color Distance Metric (ACDM), designed to quantify the color\ndifference of images pre- and post-manipulations. Experimental results confirm\nthat attacking NST using LAACA results in visually inferior style transfer, and\nthe ACDM can efficiently measure color-mattered tasks. By providing artists\nwith a tool to safeguard their intellectual property, our work relieves the\nsocio-technical challenges posed by the misuse of NST in the art community.\n","authors":["Zhongliang Guo","Junhao Dong","Yifei Qian","Kaixuan Wang","Weiye Li","Ziheng Guo","Yuheng Wang","Yanli Li","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v3.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2407.04559v1","updated":"2024-07-05T14:48:15Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04557v1","updated":"2024-07-05T14:42:54Z","published":"2024-07-05T14:42:54Z","title":"Structural Constraint Integration in Generative Model for Discovery of\n Quantum Material Candidates","summary":" Billions of organic molecules are known, but only a tiny fraction of the\nfunctional inorganic materials have been discovered, a particularly relevant\nproblem to the community searching for new quantum materials. 
Recent\nadvancements in machine-learning-based generative models, particularly\ndiffusion models, show great promise for generating new, stable materials.\nHowever, integrating geometric patterns into materials generation remains a\nchallenge. Here, we introduce Structural Constraint Integration in the\nGENerative model (SCIGEN). Our approach can modify any trained generative\ndiffusion model by strategic masking of the denoised structure with a diffused\nconstrained structure prior to each diffusion step to steer the generation\ntoward constrained outputs. Furthermore, we mathematically prove that SCIGEN\neffectively performs conditional sampling from the original distribution, which\nis crucial for generating stable constrained materials. We generate eight\nmillion compounds using Archimedean lattices as prototype constraints, with\nover 10% surviving a multi-staged stability pre-screening. High-throughput\ndensity functional theory (DFT) on 26,000 survived compounds shows that over\n50% passed structural optimization at the DFT level. Since the properties of\nquantum materials are closely related to geometric patterns, our results\nindicate that SCIGEN provides a general framework for generating quantum\nmaterials candidates.\n","authors":["Ryotaro Okabe","Mouyang Cheng","Abhijatmedhi Chotrattanapituk","Nguyen Tuan Hung","Xiang Fu","Bowen Han","Yao Wang","Weiwei Xie","Robert J. Cava","Tommi S. Jaakkola","Yongqiang Cheng","Mingda Li"],"pdf_url":"https://arxiv.org/pdf/2407.04557v1.pdf","comment":"512 pages total, 4 main figures + 218 supplementary figures"},{"id":"http://arxiv.org/abs/2407.04551v1","updated":"2024-07-05T14:36:19Z","published":"2024-07-05T14:36:19Z","title":"An AI Architecture with the Capability to Classify and Explain Hardware\n Trojans","summary":" Hardware trojan detection methods, based on machine learning (ML) techniques,\nmainly identify suspected circuits but lack the ability to explain how the\ndecision was arrived at. An explainable methodology and architecture is\nintroduced based on the existing hardware trojan detection features. Results\nare provided for explaining digital hardware trojans within a netlist using\ntrust-hub trojan benchmarks.\n","authors":["Paul Whitten","Francis Wolff","Chris Papachristou"],"pdf_url":"https://arxiv.org/pdf/2407.04551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04547v1","updated":"2024-07-05T14:32:52Z","published":"2024-07-05T14:32:52Z","title":"Real-time Timbre Remapping with Differentiable DSP","summary":" Timbre is a primary mode of expression in diverse musical contexts. However,\nprevalent audio-driven synthesis methods predominantly rely on pitch and\nloudness envelopes, effectively flattening timbral expression from the input.\nOur approach draws on the concept of timbre analogies and investigates how\ntimbral expression from an input signal can be mapped onto controls for a\nsynthesizer. Leveraging differentiable digital signal processing, our method\nfacilitates direct optimization of synthesizer parameters through a novel\nfeature difference loss. This loss function, designed to learn relative timbral\ndifferences between musical events, prioritizes the subtleties of graded timbre\nmodulations within phrases, allowing for meaningful translations in a timbre\nspace. 
Using snare drum performances as a case study, where timbral expression\nis central, we demonstrate real-time timbre remapping from acoustic snare drums\nto a differentiable synthesizer modeled after the Roland TR-808.\n","authors":["Jordie Shier","Charalampos Saitis","Andrew Robertson","Andrew McPherson"],"pdf_url":"https://arxiv.org/pdf/2407.04547v1.pdf","comment":"Accepted for publication at the 24th International Conference on New\n Interfaces for Musical Expression in Utrecht, Netherlands"},{"id":"http://arxiv.org/abs/2407.04542v1","updated":"2024-07-05T14:29:12Z","published":"2024-07-05T14:29:12Z","title":"Rethinking Image Compression on the Web with Generative AI","summary":" The rapid growth of the Internet, driven by social media, web browsing, and\nvideo streaming, has made images central to the Web experience, resulting in\nsignificant data transfer and increased webpage sizes. Traditional image\ncompression methods, while reducing bandwidth, often degrade image quality.\nThis paper explores a novel approach using generative AI to reconstruct images\nat the edge or client-side. We develop a framework that leverages text prompts\nand provides additional conditioning inputs like Canny edges and color palettes\nto a text-to-image model, achieving up to 99.8% bandwidth savings in the best\ncases and 92.6% on average, while maintaining high perceptual similarity.\nEmpirical analysis and a user study show that our method preserves image\nmeaning and structure more effectively than traditional compression methods,\noffering a promising solution for reducing bandwidth usage and improving\nInternet affordability with minimal degradation in image quality.\n","authors":["Shayan Ali Hassan","Danish Humair","Ihsan Ayyub Qazi","Zafar Ayyub Qazi"],"pdf_url":"https://arxiv.org/pdf/2407.04542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06112v2","updated":"2024-07-05T14:28:33Z","published":"2023-11-10T15:27:07Z","title":"Turbulence Scaling from Deep Learning Diffusion Generative Models","summary":" Complex spatial and temporal structures are inherent characteristics of\nturbulent fluid flows and comprehending them poses a major challenge. This\ncomprehesion necessitates an understanding of the space of turbulent fluid flow\nconfigurations. We employ a diffusion-based generative model to learn the\ndistribution of turbulent vorticity profiles and generate snapshots of\nturbulent solutions to the incompressible Navier-Stokes equations. We consider\nthe inverse cascade in two spatial dimensions and generate diverse turbulent\nsolutions that differ from those in the training dataset. We analyze the\nstatistical scaling properties of the new turbulent profiles, calculate their\nstructure functions, energy power spectrum, velocity probability distribution\nfunction and moments of local energy dissipation. All the learnt scaling\nexponents are consistent with the expected Kolmogorov scaling. This agreement\nwith established turbulence characteristics provides strong evidence of the\nmodel's capability to capture essential features of real-world turbulence.\n","authors":["Tim Whittaker","Romuald A. Janik","Yaron Oz"],"pdf_url":"https://arxiv.org/pdf/2311.06112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04541v1","updated":"2024-07-05T14:28:12Z","published":"2024-07-05T14:28:12Z","title":"PoPreRo: A New Dataset for Popularity Prediction of Romanian Reddit\n Posts","summary":" We introduce PoPreRo, the first dataset for Popularity Prediction of Romanian\nposts collected from Reddit. 
The PoPreRo dataset includes a varied compilation\nof post samples from five distinct subreddits of Romania, totaling 28,107 data\nsamples. Along with our novel dataset, we introduce a set of competitive models\nto be used as baselines for future research. Interestingly, the top-scoring\nmodel achieves an accuracy of 61.35% and a macro F1 score of 60.60% on the test\nset, indicating that the popularity prediction task on PoPreRo is very\nchallenging. Further investigations based on few-shot prompting the Falcon-7B\nLarge Language Model also point in the same direction. We thus believe that\nPoPreRo is a valuable resource that can be used to evaluate models on\npredicting the popularity of social media posts in Romanian. We release our\ndataset at https://github.com/ana-rogoz/PoPreRo.\n","authors":["Ana-Cristina Rogoz","Maria Ilinca Nechita","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2407.04541v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2305.17400v3","updated":"2024-07-05T14:26:21Z","published":"2023-05-27T07:55:17Z","title":"Query-Policy Misalignment in Preference-Based Reinforcement Learning","summary":" Preference-based reinforcement learning (PbRL) provides a natural way to\nalign RL agents' behavior with human desired outcomes, but is often restrained\nby costly human feedback. To improve feedback efficiency, most existing PbRL\nmethods focus on selecting queries to maximally improve the overall quality of\nthe reward model, but counter-intuitively, we find that this may not\nnecessarily lead to improved performance. To unravel this mystery, we identify\na long-neglected issue in the query selection schemes of existing PbRL studies:\nQuery-Policy Misalignment. We show that the seemingly informative queries\nselected to improve the overall quality of reward model actually may not align\nwith RL agents' interests, thus offering little help on policy learning and\neventually resulting in poor feedback efficiency. We show that this issue can\nbe effectively addressed via near on-policy query and a specially designed\nhybrid experience replay, which together enforce the bidirectional query-policy\nalignment. Simple yet elegant, our method can be easily incorporated into\nexisting approaches by changing only a few lines of code. We showcase in\ncomprehensive experiments that our method achieves substantial gains in both\nhuman feedback and RL sample efficiency, demonstrating the importance of\naddressing query-policy misalignment in PbRL tasks.\n","authors":["Xiao Hu","Jianxiong Li","Xianyuan Zhan","Qing-Shan Jia","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17400v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2407.04540v1","updated":"2024-07-05T14:25:22Z","published":"2024-07-05T14:25:22Z","title":"Improved algorithms for learning quantum Hamiltonians, via flat\n polynomials","summary":" We give an improved algorithm for learning a quantum Hamiltonian given copies\nof its Gibbs state, that can succeed at any temperature. Specifically, we\nimprove over the work of Bakshi, Liu, Moitra, and Tang [BLMT24], by reducing\nthe sample complexity and runtime dependence to singly exponential in the\ninverse-temperature parameter, as opposed to doubly exponential. 
Our main\ntechnical contribution is a new flat polynomial approximation to the\nexponential function, with significantly lower degree than the flat polynomial\napproximation used in [BLMT24].\n","authors":["Shyam Narayanan"],"pdf_url":"https://arxiv.org/pdf/2407.04540v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2407.04538v1","updated":"2024-07-05T14:24:37Z","published":"2024-07-05T14:24:37Z","title":"PDiscoFormer: Relaxing Part Discovery Constraints with Vision\n Transformers","summary":" Computer vision methods that explicitly detect object parts and reason on\nthem are a step towards inherently interpretable models. Existing approaches\nthat perform part discovery driven by a fine-grained classification task make\nvery restrictive assumptions on the geometric properties of the discovered\nparts; they should be small and compact. Although this prior is useful in some\ncases, in this paper we show that pre-trained transformer-based vision models,\nsuch as self-supervised DINOv2 ViT, enable the relaxation of these constraints.\nIn particular, we find that a total variation (TV) prior, which allows for\nmultiple connected components of any size, substantially outperforms previous\nwork. We test our approach on three fine-grained classification benchmarks:\nCUB, PartImageNet and Oxford Flowers, and compare our results to previously\npublished methods as well as a re-implementation of the state-of-the-art method\nPDiscoNet with a transformer-based backbone. We consistently obtain substantial\nimprovements across the board, both on part discovery metrics and the\ndownstream classification task, showing that the strong inductive biases in\nself-supervised ViT models require to rethink the geometric priors that can be\nused for unsupervised part discovery.\n","authors":["Ananthu Aniraj","Cassio F. Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2407.04538v1.pdf","comment":"Accepted as a main conference paper at the European Conference of\n Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2407.04534v1","updated":"2024-07-05T14:22:13Z","published":"2024-07-05T14:22:13Z","title":"Introducing 'Inside' Out of Distribution","summary":" Detecting and understanding out-of-distribution (OOD) samples is crucial in\nmachine learning (ML) to ensure reliable model performance. Current OOD\nstudies, in general, and in the context of ML, in particular, primarily focus\non extrapolatory OOD (outside), neglecting potential cases of interpolatory OOD\n(inside). This study introduces a novel perspective on OOD by suggesting OOD\ncan be divided into inside and outside cases. In addition, following this\nframework, we examine the inside-outside OOD profiles of datasets and their\nimpact on ML model performance. Our analysis shows that different\ninside-outside OOD profiles lead to nuanced declines in ML model performance,\nhighlighting the importance of distinguishing between these two cases for\ndeveloping effective counter-OOD methods.\n","authors":["Teddy Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2407.04534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16371v4","updated":"2024-07-05T14:18:18Z","published":"2022-10-28T19:14:03Z","title":"Distributed Black-box Attack: Do Not Overestimate Black-box Attacks","summary":" Black-box adversarial attacks can fool image classifiers into misclassifying\nimages without requiring access to model structure and weights. 
Recent studies\nhave reported attack success rates of over 95% with less than 1,000 queries.\nThe question then arises of whether black-box attacks have become a real threat\nagainst IoT devices that rely on cloud APIs to achieve image classification. To\nshed some light on this, note that prior research has primarily focused on\nincreasing the success rate and reducing the number of queries. However,\nanother crucial factor for black-box attacks against cloud APIs is the time\nrequired to perform the attack. This paper applies black-box attacks directly\nto cloud APIs rather than to local models, thereby avoiding mistakes made in\nprior research that applied the perturbation before image encoding and\npre-processing. Further, we exploit load balancing to enable distributed\nblack-box attacks that can reduce the attack time by a factor of about five for\nboth local search and gradient estimation methods.\n","authors":["Han Wu","Sareh Rowlands","Johan Wahlstrom"],"pdf_url":"https://arxiv.org/pdf/2210.16371v4.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.04528v1","updated":"2024-07-05T14:16:47Z","published":"2024-07-05T14:16:47Z","title":"GPT vs RETRO: Exploring the Intersection of Retrieval and\n Parameter-Efficient Fine-Tuning","summary":" Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation\n(RAG) have become popular methods for adapting large language models while\nminimizing compute requirements. In this paper, we apply PEFT methods\n(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer\n(RETRO) and a baseline GPT model across several sizes, ranging from 823 million\nto 48 billion parameters. We show that RETRO models outperform GPT models in\nzero-shot settings due to their unique pre-training process but GPT models have\nhigher performance potential with PEFT. Additionally, our study indicates that\n8B parameter models strike an optimal balance between cost and performance and\nP-tuning lags behind other PEFT techniques. We further provide a comparative\nanalysis of between applying PEFT to an Instruction-tuned RETRO model and base\nRETRO model. This work presents the first comprehensive comparison of various\nPEFT methods integrated with RAG, applied to both GPT and RETRO models,\nhighlighting their relative performance.\n","authors":["Aleksander Ficek","Jiaqi Zeng","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2407.04528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04525v1","updated":"2024-07-05T14:11:28Z","published":"2024-07-05T14:11:28Z","title":"Enhancing learning in artificial neural networks through cellular\n heterogeneity and neuromodulatory signaling","summary":" Recent progress in artificial intelligence (AI) has been driven by insights\nfrom neuroscience, particularly with the development of artificial neural\nnetworks (ANNs). This has significantly enhanced the replication of complex\ncognitive tasks such as vision and natural language processing. Despite these\nadvances, ANNs struggle with continual learning, adaptable knowledge transfer,\nrobustness, and resource efficiency - capabilities that biological systems\nhandle seamlessly. Specifically, ANNs often overlook the functional and\nmorphological diversity of the brain, hindering their computational\ncapabilities. 
Furthermore, incorporating cell-type specific neuromodulatory\neffects into ANNs with neuronal heterogeneity could enable learning at two\nspatial scales: spiking behavior at the neuronal level, and synaptic plasticity\nat the circuit level, thereby potentially enhancing their learning abilities.\nIn this article, we summarize recent bio-inspired models, learning rules and\narchitectures and propose a biologically-informed framework for enhancing ANNs.\nOur proposed dual-framework approach highlights the potential of spiking neural\nnetworks (SNNs) for emulating diverse spiking behaviors and dendritic\ncompartments to simulate morphological and functional diversity of neuronal\ncomputations. Finally, we outline how the proposed approach integrates\nbrain-inspired compartmental models and task-driven SNNs, balances\nbioinspiration and complexity, and provides scalable solutions for pressing AI\nchallenges, such as continual learning, adaptability, robustness, and\nresource-efficiency.\n","authors":["Alejandro Rodriguez-Garcia","Jie Mei","Srikanth Ramaswamy"],"pdf_url":"https://arxiv.org/pdf/2407.04525v1.pdf","comment":"34 pages, 4 figures, 3 boxes"},{"id":"http://arxiv.org/abs/2407.04522v1","updated":"2024-07-05T14:07:15Z","published":"2024-07-05T14:07:15Z","title":"Graph Reinforcement Learning in Power Grids: A Survey","summary":" The challenges posed by renewable energy and distributed electricity\ngeneration motivate the development of deep learning approaches to overcome the\nlack of flexibility of traditional methods in power grids use cases. The\napplication of GNNs is particularly promising due to their ability to learn\nfrom graph-structured data present in power grids. Combined with RL, they can\nserve as control approaches to determine remedial grid actions. This review\nanalyses the ability of GRL to capture the inherent graph structure of power\ngrids to improve representation learning and decision making in different power\ngrid use cases. It distinguishes between common problems in transmission and\ndistribution grids and explores the synergy between RL and GNNs. In\ntransmission grids, GRL typically addresses automated grid management and\ntopology control, whereas on the distribution side, GRL concentrates more on\nvoltage regulation. We analyzed the selected papers based on their graph\nstructure and GNN model, the applied RL algorithm, and their overall\ncontributions. Although GRL demonstrate adaptability in the face of\nunpredictable events and noisy or incomplete data, it primarily serves as a\nproof of concept at this stage. There are multiple open challenges and\nlimitations that need to be addressed when considering the application of RL to\nreal power grid operation.\n","authors":["Mohamed Hassouna","Clara Holzhüter","Pawel Lytaev","Josephine Thomas","Bernhard Sick","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2407.04522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04521v1","updated":"2024-07-05T14:06:59Z","published":"2024-07-05T14:06:59Z","title":"Unified continuous-time q-learning for mean-field game and mean-field\n control problems","summary":" This paper studies the continuous-time q-learning in the mean-field\njump-diffusion models from the representative agent's perspective. 
To overcome\nthe challenge when the population distribution may not be directly observable,\nwe introduce the integrated q-function in decoupled form (decoupled\nIq-function) and establish its martingale characterization together with the\nvalue function, which provides a unified policy evaluation rule for both\nmean-field game (MFG) and mean-field control (MFC) problems. Moreover,\ndepending on the task to solve the MFG or MFC problem, we can employ the\ndecoupled Iq-function by different means to learn the mean-field equilibrium\npolicy or the mean-field optimal policy respectively. As a result, we devise a\nunified q-learning algorithm for both MFG and MFC problems by utilizing all\ntest policies stemming from the mean-field interactions. For several examples\nin the jump-diffusion setting, within and beyond the LQ framework, we can\nobtain the exact parameterization of the decoupled Iq-functions and the value\nfunctions, and illustrate our algorithm from the representative agent's\nperspective with satisfactory performance.\n","authors":["Xiaoli Wei","Xiang Yu","Fengyi Yuan"],"pdf_url":"https://arxiv.org/pdf/2407.04521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05962v2","updated":"2024-07-05T14:01:11Z","published":"2024-05-09T17:58:25Z","title":"Age Aware Scheduling for Differentially-Private Federated Learning","summary":" This paper explores differentially-private federated learning (FL) across\ntime-varying databases, delving into a nuanced three-way tradeoff involving\nage, accuracy, and differential privacy (DP). Emphasizing the potential\nadvantages of scheduling, we propose an optimization problem aimed at meeting\nDP requirements while minimizing the loss difference between the aggregated\nmodel and the model obtained without DP constraints. To harness the benefits of\nscheduling, we introduce an age-dependent upper bound on the loss, leading to\nthe development of an age-aware scheduling design. Simulation results\nunderscore the superior performance of our proposed scheme compared to FL with\nclassic DP, which does not consider scheduling as a design factor. This\nresearch contributes insights into the interplay of age, accuracy, and DP in\nfederated learning, with practical implications for scheduling strategies.\n","authors":["Kuan-Yu Lin","Hsuan-Yin Lin","Yu-Pin Hsu","Yu-Chih Huang"],"pdf_url":"https://arxiv.org/pdf/2405.05962v2.pdf","comment":"Simulation parameters updated. Paper accepted for presentation at the\n 2024 IEEE International Symposium on Information Theory (ISIT 2024)"},{"id":"http://arxiv.org/abs/2407.04516v1","updated":"2024-07-05T13:57:35Z","published":"2024-07-05T13:57:35Z","title":"G-Adaptive mesh refinement -- leveraging graph neural networks and\n differentiable finite element solvers","summary":" We present a novel, and effective, approach to the long-standing problem of\nmesh adaptivity in finite element methods (FEM). FE solvers are powerful tools\nfor solving partial differential equations (PDEs), but their cost and accuracy\nare critically dependent on the choice of mesh points. To keep computational\ncosts low, mesh relocation (r-adaptivity) seeks to optimise the position of a\nfixed number of mesh points to obtain the best FE solution accuracy. Classical\napproaches to this problem require the solution of a separate nonlinear\n\"meshing\" PDE to find the mesh point locations. This incurs significant cost at\nremeshing and relies on certain a-priori assumptions and guiding heuristics for\noptimal mesh point location. 
Recent machine learning approaches to r-adaptivity\nhave mainly focused on the construction of fast surrogates for such classical\nmethods. Our new approach combines a graph neural network (GNN) powered\narchitecture, with training based on direct minimisation of the FE solution\nerror with respect to the mesh point locations. The GNN employs graph neural\ndiffusion (GRAND), closely aligning the mesh solution space to that of\nclassical meshing methodologies, thus replacing heuristics with a learnable\nstrategy, and providing a strong inductive bias. This allows for rapid and\nrobust training and results in an extremely efficient and effective GNN\napproach to online r-adaptivity. This method outperforms classical and prior ML\napproaches to r-adaptive meshing on the test problems we consider, in\nparticular achieving lower FE solution error, whilst retaining the significant\nspeed-up over classical methods observed in prior ML work.\n","authors":["James Rowbottom","Georg Maierhofer","Teo Deveney","Katharina Schratz","Pietro Liò","Carola-Bibiane Schönlieb","Chris Budd"],"pdf_url":"https://arxiv.org/pdf/2407.04516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11461v2","updated":"2024-07-05T13:57:34Z","published":"2023-01-26T23:15:51Z","title":"Learning to Generate All Feasible Actions","summary":" Modern cyber-physical systems are becoming increasingly complex to model,\nthus motivating data-driven techniques such as reinforcement learning (RL) to\nfind appropriate control agents. However, most systems are subject to hard\nconstraints such as safety or operational bounds. Typically, to learn to\nsatisfy these constraints, the agent must violate them systematically, which is\ncomputationally prohibitive in most systems. Recent efforts aim to utilize\nfeasibility models that assess whether a proposed action is feasible to avoid\napplying the agent's infeasible action proposals to the system. However, these\nefforts focus on guaranteeing constraint satisfaction rather than the agent's\nlearning efficiency. To improve the learning process, we introduce action\nmapping, a novel approach that divides the learning process into two steps:\nfirst learn feasibility and subsequently, the objective by mapping actions into\nthe sets of feasible actions. This paper focuses on the feasibility part by\nlearning to generate all feasible actions through self-supervised querying of\nthe feasibility model. We train the agent by formulating the problem as a\ndistribution matching problem and deriving gradient estimators for different\ndivergences. Through an illustrative example, a robotic path planning scenario,\nand a robotic grasping simulation, we demonstrate the agent's proficiency in\ngenerating actions across disconnected feasible action sets. By addressing the\nfeasibility step, this paper makes it possible to focus future work on the\nobjective part of action mapping, paving the way for an RL framework that is\nboth safe and efficient.\n","authors":["Mirco Theile","Daniele Bernardini","Raphael Trumpp","Cristina Piazza","Marco Caccamo","Alberto L. 
Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2301.11461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04513v1","updated":"2024-07-05T13:54:15Z","published":"2024-07-05T13:54:15Z","title":"LayerShuffle: Enhancing Robustness in Vision Transformers by Randomizing\n Layer Execution Order","summary":" Due to their architecture and how they are trained, artificial neural\nnetworks are typically not robust toward pruning, replacing, or shuffling\nlayers at test time. However, such properties would be desirable for different\napplications, such as distributed neural network architectures where the order\nof execution cannot be guaranteed or parts of the network can fail during\ninference. In this work, we address these issues through a number of proposed\ntraining approaches for vision transformers whose most important component is\nrandomizing the execution order of attention modules at training time. We show\nthat with our proposed approaches, vision transformers are indeed capable to\nadapt to arbitrary layer execution orders at test time assuming one tolerates a\nreduction (about 20\\%) in accuracy at the same model size. We also find that\nour trained models can be randomly merged with each other resulting in\nfunctional (\"Frankenstein\") models without loss of performance compared to the\nsource models. Finally, we layer-prune our models at test time and find that\ntheir performance declines gracefully.\n","authors":["Matthias Freiberger","Peter Kun","Anders Sundnes Løvlie","Sebastian Risi"],"pdf_url":"https://arxiv.org/pdf/2407.04513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03229v2","updated":"2024-07-05T13:51:29Z","published":"2023-09-04T15:13:56Z","title":"Which algorithm to select in sports timetabling?","summary":" Any sports competition needs a timetable, specifying when and where teams\nmeet each other. The recent International Timetabling Competition (ITC2021) on\nsports timetabling showed that, although it is possible to develop general\nalgorithms, the performance of each algorithm varies considerably over the\nproblem instances. This paper provides an instance space analysis for sports\ntimetabling, resulting in powerful insights into the strengths and weaknesses\nof eight state-of-the-art algorithms. Based on machine learning techniques, we\npropose an algorithm selection system that predicts which algorithm is likely\nto perform best when given the characteristics of a sports timetabling problem\ninstance. Furthermore, we identify which characteristics are important in\nmaking that prediction, providing insights in the performance of the\nalgorithms, and suggestions to further improve them. Finally, we assess the\nempirical hardness of the instances. Our results are based on large\ncomputational experiments involving about 50 years of CPU time on more than 500\nnewly generated problem instances.\n","authors":["David Van Bulck","Dries Goossens","Jan-Patrick Clarner","Angelos Dimitsas","George H. G. Fonseca","Carlos Lamas-Fernandez","Martin Mariusz Lester","Jaap Pedersen","Antony E. Phillips","Roberto Maria Rosati"],"pdf_url":"https://arxiv.org/pdf/2309.03229v2.pdf","comment":"This is the peer-reviewed author-version of\n https://doi.org/10.1016/j.ejor.2024.06.005, published in the European Journal\n of Operational Research. Copyright 2024. 
This manuscript version is made\n available under the CC-BY-NC-ND 4.0 license\n (https://creativecommons.org/licenses/by-nc-nd/4.0/)"},{"id":"http://arxiv.org/abs/2407.04507v1","updated":"2024-07-05T13:46:11Z","published":"2024-07-05T13:46:11Z","title":"Few-Shot Airway-Tree Modeling using Data-Driven Sparse Priors","summary":" The lack of large annotated datasets in medical imaging is an intrinsic\nburden for supervised Deep Learning (DL) segmentation models. Few-shot learning\napproaches are cost-effective solutions to transfer pre-trained models using\nonly limited annotated data. However, such methods can be prone to overfitting\ndue to limited data diversity, especially when segmenting complex, diverse, and\nsparse tubular structures like airways. Furthermore, crafting informative image\nrepresentations has played a crucial role in medical imaging, enabling\ndiscriminative enhancement of anatomical details. In this paper, we initially\ntrain a data-driven sparsification module to enhance airways efficiently in\nlung CT scans. We then incorporate these sparse representations in a standard\nsupervised segmentation pipeline as a pretraining step to enhance the\nperformance of the DL models. Results presented on the ATM public challenge\ncohort show the effectiveness of using sparse priors in pre-training, leading\nto a segmentation Dice score increase of 1% to 10% in full-scale and few-shot\nlearning scenarios, respectively.\n","authors":["Ali Keshavarzi","Elsa Angelini"],"pdf_url":"https://arxiv.org/pdf/2407.04507v1.pdf","comment":"Accepted at 21st IEEE International Symposium on Biomedical Imaging\n (ISBI)"},{"id":"http://arxiv.org/abs/2406.14380v3","updated":"2024-07-05T13:40:48Z","published":"2024-06-20T14:53:26Z","title":"Estimating Treatment Effects under Recommender Interference: A\n Structured Neural Networks Approach","summary":" Recommender systems are essential for content-sharing platforms by curating\npersonalized content. To evaluate updates to recommender systems targeting\ncontent creators, platforms frequently rely on creator-side randomized\nexperiments. The treatment effect measures the change in outcomes when a new\nalgorithm is implemented compared to the status quo. We show that the standard\ndifference-in-means estimator can lead to biased estimates due to recommender\ninterference that arises when treated and control creators compete for\nexposure. We propose a \"recommender choice model\" that describes which item\ngets exposed from a pool containing both treated and control items. By\ncombining a structural choice model with neural networks, this framework\ndirectly models the interference pathway while accounting for rich\nviewer-content heterogeneity. We construct a debiased estimator of the\ntreatment effect and prove it is $\\sqrt n$-consistent and asymptotically normal\nwith potentially correlated samples. We validate our estimator's empirical\nperformance with a field experiment on the Weixin short-video platform. In addition\nto the standard creator-side experiment, we conduct a costly double-sided\nrandomization design to obtain a benchmark estimate free from interference\nbias. 
We show that the proposed estimator yields results comparable to the\nbenchmark, whereas the standard difference-in-means estimator can exhibit\nsignificant bias and even produce reversed signs.\n","authors":["Ruohan Zhan","Shichao Han","Yuchen Hu","Zhenling Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.14380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09633v9","updated":"2024-07-05T13:35:54Z","published":"2023-06-16T05:32:24Z","title":"The False Dawn: Reevaluating Google's Reinforcement Learning for Chip\n Macro Placement","summary":" Reinforcement learning (RL) for physical design of silicon chips in a Google\n2021 Nature paper stirred controversy due to poorly documented claims that\nraised eyebrows and drew critical media coverage. The paper withheld critical\nmethodology steps and most inputs needed to reproduce results. Our\nmeta-analysis shows how two separate evaluations filled in the gaps and\ndemonstrated that Google RL lags behind (i) human designers, (ii) a well-known\nalgorithm (Simulated Annealing), and (iii) generally-available commercial\nsoftware, while being slower; and in a 2023 open research contest, RL methods\nweren't in the top 5. Crosschecked data indicate that the integrity of the Nature\npaper is substantially undermined owing to errors in conduct, analysis and\nreporting. Before publishing, Google rebuffed internal allegations of fraud,\nwhich still stand. We note policy implications and conclusions for chip design.\n","authors":["Igor L. Markov"],"pdf_url":"https://arxiv.org/pdf/2306.09633v9.pdf","comment":"15 pages, 1 figure, 4 tables, 83 references"},{"id":"http://arxiv.org/abs/2407.04495v1","updated":"2024-07-05T13:35:14Z","published":"2024-07-05T13:35:14Z","title":"Speed-accuracy trade-off for the diffusion models: Wisdom from\n nonequilibrium thermodynamics and optimal transport","summary":" We discuss a connection between a generative model, called the diffusion\nmodel, and nonequilibrium thermodynamics for the Fokker-Planck equation, called\nstochastic thermodynamics. Based on the techniques of stochastic\nthermodynamics, we derive the speed-accuracy trade-off for the diffusion\nmodels, which is a trade-off relationship between the speed and accuracy of\ndata generation in diffusion models. Our result implies that the entropy\nproduction rate in the forward process affects the errors in data generation.\nFrom a stochastic thermodynamic perspective, our results provide quantitative\ninsight into how best to generate data in diffusion models. The optimal\nlearning protocol is introduced by the conservative force in stochastic\nthermodynamics and the geodesic of space by the 2-Wasserstein distance in\noptimal transport theory. We numerically illustrate the validity of the\nspeed-accuracy trade-off for the diffusion models with different noise\nschedules such as the cosine schedule, the conditional optimal transport, and\nthe optimal transport.\n","authors":["Kotaro Ikeda","Tomoya Uda","Daisuke Okanohara","Sosuke Ito"],"pdf_url":"https://arxiv.org/pdf/2407.04495v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.04493v1","updated":"2024-07-05T13:32:06Z","published":"2024-07-05T13:32:06Z","title":"PROUD: PaRetO-gUided Diffusion Model for Multi-objective Generation","summary":" Recent advancements in the realm of deep generative models focus on\ngenerating samples that satisfy multiple desired properties. However, prevalent\napproaches optimize these property functions independently, thus omitting the\ntrade-offs among them. 
In addition, the property optimization is often\nimproperly integrated into the generative models, resulting in an unnecessary\ncompromise on generation quality (i.e., the quality of generated samples). To\naddress these issues, we formulate a constrained optimization problem. It seeks\nto optimize generation quality while ensuring that generated samples reside at\nthe Pareto front of multiple property objectives. Such a formulation enables\nthe generation of samples that cannot be further improved simultaneously on the\nconflicting property functions and preserves good quality of generated samples.\nBuilding upon this formulation, we introduce the PaRetO-gUided Diffusion model\n(PROUD), wherein the gradients in the denoising process are dynamically\nadjusted to enhance generation quality while the generated samples adhere to\nPareto optimality. Experimental evaluations on image generation and protein\ngeneration tasks demonstrate that our PROUD consistently maintains superior\ngeneration quality while approaching Pareto optimality across multiple property\nfunctions compared to various baselines.\n","authors":["Yinghua Yao","Yuangang Pan","Jing Li","Ivor Tsang","Xin Yao"],"pdf_url":"https://arxiv.org/pdf/2407.04493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04491v1","updated":"2024-07-05T13:29:30Z","published":"2024-07-05T13:29:30Z","title":"Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular\n Data","summary":" For classification and regression on tabular data, the dominance of\ngradient-boosted decision trees (GBDTs) has recently been challenged by often\nmuch slower deep learning methods with extensive hyperparameter tuning. We\naddress this discrepancy by introducing (a) RealMLP, an improved multilayer\nperceptron (MLP), and (b) improved default parameters for GBDTs and RealMLP. We\ntune RealMLP and the default parameters on a meta-train benchmark with 71\nclassification and 47 regression datasets and compare them to\nhyperparameter-optimized versions on a disjoint meta-test benchmark with 48\nclassification and 42 regression datasets, as well as the GBDT-friendly\nbenchmark by Grinsztajn et al. (2022). Our benchmark results show that RealMLP\noffers a better time-accuracy tradeoff than other neural nets and is\ncompetitive with GBDTs. Moreover, a combination of RealMLP and GBDTs with\nimproved default parameters can achieve excellent results on medium-sized\ntabular datasets (1K--500K samples) without hyperparameter tuning.\n","authors":["David Holzmüller","Léo Grinsztajn","Ingo Steinwart"],"pdf_url":"https://arxiv.org/pdf/2407.04491v1.pdf","comment":"10 pages + 44 pages appendix. Code is available at\n github.com/dholzmueller/pytabkit and\n github.com/LeoGrin/tabular-benchmark/tree/better_by_default"},{"id":"http://arxiv.org/abs/2407.04485v1","updated":"2024-07-05T13:08:58Z","published":"2024-07-05T13:08:58Z","title":"Leveraging Graph Structures to Detect Hallucinations in Large Language\n Models","summary":" Large language models are extensively applied across a wide range of tasks,\nsuch as customer support, content creation, educational tutoring, and providing\nfinancial guidance. However, a well-known drawback is their predisposition to\ngenerate hallucinations. This damages the trustworthiness of the information\nthese models provide, impacting decision-making and user confidence. 
We propose\na method to detect hallucinations by looking at the structure of the latent\nspace and finding associations within hallucinated and non-hallucinated\ngenerations. We create a graph structure that connects generations that lie\nclosely in the embedding space. Moreover, we employ a Graph Attention Network\nwhich utilizes message passing to aggregate information from neighboring nodes\nand assigns varying degrees of importance to each neighbor based on their\nrelevance. Our findings show that 1) there exists a structure in the latent\nspace that differentiates between hallucinated and non-hallucinated\ngenerations, 2) Graph Attention Networks can learn this structure and\ngeneralize it to unseen generations, and 3) the robustness of our method is\nenhanced when incorporating contrastive learning. When evaluated against\nevidence-based benchmarks, our model performs similarly without access to\nsearch-based methods.\n","authors":["Noa Nonkes","Sergei Agaronian","Evangelos Kanoulas","Roxana Petcu"],"pdf_url":"https://arxiv.org/pdf/2407.04485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02964v2","updated":"2024-07-05T13:04:10Z","published":"2024-02-05T12:42:21Z","title":"Mixed Noise and Posterior Estimation with Conditional DeepGEM","summary":" Motivated by indirect measurements and applications from nanometrology with a\nmixed noise model, we develop a novel algorithm for jointly estimating the\nposterior and the noise parameters in Bayesian inverse problems. We propose to\nsolve the problem by an expectation maximization (EM) algorithm. Based on the\ncurrent noise parameters, we learn in the E-step a conditional normalizing flow\nthat approximates the posterior. In the M-step, we propose to find the noise\nparameter updates again by an EM algorithm, which has analytical formulas. We\ncompare the training of the conditional normalizing flow with the forward and\nreverse KL, and show that our model is able to incorporate information from\nmany measurements, unlike previous approaches.\n","authors":["Paul Hagemann","Johannes Hertrich","Maren Casfor","Sebastian Heidenreich","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2402.02964v2.pdf","comment":"Published in Machine Learning: Science and Technology"},{"id":"http://arxiv.org/abs/2407.04481v1","updated":"2024-07-05T13:04:06Z","published":"2024-07-05T13:04:06Z","title":"Using Petri Nets as an Integrated Constraint Mechanism for Reinforcement\n Learning Tasks","summary":" The lack of trust in algorithms is usually an issue when using Reinforcement\nLearning (RL) agents for control in real-world domains such as production\nplants, autonomous vehicles, or traffic-related infrastructure, partly due to\nthe lack of verifiability of the model itself. In such scenarios, Petri nets\n(PNs) are often available for flowcharts or process steps, as they are\nversatile and standardized. In order to facilitate integration of RL models and\nas a step towards increasing AI trustworthiness, we propose an approach that\nuses PNs with three main advantages over typical RL approaches: Firstly, the\nagent can now easily be modeled with a combined state including both external\nenvironmental observations and agent-specific state information from a given\nPN. Secondly, we can enforce constraints for state-dependent actions through\nthe inherent PN model. And lastly, we can increase trustworthiness by verifying\nPN properties through techniques such as model checking. 
We test our approach\non a typical four-way intersection traffic light control setting and present\nour results, beating cycle-based baselines.\n","authors":["Timon Sachweh","Pierre Haritz","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2407.04481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04480v1","updated":"2024-07-05T13:01:36Z","published":"2024-07-05T13:01:36Z","title":"LoCo: Low-Bit Communication Adaptor for Large-scale Model Training","summary":" To efficiently train large-scale models, low-bit gradient communication\ncompresses full-precision gradients on local GPU nodes into low-precision ones\nfor higher gradient synchronization efficiency among GPU nodes. However, it\noften degrades training quality due to compression information loss. To address\nthis, we propose the Low-bit Communication Adaptor (LoCo), which compensates\ngradients on local GPU nodes before compression, ensuring efficient\nsynchronization without compromising training quality. Specifically, LoCo\ndesigns a moving average of historical compensation errors to stably estimate\nconcurrent compression error and then adopts it to compensate for the\nconcurrent gradient compression, yielding a less lossless compression. This\nmechanism allows it to be compatible with general optimizers like Adam and\nsharding strategies like FSDP. Theoretical analysis shows that integrating LoCo\ninto full-precision optimizers like Adam and SGD does not impair their\nconvergence speed on nonconvex problems. Experimental results show that across\nlarge-scale model training frameworks like Megatron-LM and PyTorch's FSDP, LoCo\nsignificantly improves communication efficiency, e.g., improving Adam's\ntraining speed by 14% to 40% without performance degradation on large language\nmodels like LLAMAs and MoE.\n","authors":["Xingyu Xie","Zhijie Lin","Kim-Chuan Toh","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.04480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04476v1","updated":"2024-07-05T12:53:34Z","published":"2024-07-05T12:53:34Z","title":"Rethinking Data Input for Point Cloud Upsampling","summary":" In recent years, point cloud upsampling has been widely applied in fields\nsuch as 3D reconstruction and surface generation. However, existing point cloud\nupsampling inputs are all patch based, and there is no research discussing the\ndifferences and principles between point cloud model full input and patch based\ninput. In order to compare with patch based point cloud input, this article\nproposes a new data input method, which divides the full point cloud model to\nensure shape integrity while training PU-GCN. This article was validated on the\nPU1K and ABC datasets, but the results showed that Patch based performance is\nbetter than model based full input i.e. Average Segment input. Therefore, this\narticle explores the data input factors and model modules that affect the\nupsampling results of point clouds.\n","authors":["Tongxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04476v1.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.04472v1","updated":"2024-07-05T12:42:31Z","published":"2024-07-05T12:42:31Z","title":"EventChat: Implementation and user-centric evaluation of a large\n language model-driven conversational recommender system for exploring leisure\n events in an SME context","summary":" Large language models (LLMs) present an enormous evolution in the strategic\npotential of conversational recommender systems (CRS). 
Yet to date, research\nhas predominantly focused upon technical frameworks to implement LLM-driven\nCRS, rather than end-user evaluations or strategic implications for firms,\nparticularly from the perspective of small to medium enterprises (SMEs) that\nmake up the bedrock of the global economy. In the current paper, we detail the\ndesign of an LLM-driven CRS in an SME setting, and its subsequent performance\nin the field using both objective system metrics and subjective user\nevaluations. While doing so, we additionally outline a short-form revised\nResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly\nevolving field. Our results reveal good system performance from a user\nexperience perspective (85.5% recommendation accuracy) but underscore latency,\ncost, and quality issues challenging business viability. Notably, with a median\ncost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and\nresponse time emerge as crucial areas for achieving a more user-friendly and\neconomically viable LLM-driven CRS for SME settings. One major driver of these\ncosts is the use of an advanced LLM as a ranker within the retrieval-augmented\ngeneration (RAG) technique. Our results additionally indicate that relying\nsolely on approaches such as Prompt-based learning with ChatGPT as the\nunderlying LLM makes it challenging to achieve satisfactory quality in a\nproduction environment. Strategic considerations for SMEs deploying an\nLLM-driven CRS are outlined, particularly considering trade-offs in the current\ntechnical landscape.\n","authors":["Hannes Kunstmann","Joseph Ollier","Joel Persson","Florian von Wangenheim"],"pdf_url":"https://arxiv.org/pdf/2407.04472v1.pdf","comment":"27 pages, 3 tables, 5 figures, pre-print manuscript"},{"id":"http://arxiv.org/abs/2407.02844v2","updated":"2024-07-05T12:37:15Z","published":"2024-07-03T06:40:26Z","title":"Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast\n Cancer Segmentation and Identification","summary":" Breast cancer poses a profound threat to lives globally, claiming numerous\nlives each year. Therefore, timely detection is crucial for early intervention\nand improved chances of survival. Accurately diagnosing and classifying breast\ntumors using ultrasound images is a persistent challenge in medicine, demanding\ncutting-edge solutions for improved treatment strategies. This research\nintroduces multiattention-enhanced deep learning (DL) frameworks designed for\nthe classification and segmentation of breast cancer tumors from ultrasound\nimages. A spatial channel attention mechanism is proposed for segmenting tumors\nfrom ultrasound images, utilizing a novel LinkNet DL framework with an\nInceptionResNet backbone. Following this, the paper proposes a deep\nconvolutional neural network with an integrated multi-attention framework\n(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal.\nFrom experimental results, it is observed that the segmentation model has\nrecorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also\nachieved high Intersection over Union (IoU) and Dice Coefficient scores of\n96.9% and 97.2%, respectively. Similarly, the classification model has attained\nan accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification\nframework has achieved outstanding F1-Score, precision, and recall values of\n99.1%, 99.3%, and 99.1%, respectively. 
By offering a robust framework for early\ndetection and accurate classification of breast cancer, this proposed work\nsignificantly advances the field of medical image analysis, potentially\nimproving diagnostic precision and patient outcomes.\n","authors":["Pandiyaraju V","Shravan Venkatraman","Pavan Kumar S","Santhosh Malarvannan","Kannan A"],"pdf_url":"https://arxiv.org/pdf/2407.02844v2.pdf","comment":"29 pages, 15 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.06198v2","updated":"2024-07-05T12:34:41Z","published":"2024-04-09T10:41:59Z","title":"The impact of data set similarity and diversity on transfer learning\n success in time series forecasting","summary":" Pre-trained models have become pivotal in enhancing the efficiency and\naccuracy of time series forecasting on target data sets by leveraging transfer\nlearning. While benchmarks validate the performance of model generalization on\nvarious target data sets, there is no structured research providing similarity\nand diversity measures to explain which characteristics of source and target\ndata lead to transfer learning success. Our study pioneers in systematically\nevaluating the impact of source-target similarity and source diversity on\nzero-shot and fine-tuned forecasting outcomes in terms of accuracy, bias, and\nuncertainty estimation. We investigate these dynamics using pre-trained neural\nnetworks across five public source datasets, applied to forecasting five target\ndata sets, including real-world wholesales data. We identify two feature-based\nsimilarity and diversity measures, finding that source-target similarity\nreduces forecasting bias, while source diversity improves forecasting accuracy\nand uncertainty estimation, but increases the bias.\n","authors":["Claudia Ehrig","Benedikt Sonnleitner","Ursula Neumann","Catherine Cleophas","Germain Forestier"],"pdf_url":"https://arxiv.org/pdf/2404.06198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18839v6","updated":"2024-07-05T12:19:09Z","published":"2024-02-29T04:12:32Z","title":"Extended Flow Matching: a Method of Conditional Generation with\n Generalized Continuity Equation","summary":" The task of conditional generation is one of the most important applications\nof generative models, and numerous methods have been developed to date based on\nthe celebrated flow-based models. However, many flow-based models in use today\nare not built to allow one to introduce an explicit inductive bias to how the\nconditional distribution to be generated changes with respect to conditions.\nThis can result in unexpected behavior in the task of style transfer, for\nexample. In this research, we introduce extended flow matching (EFM), a direct\nextension of flow matching that learns a \"matrix field\" corresponding to the\ncontinuous map from the space of conditions to the space of distributions. We\nshow that we can introduce inductive bias to the conditional generation through\nthe matrix field and demonstrate this fact with MMOT-EFM, a version of EFM that\naims to minimize the Dirichlet energy or the sensitivity of the distribution\nwith respect to conditions. 
We will present our theory along with experimental\nresults that support the competitiveness of EFM in conditional generation.\n","authors":["Noboru Isobe","Masanori Koyama","Jinzhe Zhang","Kohei Hayashi","Kenji Fukumizu"],"pdf_url":"https://arxiv.org/pdf/2402.18839v6.pdf","comment":"27 pages, 10 figures, We have corrected an error in our experiment on\n COT-FM"},{"id":"http://arxiv.org/abs/2407.04460v1","updated":"2024-07-05T12:10:54Z","published":"2024-07-05T12:10:54Z","title":"Smart Sampling: Helping from Friendly Neighbors for Decentralized\n Federated Learning","summary":" Federated Learning (FL) is gaining widespread interest for its ability to\nshare knowledge while preserving privacy and reducing communication costs.\nUnlike Centralized FL, Decentralized FL (DFL) employs a network architecture\nthat eliminates the need for a central server, allowing direct communication\namong clients and leading to significant communication resource savings.\nHowever, due to data heterogeneity, not all neighboring nodes contribute to\nenhancing the local client's model performance. In this work, we introduce\n\\textbf{\\emph{AFIND+}}, a simple yet efficient algorithm for sampling and\naggregating neighbors in DFL, with the aim of leveraging collaboration to\nimprove clients' model performance. AFIND+ identifies helpful neighbors,\nadaptively adjusts the number of selected neighbors, and strategically\naggregates the sampled neighbors' models based on their contributions.\nNumerical results on real-world datasets with diverse data partitions\ndemonstrate that AFIND+ outperforms other sampling algorithms in DFL and is\ncompatible with most existing DFL optimization algorithms.\n","authors":["Lin Wang","Yang Chen","Yongxin Guo","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2407.04460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04451v1","updated":"2024-07-05T12:05:37Z","published":"2024-07-05T12:05:37Z","title":"Hindsight Preference Learning for Offline Preference-based Reinforcement\n Learning","summary":" Offline preference-based reinforcement learning (RL), which focuses on\noptimizing policies using human preferences between pairs of trajectory\nsegments selected from an offline dataset, has emerged as a practical avenue\nfor RL applications. Existing works rely on extracting step-wise reward signals\nfrom trajectory-wise preference annotations, assuming that preferences\ncorrelate with the cumulative Markovian rewards. However, such methods fail to\ncapture the holistic perspective of data annotation: Humans often assess the\ndesirability of a sequence of actions by considering the overall outcome rather\nthan the immediate rewards. To address this challenge, we propose to model\nhuman preferences using rewards conditioned on future outcomes of the\ntrajectory segments, i.e. the hindsight information. For downstream RL\noptimization, the reward of each step is calculated by marginalizing over\npossible future outcomes, the distribution of which is approximated by a\nvariational auto-encoder trained using the offline dataset. Our proposed\nmethod, Hindsight Preference Learning (HPL), can facilitate credit assignment\nby taking full advantage of vast trajectory data available in massive unlabeled\ndatasets. Comprehensive empirical studies demonstrate the benefits of HPL in\ndelivering robust and advantageous rewards across various domains. 
Our code is\npublicly released at https://github.com/typoverflow/WiseRL.\n","authors":["Chen-Xiao Gao","Shengjun Fang","Chenjun Xiao","Yang Yu","Zongzhang Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04449v1","updated":"2024-07-05T12:04:12Z","published":"2024-07-05T12:04:12Z","title":"Multi-modal Masked Siamese Network Improves Chest X-Ray Representation\n Learning","summary":" Self-supervised learning methods for medical images primarily rely on the\nimaging modality during pretraining. While such approaches deliver promising\nresults, they do not leverage associated patient or scan information collected\nwithin Electronic Health Records (EHR). Here, we propose to incorporate EHR\ndata during self-supervised pretraining with a Masked Siamese Network (MSN) to\nenhance the quality of chest X-ray representations. We investigate three types\nof EHR data, including demographic, scan metadata, and inpatient stay\ninformation. We evaluate our approach on three publicly available chest X-ray\ndatasets, MIMIC-CXR, CheXpert, and NIH-14, using two vision transformer (ViT)\nbackbones, specifically ViT-Tiny and ViT-Small. In assessing the quality of the\nrepresentations via linear evaluation, our proposed method demonstrates\nsignificant improvement compared to vanilla MSN and state-of-the-art\nself-supervised learning baselines. Our work highlights the potential of\nEHR-enhanced self-supervised pre-training for medical imaging. The code is\npublicly available at: https://github.com/nyuad-cai/CXR-EHR-MSN\n","authors":["Saeed Shurrab","Alejandro Guerra-Manzanares","Farah E. Shamout"],"pdf_url":"https://arxiv.org/pdf/2407.04449v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.04440v1","updated":"2024-07-05T11:42:39Z","published":"2024-07-05T11:42:39Z","title":"Wavelet-based Temporal Attention Improves Traffic Forecasting","summary":" Spatio-temporal forecasting of traffic flow data represents a typical problem\nin the field of machine learning, impacting urban traffic management systems.\nTraditional statistical and machine learning methods cannot adequately handle\nboth the temporal and spatial dependencies in these complex traffic flow\ndatasets. A prevalent approach in the field is to combine graph convolutional\nnetworks and multi-head attention mechanisms for spatio-temporal processing.\nThis paper proposes a wavelet-based temporal attention model, namely a\nwavelet-based dynamic spatio-temporal aware graph neural network (W-DSTAGNN),\nfor tackling the traffic forecasting problem. Benchmark experiments using\nseveral statistical metrics confirm that our proposal efficiently captures\nspatio-temporal correlations and outperforms ten state-of-the-art models on\nthree different real-world traffic datasets. 
Our proposed ensemble data-driven\nmethod can handle dynamic temporal and spatial dependencies and make long-term\nforecasts in an efficient manner.\n","authors":["Yash Jakhmola","Nitish Kumar Mishra","Kripabandhu Ghosh","Tanujit Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2407.04440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13595v3","updated":"2024-07-05T11:34:10Z","published":"2023-06-23T16:35:00Z","title":"Autoencoders for Real-Time SUEP Detection","summary":" Confining dark sectors with pseudo-conformal dynamics can produce Soft\nUnclustered Energy Patterns (SUEP), at the Large Hadron Collider: the\nproduction of dark quarks in proton-proton collisions leading to a dark shower\nand the high-multiplicity production of dark hadrons. The final experimental\nsignature is spherically-symmetric energy deposits by an anomalously large\nnumber of soft Standard Model particles with a transverse energy of O(100) MeV.\nAssuming Yukawa-like couplings of the scalar portal state, the dominant\nproduction mode is gluon fusion, and the dominant background comes from\nmulti-jet QCD events. We have developed a deep learning-based Anomaly Detection\ntechnique to reject QCD jets and identify any anomalous signature, including\nSUEP, in real-time in the High-Level Trigger system of the Compact Muon\nSolenoid experiment at the Large Hadron Collider. A deep convolutional neural\nautoencoder network has been trained using QCD events by taking transverse\nenergy deposits in the inner tracker, electromagnetic calorimeter, and hadron\ncalorimeter sub-detectors as 3-channel image data. Due to the sparse nature of\nthe data, only ~0.5% of the total ~300 k image pixels have non-zero values. To\ntackle this challenge, a non-standard loss function, the inverse of the\nso-called Dice Loss, is exploited. The trained autoencoder with learned spatial\nfeatures of QCD jets can detect 40% of the SUEP events, with a QCD event\nmistagging rate as low as 2%. The model inference time has been measured using\nthe Intel CoreTM i5-9600KF processor and found to be ~20 ms, which perfectly\nsatisfies the High-Level Trigger system's latency of O(100) ms. Given the\nvirtue of the unsupervised learning of the autoencoders, the trained model can\nbe applied to any new physics model that predicts an experimental signature\nanomalous to QCD jets.\n","authors":["Simranjit Singh Chhibra","Nadezda Chernyavskaya","Benedikt Maier","Maurzio Pierini","Syed Hasan"],"pdf_url":"https://arxiv.org/pdf/2306.13595v3.pdf","comment":"9 pages, 9 figures, 1 table, 1 equation"},{"id":"http://arxiv.org/abs/2402.11997v2","updated":"2024-07-05T11:26:51Z","published":"2024-02-19T09:43:03Z","title":"Remember This Event That Year? Assessing Temporal Information and\n Reasoning in Large Language Models","summary":" Large Language Models (LLMs) are increasingly ubiquitous, yet their ability\nto retain and reason about temporal information remains limited, hindering\ntheir application in real-world scenarios where understanding the sequential\nnature of events is crucial. Our study experiments with 12 state-of-the-art\nmodels (ranging from 2B to 70B+ parameters) on a novel numerical-temporal\ndataset, \\textbf{TempUN}, spanning from 10,000 BCE to 2100 CE, to uncover\nsignificant temporal retention and comprehension limitations. We propose six\nmetrics to assess three learning paradigms to enhance temporal knowledge\nacquisition. 
Our findings reveal that open-source models exhibit knowledge gaps\nmore frequently, suggesting a trade-off between limited knowledge and incorrect\nresponses. Additionally, various fine-tuning approaches significantly improved\nperformance, reducing incorrect outputs and impacting the identification of\n'information not available' in the generations. The associated dataset and code\nare available at (https://github.com/lingoiitgn/TempUN).\n","authors":["Himanshu Beniwal","Dishant Patel","Kowsik Nandagopan D","Hritik Ladia","Ankit Yadav","Mayank Singh"],"pdf_url":"https://arxiv.org/pdf/2402.11997v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04418v1","updated":"2024-07-05T11:09:05Z","published":"2024-07-05T11:09:05Z","title":"Enabling On-Device LLMs Personalization with Smartphone Sensing","summary":" This demo presents a novel end-to-end framework that combines on-device large\nlanguage models (LLMs) with smartphone sensing technologies to achieve\ncontext-aware and personalized services. The framework addresses critical\nlimitations of current personalization solutions via cloud-based LLMs, such as\nprivacy concerns, latency and cost, and limited personal sensor data. To\nachieve this, we innovatively proposed deploying LLMs on smartphones with\nmultimodal sensor data and customized prompt engineering, ensuring privacy and\nenhancing personalization performance through context-aware sensing. A case\nstudy involving a university student demonstrated the proposed framework's\ncapability to provide tailored recommendations. In addition, we show that the\nproposed framework achieves the best trade-off in privacy, performance,\nlatency, cost, battery and energy consumption between on-device and cloud LLMs.\nFuture work aims to integrate more diverse sensor data and conduct large-scale\nuser studies to further refine the personalization. We envision the proposed\nframework could significantly improve user experiences in various domains such\nas healthcare, productivity, and entertainment by providing secure,\ncontext-aware, and efficient interactions directly on users' devices.\n","authors":["Shiquan Zhang","Ying Ma","Le Fang","Hong Jia","Simon D'Alfonso","Vassilis Kostakos"],"pdf_url":"https://arxiv.org/pdf/2407.04418v1.pdf","comment":"5 pages, 3 figures, conference demo paper"},{"id":"http://arxiv.org/abs/2406.11909v2","updated":"2024-07-05T11:06:12Z","published":"2024-06-16T14:19:49Z","title":"Mixture-of-Subspaces in Low-Rank Adaptation","summary":" In this paper, we introduce a subspace-inspired Low-Rank Adaptation (LoRA)\nmethod, which is computationally efficient, easy to implement, and readily\napplicable to large language, multimodal, and diffusion models. Initially, we\nequivalently decompose the weights of LoRA into two subspaces, and find that\nsimply mixing them can enhance performance. To study such a phenomenon, we\nrevisit it through a fine-grained subspace lens, showing that such modification\nis equivalent to employing a fixed mixer to fuse the subspaces. To be more\nflexible, we jointly learn the mixer with the original LoRA weights, and term\nthe method Mixture-of-Subspaces LoRA (MoSLoRA). MoSLoRA consistently\noutperforms LoRA on tasks in different modalities, including commonsense\nreasoning, visual instruction tuning, and subject-driven text-to-image\ngeneration, demonstrating its effectiveness and robustness. 
Codes are available\nat https://github.com/wutaiqiang/MoSLoRA.\n","authors":["Taiqiang Wu","Jiahao Wang","Zhe Zhao","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2406.11909v2.pdf","comment":"working in progress"},{"id":"http://arxiv.org/abs/2407.02984v2","updated":"2024-07-05T10:48:27Z","published":"2024-07-03T10:31:30Z","title":"Semantically Rich Local Dataset Generation for Explainable AI in\n Genomics","summary":" Black box deep learning models trained on genomic sequences excel at\npredicting the outcomes of different gene regulatory mechanisms. Therefore,\ninterpreting these models may provide novel insights into the underlying\nbiology, supporting downstream biomedical applications. Due to their\ncomplexity, interpretable surrogate models can only be built for local\nexplanations (e.g., a single instance). However, accomplishing this requires\ngenerating a dataset in the neighborhood of the input, which must maintain\nsyntactic similarity to the original data while introducing semantic\nvariability in the model's predictions. This task is challenging due to the\ncomplex sequence-to-function relationship of DNA.\n We propose using Genetic Programming to generate datasets by evolving\nperturbations in sequences that contribute to their semantic diversity. Our\ncustom, domain-guided individual representation effectively constrains\nsyntactic similarity, and we provide two alternative fitness functions that\npromote diversity with no computational effort. Applied to the RNA splicing\ndomain, our approach quickly achieves good diversity and significantly\noutperforms a random baseline in exploring the search space, as shown by our\nproof-of-concept, short RNA sequence. Furthermore, we assess its\ngeneralizability and demonstrate scalability to larger sequences, resulting in\na ~30% improvement over the baseline.\n","authors":["Pedro Barbosa","Rosina Savisaar","Alcides Fonseca"],"pdf_url":"https://arxiv.org/pdf/2407.02984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04407v1","updated":"2024-07-05T10:43:41Z","published":"2024-07-05T10:43:41Z","title":"Trustworthy Classification through Rank-Based Conformal Prediction Sets","summary":" Machine learning classification tasks often benefit from predicting a set of\npossible labels with confidence scores to capture uncertainty. However,\nexisting methods struggle with the high-dimensional nature of the data and the\nlack of well-calibrated probabilities from modern classification models. We\npropose a novel conformal prediction method that employs a rank-based score\nfunction suitable for classification models that predict the order of labels\ncorrectly, even if not well-calibrated. Our approach constructs prediction sets\nthat achieve the desired coverage rate while managing their size. We provide a\ntheoretical analysis of the expected size of the conformal prediction sets\nbased on the rank distribution of the underlying classifier. Through extensive\nexperiments, we demonstrate that our method outperforms existing techniques on\nvarious datasets, providing reliable uncertainty quantification. Our\ncontributions include a novel conformal prediction method, theoretical\nanalysis, and empirical evaluation. 
This work advances the practical deployment\nof machine learning systems by enabling reliable uncertainty quantification.\n","authors":["Rui Luo","Zhixin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.04407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04406v1","updated":"2024-07-05T10:43:24Z","published":"2024-07-05T10:43:24Z","title":"On Quantum Channel Learning","summary":" The problem of an optimal mapping between Hilbert spaces $IN$ and $OUT$,\nbased on a series of density matrix mapping measurements $\\rho^{(l)} \\to\n\\varrho^{(l)}$, $l=1\\dots M$, is formulated as an optimization problem\nmaximizing the total fidelity $\\mathcal{F}=\\sum_{l=1}^{M} \\omega^{(l)}\nF\\left(\\varrho^{(l)},\\sum_s B_s \\rho^{(l)} B^{\\dagger}_s\\right)$ subject to\nprobability preservation constraints on Kraus operators $B_s$. For\n$F(\\varrho,\\sigma)$ in the form that total fidelity can be represented as a\nquadratic form with superoperator $\\mathcal{F}=\\sum_s\\left\\langle\nB_s\\middle|S\\middle| B_s \\right\\rangle$ (either exactly or as an approximation)\nan iterative algorithm is developed to find the global maximum. The result\ncomprises in $N_s$ operators $B_s$ that collectively form an $IN$ to $OUT$\nquantum channel $A^{OUT}=\\sum_s B_s A^{IN} B_s^{\\dagger}$. The work introduces\ntwo important generalizations of unitary learning: 1. $IN$/$OUT$ states are\nrepresented as density matrices. 2. The mapping itself is formulated as a\ngeneral quantum channel. This marks a crucial advancement from the commonly\nstudied unitary mapping of pure states $\\phi_l=\\mathcal{U} \\psi_l$ to a general\nquantum channel, what allows us to distinguish probabilistic mixture of states\nand their superposition. An application of the approach is demonstrated on\nunitary learning of density matrix mapping $\\varrho^{(l)}=\\mathcal{U}\n\\rho^{(l)} \\mathcal{U}^{\\dagger}$, in this case a quadratic on $\\mathcal{U}$\nfidelity can be constructed by considering $\\sqrt{\\rho^{(l)}} \\to\n\\sqrt{\\varrho^{(l)}}$ mapping, and on a general quantum channel of Kraus rank\n$N_s$, where quadratic on $B_s$ fidelity is an approximation -- a quantum\nchannel is then built as a hierarchy of unitary mappings. The approach can be\napplied to study decoherence effects, spontaneous coherence, synchronizing,\netc.\n","authors":["Mikhail Gennadievich Belov","Victor Victorovich Dubov","Alexey Vladimirovich Filimonov","Vladislav Gennadievich Malyshkin"],"pdf_url":"https://arxiv.org/pdf/2407.04406v1.pdf","comment":"The unitary learning from arXiv:2405.10263 is generalized to density\n matrices and quantum channels"},{"id":"http://arxiv.org/abs/2407.04405v1","updated":"2024-07-05T10:41:15Z","published":"2024-07-05T10:41:15Z","title":"Discovering symbolic expressions with parallelized tree search","summary":" Symbolic regression plays a crucial role in modern scientific research thanks\nto its capability of discovering concise and interpretable mathematical\nexpressions from data. A grand challenge lies in the arduous search for\nparsimonious and generalizable mathematical formulas, in an infinite search\nspace, while intending to fit the training data. Existing algorithms have faced\na critical bottleneck of accuracy and efficiency over a decade when handling\nproblems of complexity, which essentially hinders the pace of applying symbolic\nregression for scientific exploration across interdisciplinary domains. 
To this\nend, we introduce a parallelized tree search (PTS) model to efficiently distill\ngeneric mathematical expressions from limited data. Through a series of\nextensive experiments, we demonstrate the superior accuracy and efficiency of\nPTS for equation discovery, which greatly outperforms the state-of-the-art\nbaseline models on over 80 synthetic and experimental datasets (e.g., lifting\nits performance by up to 99% accuracy improvement and one-order of magnitude\nspeed up). PTS represents a key advance in accurate and efficient data-driven\ndiscovery of symbolic, interpretable models (e.g., underlying physical laws)\nand marks a pivotal transition towards scalable symbolic learning.\n","authors":["Kai Ruan","Ze-Feng Gao","Yike Guo","Hao Sun","Ji-Rong Wen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04400v1","updated":"2024-07-05T10:20:24Z","published":"2024-07-05T10:20:24Z","title":"Hard-Attention Gates with Gradient Routing for Endoscopic Image\n Computing","summary":" To address overfitting and enhance model generalization in\ngastroenterological polyp size assessment, our study introduces\nFeature-Selection Gates (FSG) or Hard-Attention Gates (HAG) alongside Gradient\nRouting (GR) for dynamic feature selection. This technique aims to boost\nConvolutional Neural Networks (CNNs) and Vision Transformers (ViTs) by\npromoting sparse connectivity, thereby reducing overfitting and enhancing\ngeneralization. HAG achieves this through sparsification with learnable\nweights, serving as a regularization strategy. GR further refines this process\nby optimizing HAG parameters via dual forward passes, independently from the\nmain model, to improve feature re-weighting. Our evaluation spanned multiple\ndatasets, including CIFAR-100 for a broad impact assessment and specialized\nendoscopic datasets (REAL-Colon, Misawa, and SUN) focusing on polyp size\nestimation, covering over 200 polyps in more than 370,000 frames. The findings\nindicate that our HAG-enhanced networks substantially enhance performance in\nboth binary and triclass classification tasks related to polyp sizing.\nSpecifically, CNNs experienced an F1 Score improvement to 87.8% in binary\nclassification, while in triclass classification, the ViT-T model reached an F1\nScore of 76.5%, outperforming traditional CNNs and ViT-T models. To facilitate\nfurther research, we are releasing our codebase, which includes implementations\nfor CNNs, multistream CNNs, ViT, and HAG-augmented variants. This resource aims\nto standardize the use of endoscopic datasets, providing public\ntraining-validation-testing splits for reliable and comparable research in\ngastroenterological polyp size estimation. The codebase is available at\ngithub.com/cosmoimd/feature-selection-gates.\n","authors":["Giorgio Roffo","Carlo Biffi","Pietro Salvagnini","Andrea Cherubini"],"pdf_url":"https://arxiv.org/pdf/2407.04400v1.pdf","comment":"Attention Gates, Hard-Attention Gates, Gradient Routing, Feature\n Selection Gates, Endoscopy, Medical Image Processing, Computer Vision"},{"id":"http://arxiv.org/abs/2401.10690v2","updated":"2024-07-05T10:19:36Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. 
predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03342v3","updated":"2024-07-05T10:09:10Z","published":"2024-05-06T10:49:51Z","title":"Doubly Robust Causal Effect Estimation under Networked Interference via\n Targeted Learning","summary":" Causal effect estimation under networked interference is an important but\nchallenging problem. Available parametric methods are limited in their model\nspace, while previous semiparametric methods, e.g., leveraging neural networks\nto fit only one single nuisance function, may still encounter misspecification\nproblems under networked interference without appropriate assumptions on the\ndata generation process. To mitigate bias stemming from misspecification, we\npropose a novel doubly robust causal effect estimator under networked\ninterference, by adapting the targeted learning technique to the training of\nneural networks. Specifically, we generalize the targeted learning technique\ninto the networked interference setting and establish the condition under which\nan estimator achieves double robustness. Based on the condition, we devise an\nend-to-end causal effect estimator by transforming the identified theoretical\ncondition into a targeted loss. Moreover, we provide a theoretical analysis of\nour designed estimator, revealing a faster convergence rate compared to a\nsingle nuisance model. 
Extensive experimental results on two real-world\nnetworks with semisynthetic data demonstrate the effectiveness of our proposed\nestimators.\n","authors":["Weilin Chen","Ruichu Cai","Zeqin Yang","Jie Qiao","Yuguang Yan","Zijian Li","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2405.03342v3.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2407.04393v1","updated":"2024-07-05T10:05:35Z","published":"2024-07-05T10:05:35Z","title":"Function Smoothing Regularization for Precision Factorization Machine\n Annealing in Continuous Variable Optimization Problems","summary":" Solving continuous variable optimization problems by factorization machine\nquantum annealing (FMQA) demonstrates the potential of Ising machines to be\nextended as a solver for integer and real optimization problems. However, the\ndetails of the Hamiltonian function surface obtained by factorization machine\n(FM) have been overlooked. This study shows that in the widely common case\nwhere real numbers are represented by a combination of binary variables, the\nfunction surface of the Hamiltonian obtained by FM can be very noisy. This\nnoise interferes with the inherent capabilities of quantum annealing and is\nlikely to be a substantial cause of problems previously considered unsolvable\ndue to the limitations of FMQA performance. The origin of the noise is\nidentified and a simple, general method is proposed to prevent its occurrence.\nThe generalization performance of the proposed method and its ability to solve\npractical problems is demonstrated.\n","authors":["Katsuhiro Endo","Kazuaki Z. Takahashi"],"pdf_url":"https://arxiv.org/pdf/2407.04393v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.06271v2","updated":"2024-07-05T10:03:40Z","published":"2024-02-09T09:37:28Z","title":"Adaptive proximal gradient methods are universal without approximation","summary":" We show that adaptive proximal gradient methods for convex problems are not\nrestricted to traditional Lipschitzian assumptions. Our analysis reveals that a\nclass of linesearch-free methods is still convergent under mere local H\\\"older\ngradient continuity, covering in particular continuously differentiable\nsemi-algebraic functions. To mitigate the lack of local Lipschitz continuity,\npopular approaches revolve around $\\varepsilon$-oracles and/or linesearch\nprocedures. In contrast, we exploit plain H\\\"older inequalities not entailing\nany approximation, all while retaining the linesearch-free nature of adaptive\nschemes. Furthermore, we prove full sequence convergence without prior\nknowledge of local H\\\"older constants nor of the order of H\\\"older continuity.\nNumerical experiments make comparisons with baseline methods on diverse tasks\nfrom machine learning covering both the locally and the globally H\\\"older\nsetting.\n","authors":["Konstantinos A. Oikonomidis","Emanuel Laude","Puya Latafat","Andreas Themelis","Panagiotis Patrinos"],"pdf_url":"https://arxiv.org/pdf/2402.06271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12434v2","updated":"2024-07-05T09:50:06Z","published":"2024-06-18T09:29:24Z","title":"Towards Audio Codec-based Speech Separation","summary":" Recent improvements in neural audio codec (NAC) models have generated\ninterest in adopting pre-trained codecs for a variety of speech processing\napplications to take advantage of the efficiencies gained from high\ncompression, but these have yet been applied to the speech separation (SS)\ntask. 
SS can benefit from high compression because the compute required for\ntraditional SS models makes them impractical for many edge computing use cases.\nHowever, SS is a waveform-masking task where compression tends to introduce\ndistortions that severely impact performance. Here we propose a novel task of\nAudio Codec-based SS, where SS is performed within the embedding space of a\nNAC, and propose a new model, Codecformer, to address this task. At inference,\nCodecformer achieves a 52x reduction in MAC while producing separation\nperformance comparable to a cloud deployment of Sepformer. This method charts a\nnew direction for performing efficient SS in practical scenarios.\n","authors":["Jia Qi Yip","Shengkui Zhao","Dianwen Ng","Eng Siong Chng","Bin Ma"],"pdf_url":"https://arxiv.org/pdf/2406.12434v2.pdf","comment":"This paper was accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2312.08888v3","updated":"2024-07-05T09:43:41Z","published":"2023-12-13T13:11:44Z","title":"Read Between the Layers: Leveraging Multi-Layer Representations for\n Rehearsal-Free Continual Learning with Pre-Trained Models","summary":" We address the Continual Learning (CL) problem, wherein a model must learn a\nsequence of tasks from non-stationary distributions while preserving prior\nknowledge upon encountering new experiences. With the advancement of foundation\nmodels, CL research has pivoted from the initial learning-from-scratch paradigm\ntowards utilizing generic features from large-scale pre-training. However,\nexisting approaches to CL with pre-trained models primarily focus on separating\nclass-specific features from the final representation layer and neglect the\npotential of intermediate representations to capture low- and mid-level\nfeatures, which are more invariant to domain shifts. In this work, we propose\nLayUP, a new prototype-based approach to CL that leverages second-order feature\nstatistics from multiple intermediate layers of a pre-trained network. Our\nmethod is conceptually simple, does not require access to prior data, and works\nout of the box with any foundation model. LayUP surpasses the state of the art\nin four of the seven class-incremental learning benchmarks, all three\ndomain-incremental learning benchmarks and in six of the seven online continual\nlearning benchmarks, while significantly reducing memory and computational\nrequirements compared to existing baselines. Our results demonstrate that fully\nexhausting the representational capacities of pre-trained models in CL goes\nwell beyond their final embeddings.\n","authors":["Kyra Ahrens","Hans Hergen Lehmann","Jae Hee Lee","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2312.08888v3.pdf","comment":"Accepted for publication in Transactions of Machine Learning Research\n (TMLR) journal"},{"id":"http://arxiv.org/abs/2402.09142v2","updated":"2024-07-05T09:42:01Z","published":"2024-02-14T12:48:17Z","title":"When Representations Align: Universality in Representation Learning\n Dynamics","summary":" Deep neural networks come in many sizes and architectures. The choice of\narchitecture, in conjunction with the dataset and learning algorithm, is\ncommonly understood to affect the learned neural representations. Yet, recent\nresults have shown that different architectures learn representations with\nstriking qualitative similarities. 
Here we derive an effective theory of\nrepresentation learning under the assumption that the encoding map from input\nto hidden representation and the decoding map from representation to output are\narbitrary smooth functions. This theory schematizes representation learning\ndynamics in the regime of complex, large architectures, where hidden\nrepresentations are not strongly constrained by the parametrization. We show\nthrough experiments that the effective theory describes aspects of\nrepresentation learning dynamics across a range of deep networks with different\nactivation functions and architectures, and exhibits phenomena similar to the\n\"rich\" and \"lazy\" regime. While many network behaviors depend quantitatively on\narchitecture, our findings point to certain behaviors that are widely conserved\nonce models are sufficiently flexible.\n","authors":["Loek van Rossem","Andrew M. Saxe"],"pdf_url":"https://arxiv.org/pdf/2402.09142v2.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2403.00873v2","updated":"2024-07-05T09:36:26Z","published":"2024-03-01T07:41:05Z","title":"Blockchain-empowered Federated Learning: Benefits, Challenges, and\n Solutions","summary":" Federated learning (FL) is a distributed machine learning approach that\nprotects user data privacy by training models locally on clients and\naggregating them on a parameter server. While effective at preserving privacy,\nFL systems face limitations such as single points of failure, lack of\nincentives, and inadequate security. To address these challenges, blockchain\ntechnology is integrated into FL systems to provide stronger security,\nfairness, and scalability. However, blockchain-empowered FL (BC-FL) systems\nintroduce additional demands on network, computing, and storage resources. This\nsurvey provides a comprehensive review of recent research on BC-FL systems,\nanalyzing the benefits and challenges associated with blockchain integration.\nWe explore why blockchain is applicable to FL, how it can be implemented, and\nthe challenges and existing solutions for its integration. Additionally, we\noffer insights on future research directions for the BC-FL system.\n","authors":["Zeju Cai","Jianguo Chen","Yuting Fan","Zibin Zheng","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2403.00873v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2312.01472v2","updated":"2024-07-05T09:31:43Z","published":"2023-12-03T18:15:58Z","title":"BenchMARL: Benchmarking Multi-Agent Reinforcement Learning","summary":" The field of Multi-Agent Reinforcement Learning (MARL) is currently facing a\nreproducibility crisis. While solutions for standardized reporting have been\nproposed to address the issue, we still lack a benchmarking tool that enables\nstandardization and reproducibility, while leveraging cutting-edge\nReinforcement Learning (RL) implementations. In this paper, we introduce\nBenchMARL, the first MARL training library created to enable standardized\nbenchmarking across different algorithms, models, and environments. BenchMARL\nuses TorchRL as its backend, granting it high performance and maintained\nstate-of-the-art implementations while addressing the broad community of MARL\nPyTorch users. Its design enables systematic configuration and reporting, thus\nallowing users to create and run complex benchmarks from simple one-line\ninputs. 
BenchMARL is open-sourced on GitHub:\nhttps://github.com/facebookresearch/BenchMARL\n","authors":["Matteo Bettini","Amanda Prorok","Vincent Moens"],"pdf_url":"https://arxiv.org/pdf/2312.01472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04370v1","updated":"2024-07-05T09:16:56Z","published":"2024-07-05T09:16:56Z","title":"Regulating Model Reliance on Non-Robust Features by Smoothing Input\n Marginal Density","summary":" Trustworthy machine learning necessitates meticulous regulation of model\nreliance on non-robust features. We propose a framework to delineate and\nregulate such features by attributing model predictions to the input. Within\nour approach, robust feature attributions exhibit a certain consistency, while\nnon-robust feature attributions are susceptible to fluctuations. This behavior\nallows identification of correlation between model reliance on non-robust\nfeatures and smoothness of marginal density of the input samples. Hence, we\nuniquely regularize the gradients of the marginal density w.r.t. the input\nfeatures for robustness. We also devise an efficient implementation of our\nregularization to address the potential numerical instability of the underlying\noptimization process. Moreover, we analytically reveal that, as opposed to our\nmarginal density smoothing, the prevalent input gradient regularization\nsmoothens conditional or joint density of the input, which can cause limited\nrobustness. Our experiments validate the effectiveness of the proposed method,\nproviding clear evidence of its capability to address the feature leakage\nproblem and mitigate spurious correlations. Extensive results further establish\nthat our technique enables the model to exhibit robustness against\nperturbations in pixel values, input gradients, and density.\n","authors":["Peiyu Yang","Naveed Akhtar","Mubarak Shah","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2407.04370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04358v1","updated":"2024-07-05T08:53:06Z","published":"2024-07-05T08:53:06Z","title":"An Adaptive Stochastic Gradient Method with Non-negative Gauss-Newton\n Stepsizes","summary":" We consider the problem of minimizing the average of a large number of smooth\nbut possibly non-convex functions. In the context of most machine learning\napplications, each loss function is non-negative and thus can be expressed as\nthe composition of a square and its real-valued square root. This reformulation\nallows us to apply the Gauss-Newton method, or the Levenberg-Marquardt method\nwhen adding a quadratic regularization. The resulting algorithm, while being\ncomputationally as efficient as the vanilla stochastic gradient method, is\nhighly adaptive and can automatically warmup and decay the effective stepsize\nwhile tracking the non-negative loss landscape. We provide a tight convergence\nanalysis, leveraging new techniques, in the stochastic convex and non-convex\nsettings. In particular, in the convex case, the method does not require access\nto the gradient Lipshitz constant for convergence, and is guaranteed to never\ndiverge. 
The convergence rates and empirical evaluations compare favorably to\nthe classical (stochastic) gradient method as well as to several other adaptive\nmethods.\n","authors":["Antonio Orvieto","Lin Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.04358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09180v3","updated":"2024-07-05T08:51:16Z","published":"2022-05-18T18:57:36Z","title":"Learning Rate Curriculum","summary":" Most curriculum learning methods require an approach to sort the data samples\nby difficulty, which is often cumbersome to perform. In this work, we propose a\nnovel curriculum learning approach termed Learning Rate Curriculum (LeRaC),\nwhich leverages the use of a different learning rate for each layer of a neural\nnetwork to create a data-agnostic curriculum during the initial training\nepochs. More specifically, LeRaC assigns higher learning rates to neural layers\ncloser to the input, gradually decreasing the learning rates as the layers are\nplaced farther away from the input. The learning rates increase at various\npaces during the first training iterations, until they all reach the same\nvalue. From this point on, the neural model is trained as usual. This creates a\nmodel-level curriculum learning strategy that does not require sorting the\nexamples by difficulty and is compatible with any neural network, generating\nhigher performance levels regardless of the architecture. We conduct\ncomprehensive experiments on 12 data sets from the computer vision (CIFAR-10,\nCIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC),\nlanguage (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering\nvarious convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5),\nrecurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare\nour approach with the conventional training regime, as well as with Curriculum\nby Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning\napproach. Unlike CBS, our performance improvements over the standard training\nregime are consistent across all data sets and models. Furthermore, we\nsignificantly surpass CBS in terms of training time (there is no additional\ncost over the standard training regime for LeRaC). Our code is freely available\nat: https://github.com/CroitoruAlin/LeRaC.\n","authors":["Florinel-Alin Croitoru","Nicolae-Catalin Ristea","Radu Tudor Ionescu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2205.09180v3.pdf","comment":"Accepted at the International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2306.09359v2","updated":"2024-07-05T08:46:17Z","published":"2023-06-13T09:29:22Z","title":"Deep Learning-Based Spatiotemporal Multi-Event Reconstruction for Delay\n Line Detectors","summary":" Accurate observation of two or more particles within a very narrow time\nwindow has always been a challenge in modern physics. It creates the\npossibility of correlation experiments, such as the ground-breaking Hanbury\nBrown-Twiss experiment, leading to new physical insights. For low-energy\nelectrons, one possibility is to use a microchannel plate with subsequent delay\nlines for the readout of the incident particle hits, a setup called a Delay\nLine Detector. The spatial and temporal coordinates of more than one particle\ncan be fully reconstructed outside a region called the dead radius. For\ninteresting events, where two electrons are close in space and time, the\ndetermination of the individual positions of the electrons requires elaborate\npeak finding algorithms. 
While classical methods work well with single particle\nhits, they fail to identify and reconstruct events caused by multiple nearby\nparticles. To address this challenge, we present a new spatiotemporal machine\nlearning model to identify and reconstruct the position and time of such\nmulti-hit particle signals. This model achieves a much better resolution for\nnearby particle hits compared to the classical approach, removing some of the\nartifacts and reducing the dead radius by half. We show that machine learning\nmodels can be effective in improving the spatiotemporal performance of delay\nline detectors.\n","authors":["Marco Knipfer","Stefan Meier","Jonas Heimerl","Peter Hommelhoff","Sergei Gleyzer"],"pdf_url":"https://arxiv.org/pdf/2306.09359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04352v1","updated":"2024-07-05T08:46:16Z","published":"2024-07-05T08:46:16Z","title":"UpStory: the Uppsala Storytelling dataset","summary":" Friendship and rapport play an important role in the formation of\nconstructive social interactions, and have been widely studied in educational\nsettings due to their impact on student outcomes. Given the growing interest in\nautomating the analysis of such phenomena through Machine Learning (ML), access\nto annotated interaction datasets is highly valuable. However, no dataset on\ndyadic child-child interactions explicitly capturing rapport currently exists.\nMoreover, despite advances in the automatic analysis of human behaviour, no\nprevious work has addressed the prediction of rapport in child-child dyadic\ninteractions in educational settings. We present UpStory -- the Uppsala\nStorytelling dataset: a novel dataset of naturalistic dyadic interactions\nbetween primary school aged children, with an experimental manipulation of\nrapport. Pairs of children aged 8-10 participate in a task-oriented activity:\ndesigning a story together, while being allowed free movement within the play\narea. We promote balanced collection of different levels of rapport by using a\nwithin-subjects design: self-reported friendships are used to pair each child\ntwice, either minimizing or maximizing pair separation in the friendship\nnetwork. The dataset contains data for 35 pairs, totalling 3h 40m of audio and\nvideo recordings. It includes two video sources covering the play area, as well\nas separate voice recordings for each child. An anonymized version of the\ndataset is made publicly available, containing per-frame head pose, body pose,\nand face features; as well as per-pair information, including the level of\nrapport. Finally, we provide ML baselines for the prediction of rapport.\n","authors":["Marc Fraile","Natalia Calvo-Barajas","Anastasia Sophia Apeiron","Giovanna Varni","Joakim Lindblad","Nataša Sladoje","Ginevra Castellano"],"pdf_url":"https://arxiv.org/pdf/2407.04352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04343v1","updated":"2024-07-05T08:34:49Z","published":"2024-07-05T08:34:49Z","title":"Enhancing Safety for Autonomous Agents in Partly Concealed Urban Traffic\n Environments Through Representation-Based Shielding","summary":" Navigating unsignalized intersections in urban environments poses a complex\nchallenge for self-driving vehicles, where issues such as view obstructions,\nunpredictable pedestrian crossings, and diverse traffic participants demand a\ngreat focus on crash prevention. 
In this paper, we propose a novel state\nrepresentation for Reinforcement Learning (RL) agents centered around the\ninformation perceivable by an autonomous agent, enabling the safe navigation of\npreviously uncharted road maps. Our approach surpasses several baseline models\nby a sig nificant margin in terms of safety and energy consumption metrics.\nThese improvements are achieved while maintaining a competitive average travel\nspeed. Our findings pave the way for more robust and reliable autonomous\nnavigation strategies, promising safer and more efficient urban traffic\nenvironments.\n","authors":["Pierre Haritz","David Wanke","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2407.04343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12325v2","updated":"2024-07-05T08:28:25Z","published":"2023-08-11T10:49:05Z","title":"FUTURE-AI: International consensus guideline for trustworthy and\n deployable artificial intelligence in healthcare","summary":" Despite major advances in artificial intelligence (AI) for medicine and\nhealthcare, the deployment and adoption of AI technologies remain limited in\nreal-world clinical practice. In recent years, concerns have been raised about\nthe technical, clinical, ethical and legal risks associated with medical AI. To\nincrease real world adoption, it is essential that medical AI tools are trusted\nand accepted by patients, clinicians, health organisations and authorities.\nThis work describes the FUTURE-AI guideline as the first international\nconsensus framework for guiding the development and deployment of trustworthy\nAI tools in healthcare. The FUTURE-AI consortium was founded in 2021 and\ncurrently comprises 118 inter-disciplinary experts from 51 countries\nrepresenting all continents, including AI scientists, clinicians, ethicists,\nand social scientists. Over a two-year period, the consortium defined guiding\nprinciples and best practices for trustworthy AI through an iterative process\ncomprising an in-depth literature review, a modified Delphi survey, and online\nconsensus meetings. The FUTURE-AI framework was established based on 6 guiding\nprinciples for trustworthy AI in healthcare, i.e. Fairness, Universality,\nTraceability, Usability, Robustness and Explainability. Through consensus, a\nset of 28 best practices were defined, addressing technical, clinical, legal\nand socio-ethical dimensions. The recommendations cover the entire lifecycle of\nmedical AI, from design, development and validation to regulation, deployment,\nand monitoring. FUTURE-AI is a risk-informed, assumption-free guideline which\nprovides a structured approach for constructing medical AI tools that will be\ntrusted, deployed and adopted in real-world practice. 
Researchers are\nencouraged to take the recommendations into account in proof-of-concept stages\nto facilitate future translation towards clinical practice of medical AI.\n","authors":["Karim Lekadir","Aasa Feragen","Abdul Joseph Fofanah","Alejandro F Frangi","Alena Buyx","Anais Emelie","Andrea Lara","Antonio R Porras","An-Wen Chan","Arcadi Navarro","Ben Glocker","Benard O Botwe","Bishesh Khanal","Brigit Beger","Carol C Wu","Celia Cintas","Curtis P Langlotz","Daniel Rueckert","Deogratias Mzurikwao","Dimitrios I Fotiadis","Doszhan Zhussupov","Enzo Ferrante","Erik Meijering","Eva Weicken","Fabio A González","Folkert W Asselbergs","Fred Prior","Gabriel P Krestin","Gary Collins","Geletaw S Tegenaw","Georgios Kaissis","Gianluca Misuraca","Gianna Tsakou","Girish Dwivedi","Haridimos Kondylakis","Harsha Jayakody","Henry C Woodruf","Hugo JWL Aerts","Ian Walsh","Ioanna Chouvarda","Irène Buvat","Islem Rekik","James Duncan","Jayashree Kalpathy-Cramer","Jihad Zahir","Jinah Park","John Mongan","Judy W Gichoya","Julia A Schnabel","Kaisar Kushibar","Katrine Riklund","Kensaku Mori","Kostas Marias","Lameck M Amugongo","Lauren A Fromont","Lena Maier-Hein","Leonor Cerdá Alberich","Leticia Rittner","Lighton Phiri","Linda Marrakchi-Kacem","Lluís Donoso-Bach","Luis Martí-Bonmatí","M Jorge Cardoso","Maciej Bobowicz","Mahsa Shabani","Manolis Tsiknakis","Maria A Zuluaga","Maria Bielikova","Marie-Christine Fritzsche","Marius George Linguraru","Markus Wenzel","Marleen De Bruijne","Martin G Tolsgaard","Marzyeh Ghassemi","Md Ashrafuzzaman","Melanie Goisauf","Mohammad Yaqub","Mohammed Ammar","Mónica Cano Abadía","Mukhtar M E Mahmoud","Mustafa Elattar","Nicola Rieke","Nikolaos Papanikolaou","Noussair Lazrak","Oliver Díaz","Olivier Salvado","Oriol Pujol","Ousmane Sall","Pamela Guevara","Peter Gordebeke","Philippe Lambin","Pieta Brown","Purang Abolmaesumi","Qi Dou","Qinghua Lu","Richard Osuala","Rose Nakasi","S Kevin Zhou","Sandy Napel","Sara Colantonio","Shadi Albarqouni","Smriti Joshi","Stacy Carter","Stefan Klein","Steffen E Petersen","Susanna Aussó","Suyash Awate","Tammy Riklin Raviv","Tessa Cook","Tinashe E M Mutsvangwa","Wendy A Rogers","Wiro J Niessen","Xènia Puig-Bosch","Yi Zeng","Yunusa G Mohammed","Yves Saint James Aquino","Zohaib Salahuddin","Martijn P A Starmans"],"pdf_url":"https://arxiv.org/pdf/2309.12325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09926v2","updated":"2024-07-05T08:23:00Z","published":"2023-12-15T16:31:44Z","title":"FuXi-S2S: A machine learning model that outperforms conventional global\n subseasonal forecast models","summary":" Skillful subseasonal forecasts are crucial for various sectors of society but\npose a grand scientific challenge. Recently, machine learning based weather\nforecasting models outperform the most successful numerical weather predictions\ngenerated by the European Centre for Medium-Range Weather Forecasts (ECMWF),\nbut have not yet surpassed conventional models at subseasonal timescales. This\npaper introduces FuXi Subseasonal-to-Seasonal (FuXi-S2S), a machine learning\nmodel that provides global daily mean forecasts up to 42 days, encompassing\nfive upper-air atmospheric variables at 13 pressure levels and 11 surface\nvariables. FuXi-S2S, trained on 72 years of daily statistics from ECMWF ERA5\nreanalysis data, outperforms the ECMWF's state-of-the-art\nSubseasonal-to-Seasonal model in ensemble mean and ensemble forecasts for total\nprecipitation and outgoing longwave radiation, notably enhancing global\nprecipitation forecast. 
The improved performance of FuXi-S2S can be primarily\nattributed to its superior capability to capture forecast uncertainty and\naccurately predict the Madden-Julian Oscillation (MJO), extending the skillful\nMJO prediction from 30 days to 36 days. Moreover, FuXi-S2S not only captures\nrealistic teleconnections associated with the MJO, but also emerges as a\nvaluable tool for discovering precursor signals, offering researchers insights\nand potentially establishing a new paradigm in Earth system science research.\n","authors":["Lei Chen","Xiaohui Zhong","Hao Li","Jie Wu","Bo Lu","Deliang Chen","Shangping Xie","Qingchen Chao","Chensen Lin","Zixin Hu","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2312.09926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04335v1","updated":"2024-07-05T08:20:27Z","published":"2024-07-05T08:20:27Z","title":"Geometrically Inspired Kernel Machines for Collaborative Learning Beyond\n Gradient Descent","summary":" This paper develops a novel mathematical framework for collaborative learning\nby means of geometrically inspired kernel machines which includes statements on\nthe bounds of generalisation and approximation errors, and sample complexity.\nFor classification problems, this approach allows us to learn bounded geometric\nstructures around given data points and hence solve the global model learning\nproblem in an efficient way by exploiting convexity properties of the related\noptimisation problem in a Reproducing Kernel Hilbert Space (RKHS). In this way,\nwe can reduce classification problems to determining the closest bounded\ngeometric structure from a given data point. Further advantages that come with\nour solution is that our approach does not require clients to perform multiple\nepochs of local optimisation using stochastic gradient descent, nor require\nrounds of communication between client/server for optimising the global model.\nWe highlight that numerous experiments have shown that the proposed method is a\ncompetitive alternative to the state-of-the-art.\n","authors":["Mohit Kumar","Alexander Valentinitsch","Magdalena Fuchs","Mathias Brucker","Juliana Bowles","Adnan Husakovic","Ali Abbas","Bernhard A. Moser"],"pdf_url":"https://arxiv.org/pdf/2407.04335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04334v1","updated":"2024-07-05T08:19:36Z","published":"2024-07-05T08:19:36Z","title":"Learning Geometric Invariant Features for Classification of Vector\n Polygons with Graph Message-passing Neural Network","summary":" Geometric shape classification of vector polygons remains a non-trivial\nlearning task in spatial analysis. Previous studies mainly focus on devising\ndeep learning approaches for representation learning of rasterized vector\npolygons, whereas the study of discrete representations of polygons and\nsubsequent deep learning approaches have not been fully investigated. In this\nstudy, we investigate a graph representation of vector polygons and propose a\nnovel graph message-passing neural network (PolyMP) to learn the\ngeometric-invariant features for shape classification of polygons. Through\nextensive experiments, we show that the graph representation of polygons\ncombined with a permutation-invariant graph message-passing neural network\nachieves highly robust performances on benchmark datasets (i.e., synthetic\nglyph and real-world building footprint datasets) as compared to baseline\nmethods. 
We demonstrate that the proposed graph-based PolyMP network enables\nthe learning of expressive geometric features invariant to geometric\ntransformations of polygons (i.e., translation, rotation, scaling and shearing)\nand is robust to trivial vertex removals of polygons. We further show the\nstrong generalizability of PolyMP, which enables generalizing the learned\ngeometric features from the synthetic glyph polygons to the real-world building\nfootprints.\n","authors":["Zexian Huang","Kourosh Khoshelham","Martin Tomko"],"pdf_url":"https://arxiv.org/pdf/2407.04334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09631v6","updated":"2024-07-05T08:14:29Z","published":"2024-02-15T00:20:30Z","title":"Representation Surgery: Theory and Practice of Affine Steering","summary":" Language models often exhibit undesirable behavior, e.g., generating toxic or\ngender-biased text. In the case of neural language models, an encoding of the\nundesirable behavior is often present in the model's representations. Thus, one\nnatural (and common) approach to prevent the model from exhibiting undesirable\nbehavior is to steer the model's representations in a manner that reduces the\nprobability of it generating undesirable text. This paper investigates the\nformal and empirical properties of steering functions, i.e., transformation of\nthe neural language model's representations that alter its behavior. First, we\nderive two optimal, in the least-squares sense, affine steering functions under\ndifferent constraints. Our theory provides justification for existing\napproaches and offers a novel, improved steering approach. Second, we offer a\nseries of experiments that demonstrate the empirical effectiveness of the\nmethods in mitigating bias and reducing toxic generation.\n","authors":["Shashwat Singh","Shauli Ravfogel","Jonathan Herzig","Roee Aharoni","Ryan Cotterell","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2402.09631v6.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2405.18580v2","updated":"2024-07-05T08:02:59Z","published":"2024-05-28T20:54:41Z","title":"Artificial Intelligence in Industry 4.0: A Review of Integration\n Challenges for Industrial Systems","summary":" In Industry 4.0, Cyber-Physical Systems (CPS) generate vast data sets that\ncan be leveraged by Artificial Intelligence (AI) for applications including\npredictive maintenance and production planning. However, despite the\ndemonstrated potential of AI, its widespread adoption in sectors like\nmanufacturing remains limited. Our comprehensive review of recent literature,\nincluding standards and reports, pinpoints key challenges: system integration,\ndata-related issues, managing workforce-related concerns and ensuring\ntrustworthy AI. A quantitative analysis highlights particular challenges and\ntopics that are important for practitioners but still need to be sufficiently\ninvestigated by academics. The paper briefly discusses existing solutions to\nthese challenges and proposes avenues for future research. 
We hope that this\nsurvey serves as a resource for practitioners evaluating the cost-benefit\nimplications of AI in CPS and for researchers aiming to address these urgent\nchallenges.\n","authors":["Alexander Windmann","Philipp Wittenberg","Marvin Schieseck","Oliver Niggemann"],"pdf_url":"https://arxiv.org/pdf/2405.18580v2.pdf","comment":"17 pages, 4 figures, 1 table, accepted for the 22nd IEEE\n International Conference on Industrial Informatics (INDIN)"},{"id":"http://arxiv.org/abs/2407.04328v1","updated":"2024-07-05T08:01:19Z","published":"2024-07-05T08:01:19Z","title":"EAGERx: Graph-Based Framework for Sim2real Robot Learning","summary":" Sim2real, that is, the transfer of learned control policies from simulation\nto real world, is an area of growing interest in robotics due to its potential\nto efficiently handle complex tasks. The sim2real approach faces challenges due\nto mismatches between simulation and reality. These discrepancies arise from\ninaccuracies in modeling physical phenomena and asynchronous control, among\nother factors. To this end, we introduce EAGERx, a framework with a unified\nsoftware pipeline for both real and simulated robot learning. It can support\nvarious simulators and aids in integrating state, action and time-scale\nabstractions to facilitate learning. EAGERx's integrated delay simulation,\ndomain randomization features, and proposed synchronization algorithm\ncontribute to narrowing the sim2real gap. We demonstrate (in the context of\nrobot learning and beyond) the efficacy of EAGERx in accommodating diverse\nrobotic systems and maintaining consistent simulation behavior. EAGERx is open\nsource and its code is available at https://eagerx.readthedocs.io.\n","authors":["Bas van der Heijden","Jelle Luijkx","Laura Ferranti","Jens Kober","Robert Babuska"],"pdf_url":"https://arxiv.org/pdf/2407.04328v1.pdf","comment":"For an introductory video, see\n http://www.youtube.com/watch?v=D0CQNnTT010 . The documentation, tutorials,\n and our open-source code can be found at http://eagerx.readthedocs.io"},{"id":"http://arxiv.org/abs/2406.18397v2","updated":"2024-07-05T07:59:57Z","published":"2024-06-26T14:44:24Z","title":"Second Maximum of a Gaussian Random Field and Exact (t-)Spacing test","summary":" In this article, we introduce the novel concept of the second maximum of a\nGaussian random field on a Riemannian submanifold. This second maximum serves\nas a powerful tool for characterizing the distribution of the maximum. By\nutilizing an ad-hoc Kac Rice formula, we derive the explicit form of the\nmaximum's distribution, conditioned on the second maximum and some regressed\ncomponent of the Riemannian Hessian. This approach results in an exact test,\nbased on the evaluation of spacing between these maxima, which we refer to as\nthe spacing test.\n We investigate the applicability of this test in detecting sparse\nalternatives within Gaussian symmetric tensors, continuous sparse\ndeconvolution, and two-layered neural networks with smooth rectifiers. Our\ntheoretical results are supported by numerical experiments, which illustrate\nthe calibration and power of the proposed tests. 
More generally, this test can\nbe applied to any Gaussian random field on a Riemannian manifold, and we\nprovide a general framework for the application of the spacing test in\ncontinuous sparse kernel regression.\n Furthermore, when the variance-covariance function of the Gaussian random\nfield is known up to a scaling factor, we derive an exact Studentized version\nof our test, coined the $t$-spacing test. This test is perfectly calibrated\nunder the null hypothesis and has high power for detecting sparse alternatives.\n","authors":["Jean-Marc Azaïs","Federico Dalmao","Yohann De Castro"],"pdf_url":"https://arxiv.org/pdf/2406.18397v2.pdf","comment":"5 figures, 22 pages main document, 2 pages supplements"},{"id":"http://arxiv.org/abs/2309.01213v3","updated":"2024-07-05T07:59:14Z","published":"2023-09-03T16:35:59Z","title":"Implicit regularization of deep residual networks towards neural ODEs","summary":" Residual neural networks are state-of-the-art deep learning models. Their\ncontinuous-depth analog, neural ordinary differential equations (ODEs), are\nalso widely used. Despite their success, the link between the discrete and\ncontinuous models still lacks a solid mathematical foundation. In this article,\nwe take a step in this direction by establishing an implicit regularization of\ndeep residual networks towards neural ODEs, for nonlinear networks trained with\ngradient flow. We prove that if the network is initialized as a discretization\nof a neural ODE, then such a discretization holds throughout training. Our\nresults are valid for a finite training time, and also as the training time\ntends to infinity provided that the network satisfies a Polyak-Lojasiewicz\ncondition. Importantly, this condition holds for a family of residual networks\nwhere the residuals are two-layer perceptrons with an overparameterization in\nwidth that is only linear, and implies the convergence of gradient flow to a\nglobal minimum. Numerical experiments illustrate our results.\n","authors":["Pierre Marion","Yu-Han Wu","Michael E. Sander","Gérard Biau"],"pdf_url":"https://arxiv.org/pdf/2309.01213v3.pdf","comment":"ICLR 2024 (spotlight). 40 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.04325v1","updated":"2024-07-05T07:53:52Z","published":"2024-07-05T07:53:52Z","title":"Understanding the Role of Invariance in Transfer Learning","summary":" Transfer learning is a powerful technique for knowledge-sharing between\ndifferent tasks. Recent work has found that the representations of models with\ncertain invariances, such as to adversarial input perturbations, achieve higher\nperformance on downstream tasks. These findings suggest that invariance may be\nan important property in the context of transfer learning. However, the\nrelationship of invariance with transfer performance is not fully understood\nyet and a number of questions remain. For instance, how important is invariance\ncompared to other factors of the pretraining task? How transferable is learned\ninvariance? In this work, we systematically investigate the importance of\nrepresentational invariance for transfer learning, as well as how it interacts\nwith other parameters during pretraining. To do so, we introduce a family of\nsynthetic datasets that allow us to precisely control factors of variation both\nin training and test data. 
Using these datasets, we a) show that for learning\nrepresentations with high transfer performance, invariance to the right\ntransformations is as, or often more, important than most other factors such as\nthe number of training samples, the model architecture and the identity of the\npretraining classes, b) show conditions under which invariance can harm the\nability to transfer representations and c) explore how transferable invariance\nis between tasks. The code is available at\n\\url{https://github.com/tillspeicher/representation-invariance-transfer}.\n","authors":["Till Speicher","Vedant Nanda","Krishna P. Gummadi"],"pdf_url":"https://arxiv.org/pdf/2407.04325v1.pdf","comment":"Published at TMLR 2024"},{"id":"http://arxiv.org/abs/2402.10609v2","updated":"2024-07-05T07:49:01Z","published":"2024-02-16T11:54:34Z","title":"MRPD: Undersampled MRI reconstruction by prompting a large latent\n diffusion model","summary":" Implicit visual knowledge in a large latent diffusion model (LLDM)\npre-trained on natural images is rich and hypothetically universal to natural\nand medical images. To test this hypothesis from a practical perspective, we\npropose a novel framework for undersampled MRI Reconstruction by Prompting a\nlarge latent Diffusion model (MRPD). While the existing methods trained on MRI\ndatasets are typically of limited generalizability toward diverse data\nacquisition scenarios, MRPD supports unsupervised and universally adaptive MRI\nreconstruction. For unsupervised reconstruction, MRSampler guides LLDM with a\nrandom-phase-modulated hard-to-soft control. With any single- or\nmultiple-source MRI dataset, MRPD's performance is boosted universally by a\nlightweight MRAdapter that only finetunes the LLDM's autoencoder. Experiments\non FastMRI and IXI show that MRPD is the only model that supports both MRI\ndatabase-free and database-available scenarios and attains the best\ngeneralizability towards out-of-domain (OOD) samplings, contrasts, and organs\namong compared unsupervised, supervised, and MRI diffusion methods. To our\nknowledge, MRPD is the first method that empirically shows the universal\nprowess of an LLDM pre-trained on vast natural images for MRI. Our official\nimplementation is at https://github.com/Z7Gao/MRPD.\n","authors":["Ziqi Gao","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.10609v2.pdf","comment":"10 pages, 5 figures, 7 tables, 1 pseudocode"},{"id":"http://arxiv.org/abs/2405.18805v2","updated":"2024-07-05T07:45:16Z","published":"2024-05-29T06:47:45Z","title":"Semiring Activation in Neural Networks","summary":" We introduce a class of trainable nonlinear operators based on semirings that\nare suitable for use in neural networks. These operators generalize the\ntraditional alternation of linear operators with activation functions in neural\nnetworks. Semirings are algebraic structures that describe a generalised\nnotation of linearity, greatly expanding the range of trainable operators that\ncan be included in neural networks. In fact, max- or min-pooling operations are\nconvolutions in the tropical semiring with a fixed kernel.\n We perform experiments where we replace the activation functions for\ntrainable semiring-based operators to show that these are viable operations to\ninclude in fully connected as well as convolutional neural networks (ConvNeXt).\nWe discuss some of the challenges of replacing traditional activation functions\nwith trainable semiring activations and the trade-offs of doing so.\n","authors":["Bart M. N. Smets","Peter D. 
Donker","Jim W. Portegies","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2405.18805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05551v4","updated":"2024-07-05T07:42:40Z","published":"2023-07-09T09:08:38Z","title":"Graph Neural Networks as an Enabler of Terahertz-based Flow-guided\n Nanoscale Localization over Highly Erroneous Raw Data","summary":" Contemporary research advances in nanotechnology and material science are\nrooted in the emergence of nanodevices as a versatile tool that harmonizes\nsensing, computing, wireless communication, data storage, and energy\nharvesting. These devices offer novel pathways for disease diagnostics,\ntreatment, and monitoring within the bloodstreams. Ensuring precise\nlocalization of events of diagnostic interest, which underpins the concept of\nflow-guided in-body nanoscale localization, would provide an added diagnostic\nvalue to the detected events. Raw data generated by the nanodevices is pivotal\nfor this localization and consist of an event detection indicator and the time\nelapsed since the last passage of a nanodevice through the heart. The energy\nconstraints of the nanodevices lead to intermittent operation and unreliable\ncommunication, intrinsically affecting this data. This posits a need for\ncomprehensively modelling the features of this data. These imperfections also\nhave profound implications for the viability of existing flow-guided\nlocalization approaches, which are ill-prepared to address the intricacies of\nthe environment. Our first contribution lies in an analytical model of raw data\nfor flow-guided localization, dissecting how communication and energy\ncapabilities influence the nanodevices' data output. This model acts as a vital\nbridge, reconciling idealized assumptions with practical challenges of\nflow-guided localization. Toward addressing these practical challenges, we also\npresent an integration of Graph Neural Networks (GNNs) into the flow-guided\nlocalization paradigm. GNNs excel in capturing complex dynamic interactions\ninherent to the localization of events sensed by the nanodevices. Our results\nhighlight the potential of GNNs not only to enhance localization accuracy but\nalso extend coverage to encompass the entire bloodstream.\n","authors":["Gerard Calvo Bartra","Filip Lemic","Guillem Pascual","Aina Pérez Rodas","Jakob Struye","Carmen Delgado","Xavier Costa Pérez"],"pdf_url":"https://arxiv.org/pdf/2307.05551v4.pdf","comment":"16 pages, 16 figures, 6 tables, 45 references"},{"id":"http://arxiv.org/abs/2406.14191v2","updated":"2024-07-05T07:38:02Z","published":"2024-06-20T10:51:06Z","title":"Temporal Knowledge Graph Question Answering: A Survey","summary":" Knowledge Base Question Answering (KBQA) has been a long-standing field to\nanswer questions based on knowledge bases. Recently, the evolving dynamics of\nknowledge have attracted a growing interest in Temporal Knowledge Graph\nQuestion Answering (TKGQA), an emerging task to answer temporal questions.\nHowever, this field grapples with ambiguities in defining temporal questions\nand lacks a systematic categorization of existing methods for TKGQA. In\nresponse, this paper provides a thorough survey from two perspectives: the\ntaxonomy of temporal questions and the methodological categorization for TKGQA.\nSpecifically, we first establish a detailed taxonomy of temporal questions\nengaged in prior studies. Subsequently, we provide a comprehensive review of\nTKGQA techniques of two categories: semantic parsing-based and TKG\nembedding-based. 
Building on this review, the paper outlines potential research\ndirections aimed at advancing the field of TKGQA. This work aims to serve as a\ncomprehensive reference for TKGQA and to stimulate further research.\n","authors":["Miao Su","Zixuan Li","Zhuo Chen","Long Bai","Xiaolong Jin","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2406.14191v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.16520v2","updated":"2024-07-05T07:32:01Z","published":"2024-01-29T19:50:50Z","title":"MT-HCCAR: Multi-Task Deep Learning with Hierarchical Classification and\n Attention-based Regression for Cloud Property Retrieval","summary":" In the realm of Earth science, effective cloud property retrieval,\nencompassing cloud masking, cloud phase classification, and cloud optical\nthickness (COT) prediction, remains pivotal. Traditional methodologies\nnecessitate distinct models for each sensor instrument due to their unique\nspectral characteristics. Recent strides in Earth Science research have\nembraced machine learning and deep learning techniques to extract features from\nsatellite datasets' spectral observations. However, prevailing approaches lack\nnovel architectures accounting for hierarchical relationships among retrieval\ntasks. Moreover, considering the spectral diversity among existing sensors, the\ndevelopment of models with robust generalization capabilities over different\nsensor datasets is imperative. Surprisingly, there is a dearth of methodologies\naddressing the selection of an optimal model for diverse datasets. In response,\nthis paper introduces MT-HCCAR, an end-to-end deep learning model employing\nmulti-task learning to simultaneously tackle cloud masking, cloud phase\nretrieval (classification tasks), and COT prediction (a regression task). The\nMT-HCCAR integrates a hierarchical classification network (HC) and a\nclassification-assisted attention-based regression network (CAR), enhancing\nprecision and robustness in cloud labeling and COT prediction. Additionally, a\ncomprehensive model selection method rooted in K-fold cross-validation, one\nstandard error rule, and two introduced performance scores is proposed to\nselect the optimal model over three simulated satellite datasets OCI, VIIRS,\nand ABI. The experiments comparing MT-HCCAR with baseline methods, the ablation\nstudies, and the model selection affirm the superiority and the generalization\ncapabilities of MT-HCCAR.\n","authors":["Xingyan Li","Andrew M. Sayer","Ian T. Carroll","Xin Huang","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2401.16520v2.pdf","comment":"14 pages, 3 figures, accepted by ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2407.04308v1","updated":"2024-07-05T07:23:51Z","published":"2024-07-05T07:23:51Z","title":"SSP-GNN: Learning to Track via Bilevel Optimization","summary":" We propose a graph-based tracking formulation for multi-object tracking (MOT)\nwhere target detections contain kinematic information and re-identification\nfeatures (attributes). Our method applies a successive shortest paths (SSP)\nalgorithm to a tracking graph defined over a batch of frames. The edge costs in\nthis tracking graph are computed via a message-passing network, a graph neural\nnetwork (GNN) variant. The parameters of the GNN, and hence, the tracker, are\nlearned end-to-end on a training set of example ground-truth tracks and\ndetections. Specifically, learning takes the form of bilevel optimization\nguided by our novel loss function. 
We evaluate our algorithm on simulated\nscenarios to understand its sensitivity to scenario aspects and model\nhyperparameters. Across varied scenario complexities, our method compares\nfavorably to a strong baseline.\n","authors":["Griffin Golias","Masa Nakura-Fan","Vitaly Ablavsky"],"pdf_url":"https://arxiv.org/pdf/2407.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04307v1","updated":"2024-07-05T07:22:44Z","published":"2024-07-05T07:22:44Z","title":"Crafting Large Language Models for Enhanced Interpretability","summary":" We introduce the Concept Bottleneck Large Language Model (CB-LLM), a\npioneering approach to creating inherently interpretable Large Language Models\n(LLMs). Unlike traditional black-box LLMs that rely on post-hoc interpretation\nmethods with limited neuron function insights, CB-LLM sets a new standard with\nits built-in interpretability, scalability, and ability to provide clear,\naccurate explanations. This innovation not only advances transparency in\nlanguage models but also enhances their effectiveness. Our unique Automatic\nConcept Correction (ACC) strategy successfully narrows the performance gap with\nconventional black-box LLMs, positioning CB-LLM as a model that combines the\nhigh accuracy of traditional LLMs with the added benefit of clear\ninterpretability -- a feature markedly absent in existing LLMs.\n","authors":["Chung-En Sun","Tuomas Oikarinen","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2407.04307v1.pdf","comment":"Present at ICML 2024 Mechanistic Interpretability (MI) Workshop"},{"id":"http://arxiv.org/abs/2407.04302v1","updated":"2024-07-05T07:10:26Z","published":"2024-07-05T07:10:26Z","title":"Fair Federated Data Clustering through Personalization: Bridging the Gap\n between Diverse Data Distributions","summary":" The rapid growth of data from edge devices has catalyzed the performance of\nmachine learning algorithms. However, the data generated resides at client\ndevices thus there are majorly two challenge faced by traditional machine\nlearning paradigms - centralization of data for training and secondly for most\nthe generated data the class labels are missing and there is very poor\nincentives to clients to manually label their data owing to high cost and lack\nof expertise. To overcome these issues, there have been initial attempts to\nhandle unlabelled data in a privacy preserving distributed manner using\nunsupervised federated data clustering. The goal is partition the data\navailable on clients into $k$ partitions (called clusters) without actual\nexchange of data. Most of the existing algorithms are highly dependent on data\ndistribution patterns across clients or are computationally expensive.\nFurthermore, due to presence of skewed nature of data across clients in most of\npractical scenarios existing models might result in clients suffering high\nclustering cost making them reluctant to participate in federated process. To\nthis, we are first to introduce the idea of personalization in federated\nclustering. The goal is achieve balance between achieving lower clustering cost\nand at same time achieving uniform cost across clients. We propose p-FClus that\naddresses these goal in a single round of communication between server and\nclients. 
We validate the efficacy of p-FClus against variety of federated\ndatasets showcasing it's data independence nature, applicability to any finite\n$\\ell$-norm, while simultaneously achieving lower cost and variance.\n","authors":["Shivam Gupta"," Tarushi","Tsering Wangzes","Shweta Jain"],"pdf_url":"https://arxiv.org/pdf/2407.04302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12694v5","updated":"2024-07-05T07:04:25Z","published":"2024-02-20T03:45:59Z","title":"Revitalizing Multivariate Time Series Forecasting: Learnable\n Decomposition with Inter-Series Dependencies and Intra-Series Variations\n Modeling","summary":" Predicting multivariate time series is crucial, demanding precise modeling of\nintricate patterns, including inter-series dependencies and intra-series\nvariations. Distinctive trend characteristics in each time series pose\nchallenges, and existing methods, relying on basic moving average kernels, may\nstruggle with the non-linear structure and complex trends in real-world data.\nGiven that, we introduce a learnable decomposition strategy to capture dynamic\ntrend information more reasonably. Additionally, we propose a dual attention\nmodule tailored to capture inter-series dependencies and intra-series\nvariations simultaneously for better time series forecasting, which is\nimplemented by channel-wise self-attention and autoregressive self-attention.\nTo evaluate the effectiveness of our method, we conducted experiments across\neight open-source datasets and compared it with the state-of-the-art methods.\nThrough the comparison results, our Leddam (LEarnable Decomposition and Dual\nAttention Module) not only demonstrates significant advancements in predictive\nperformance, but also the proposed decomposition strategy can be plugged into\nother methods with a large performance-boosting, from 11.87% to 48.56% MSE\nerror degradation.\n","authors":["Guoqi Yu","Jing Zou","Xiaowei Hu","Angelica I. Aviles-Rivero","Jing Qin","Shujun Wang"],"pdf_url":"https://arxiv.org/pdf/2402.12694v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04295v1","updated":"2024-07-05T06:57:30Z","published":"2024-07-05T06:57:30Z","title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","summary":" Large Language Models (LLMs) have performed exceptionally in various\ntext-generative tasks, including question answering, translation, code\ncompletion, etc. However, the over-assistance of LLMs has raised the challenge\nof \"jailbreaking\", which induces the model to generate malicious responses\nagainst the usage policy and society by designing adversarial prompts. With the\nemergence of jailbreak attack methods exploiting different vulnerabilities in\nLLMs, the corresponding safety alignment measures are also evolving. In this\npaper, we propose a comprehensive and detailed taxonomy of jailbreak attack and\ndefense methods. For instance, the attack methods are divided into black-box\nand white-box attacks based on the transparency of the target model. Meanwhile,\nwe classify defense methods into prompt-level and model-level defenses.\nAdditionally, we further subdivide these attack and defense methods into\ndistinct sub-classes and present a coherent diagram illustrating their\nrelationships. We also conduct an investigation into the current evaluation\nmethods and compare them from different perspectives. Our findings aim to\ninspire future research and practical implementations in safeguarding LLMs\nagainst adversarial attacks. 
Above all, although jailbreak remains a\nsignificant concern within the community, we believe that our work enhances the\nunderstanding of this domain and provides a foundation for developing more\nsecure LLMs.\n","authors":["Sibo Yi","Yule Liu","Zhen Sun","Tianshuo Cong","Xinlei He","Jiaxing Song","Ke Xu","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04291v1","updated":"2024-07-05T06:54:24Z","published":"2024-07-05T06:54:24Z","title":"We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker\n Embeddings","summary":" In speech synthesis, modeling of rich emotions and prosodic variations\npresent in human voice are crucial to synthesize natural speech. Although\nspeaker embeddings have been widely used in personalized speech synthesis as\nconditioning inputs, they are designed to lose variation to optimize speaker\nrecognition accuracy. Thus, they are suboptimal for speech synthesis in terms\nof modeling the rich variations at the output speech distribution. In this\nwork, we propose a novel speaker embedding network which utilizes multiple\nclass centers in the speaker classification training rather than a single class\ncenter as traditional embeddings. The proposed approach introduces variations\nin the speaker embedding while retaining the speaker recognition performance\nsince model does not have to map all of the utterances of a speaker into a\nsingle class center. We apply our proposed embedding in voice conversion task\nand show that our method provides better naturalness and prosody in synthesized\nspeech.\n","authors":["Ismail Rasim Ulgen","Carlos Busso","John H. L. Hansen","Berrak Sisman"],"pdf_url":"https://arxiv.org/pdf/2407.04291v1.pdf","comment":"Submitted to IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2405.05925v2","updated":"2024-07-05T06:48:50Z","published":"2024-05-09T17:15:09Z","title":"FuXi-ENS: A machine learning model for medium-range ensemble weather\n forecasting","summary":" Ensemble forecasting is crucial for improving weather predictions, especially\nfor forecasts of extreme events. Constructing an ensemble prediction system\n(EPS) based on conventional NWP models is highly computationally expensive. ML\nmodels have emerged as valuable tools for deterministic weather forecasts,\nproviding forecasts with significantly reduced computational requirements and\neven surpassing the forecast performance of traditional NWP models. However,\nchallenges arise when applying ML models to ensemble forecasting. Recent ML\nmodels, such as GenCast and SEEDS model, rely on the ERA5 EDA or operational\nNWP ensemble members for forecast generation. Their spatial resolution is also\nconsidered too coarse for many applications. To overcome these limitations, we\nintroduce FuXi-ENS, an advanced ML model designed to deliver 6-hourly global\nensemble weather forecasts up to 15 days. This model runs at a significantly\nincreased spatial resolution of 0.25\\textdegree, incorporating 5 atmospheric\nvariables at 13 pressure levels, along with 13 surface variables. By leveraging\nthe inherent probabilistic nature of Variational AutoEncoder (VAE), FuXi-ENS\noptimizes a loss function that combines the CRPS and the KL divergence between\nthe predicted and target distribution, facilitating the incorporation of\nflow-dependent perturbations in both initial conditions and forecast. 
This\ninnovative approach makes FuXi-ENS an advancement over the traditional ones\nthat use L1 loss combined with the KL loss in standard VAE models for ensemble\nweather forecasting. Results demonstrate that FuXi-ENS outperforms ensemble\nforecasts from the ECMWF, a world leading NWP model, in the CRPS of 98.1% of\n360 variable and forecast lead time combinations. This achievement underscores\nthe potential of the FuXi-ENS model to enhance ensemble weather forecasts,\noffering a promising direction for further development in this field.\n","authors":["Xiaohui Zhong","Lei Chen","Hao Li","Jun Liu","Xu Fan","Jie Feng","Kan Dai","Jing-Jia Luo","Jie Wu","Yuan Qi","Bo Lu"],"pdf_url":"https://arxiv.org/pdf/2405.05925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08311v2","updated":"2024-07-05T06:44:33Z","published":"2024-06-12T15:12:49Z","title":"Causality for Tabular Data Synthesis: A High-Order Structure Causal\n Benchmark Framework","summary":" Tabular synthesis models remain ineffective at capturing complex\ndependencies, and the quality of synthetic data is still insufficient for\ncomprehensive downstream tasks, such as prediction under distribution shifts,\nautomated decision-making, and cross-table understanding. A major challenge is\nthe lack of prior knowledge about underlying structures and high-order\nrelationships in tabular data. We argue that a systematic evaluation on\nhigh-order structural information for tabular data synthesis is the first step\ntowards solving the problem. In this paper, we introduce high-order structural\ncausal information as natural prior knowledge and provide a benchmark framework\nfor the evaluation of tabular synthesis models. The framework allows us to\ngenerate benchmark datasets with a flexible range of data generation processes\nand to train tabular synthesis models using these datasets for further\nevaluation. We propose multiple benchmark tasks, high-order metrics, and causal\ninference tasks as downstream tasks for evaluating the quality of synthetic\ndata generated by the trained models. Our experiments demonstrate to leverage\nthe benchmark framework for evaluating the model capability of capturing\nhigh-order structural causal information. Furthermore, our benchmarking results\nprovide an initial assessment of state-of-the-art tabular synthesis models.\nThey have clearly revealed significant gaps between ideal and actual\nperformance and how baseline methods differ. Our benchmark framework is\navailable at URL https://github.com/TURuibo/CauTabBench.\n","authors":["Ruibo Tu","Zineb Senane","Lele Cao","Cheng Zhang","Hedvig Kjellström","Gustav Eje Henter"],"pdf_url":"https://arxiv.org/pdf/2406.08311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04285v1","updated":"2024-07-05T06:34:32Z","published":"2024-07-05T06:34:32Z","title":"Robust Decision Transformer: Tackling Data Corruption in Offline RL via\n Sequence Modeling","summary":" Learning policies from offline datasets through offline reinforcement\nlearning (RL) holds promise for scaling data-driven decision-making and\navoiding unsafe and costly online interactions. However, real-world data\ncollected from sensors or humans often contains noise and errors, posing a\nsignificant challenge for existing offline RL methods. Our study indicates that\ntraditional offline RL methods based on temporal difference learning tend to\nunderperform Decision Transformer (DT) under data corruption, especially when\nthe amount of data is limited. 
This suggests the potential of sequential\nmodeling for tackling data corruption in offline RL. To further unleash the\npotential of sequence modeling methods, we propose Robust Decision Transformer\n(RDT) by incorporating several robust techniques. Specifically, we introduce\nGaussian weighted learning and iterative data correction to reduce the effect\nof corrupted data. Additionally, we leverage embedding dropout to enhance the\nmodel's resistance to erroneous inputs. Extensive experiments on MoJoCo,\nKitChen, and Adroit tasks demonstrate RDT's superior performance under diverse\ndata corruption compared to previous methods. Moreover, RDT exhibits remarkable\nrobustness in a challenging setting that combines training-time data corruption\nwith testing-time observation perturbations. These results highlight the\npotential of robust sequence modeling for learning from noisy or corrupted\noffline datasets, thereby promoting the reliable application of offline RL in\nreal-world tasks.\n","authors":["Jiawei Xu","Rui Yang","Feng Luo","Meng Fang","Baoxiang Wang","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2407.04285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12359v3","updated":"2024-07-05T06:26:45Z","published":"2023-11-21T05:27:16Z","title":"Shedding the Bits: Pushing the Boundaries of Quantization with\n Minifloats on FPGAs","summary":" Post-training quantization (PTQ) is a powerful technique for model\ncompression, reducing the numerical precision in neural networks without\nadditional training overhead. Recent works have investigated adopting 8-bit\nfloating-point formats(FP8) in the context of PTQ for model inference. However,\nfloating-point formats smaller than 8 bits and their relative comparison in\nterms of accuracy-hardware cost with integers remains unexplored on FPGAs. In\nthis work, we present minifloats, which are reduced-precision floating-point\nformats capable of further reducing the memory footprint, latency, and energy\ncost of a model while approaching full-precision model accuracy. We implement a\ncustom FPGA-based multiply-accumulate operator library and explore the vast\ndesign space, comparing minifloat and integer representations across 3 to 8\nbits for both weights and activations. We also examine the applicability of\nvarious integerbased quantization techniques to minifloats. Our experiments\nshow that minifloats offer a promising alternative for emerging workloads such\nas vision transformers.\n","authors":["Shivam Aggarwal","Hans Jakob Damsgaard","Alessandro Pappalardo","Giuseppe Franco","Thomas B. Preußer","Michaela Blott","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.12359v3.pdf","comment":"Accepted in FPL (International Conference on Field-Programmable Logic\n and Applications) 2024 conference. Revised with updated results"},{"id":"http://arxiv.org/abs/2407.04279v1","updated":"2024-07-05T06:25:34Z","published":"2024-07-05T06:25:34Z","title":"BiosERC: Integrating Biography Speakers Supported by LLMs for ERC Tasks","summary":" In the Emotion Recognition in Conversation task, recent investigations have\nutilized attention mechanisms exploring relationships among utterances from\nintra- and inter-speakers for modeling emotional interaction between them.\nHowever, attributes such as speaker personality traits remain unexplored and\npresent challenges in terms of their applicability to other tasks or\ncompatibility with diverse model architectures. 
Therefore, this work introduces\na novel framework named BiosERC, which investigates speaker characteristics in\na conversation. By employing Large Language Models (LLMs), we extract the\n\"biographical information\" of the speaker within a conversation as\nsupplementary knowledge injected into the model to classify emotional labels\nfor each utterance. Our proposed method achieved state-of-the-art (SOTA)\nresults on three famous benchmark datasets: IEMOCAP, MELD, and EmoryNLP,\ndemonstrating the effectiveness and generalization of our model and showcasing\nits potential for adaptation to various conversation analysis tasks. Our source\ncode is available at https://github.com/yingjie7/BiosERC.\n","authors":["Jieying Xue","Minh Phuong Nguyen","Blake Matheny","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.04279v1.pdf","comment":"Accepted in the 33rd International Conference on Artificial Neural\n Networks (ICANN 2024)"},{"id":"http://arxiv.org/abs/2402.04894v2","updated":"2024-07-05T06:07:43Z","published":"2024-02-07T14:24:41Z","title":"Deep Reinforcement Learning with Dynamic Graphs for Adaptive Informative\n Path Planning","summary":" Autonomous robots are often employed for data collection due to their\nefficiency and low labour costs. A key task in robotic data acquisition is\nplanning paths through an initially unknown environment to collect observations\ngiven platform-specific resource constraints, such as limited battery life.\nAdaptive online path planning in 3D environments is challenging due to the\nlarge set of valid actions and the presence of unknown occlusions. To address\nthese issues, we propose a novel deep reinforcement learning approach for\nadaptively replanning robot paths to map targets of interest in unknown 3D\nenvironments. A key aspect of our approach is a dynamically constructed graph\nthat restricts planning actions local to the robot, allowing us to react to\nnewly discovered static obstacles and targets of interest. For replanning, we\npropose a new reward function that balances between exploring the unknown\nenvironment and exploiting online-discovered targets of interest. Our\nexperiments show that our method enables more efficient target discovery\ncompared to state-of-the-art learning and non-learning baselines. We also\nshowcase our approach for orchard monitoring using an unmanned aerial vehicle\nin a photorealistic simulator. We open-source our code and model at:\nhttps://github.com/dmar-bonn/ipp-rl-3d.\n","authors":["Apoorva Vashisth","Julius Rückin","Federico Magistri","Cyrill Stachniss","Marija Popović"],"pdf_url":"https://arxiv.org/pdf/2402.04894v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.04272v1","updated":"2024-07-05T05:55:18Z","published":"2024-07-05T05:55:18Z","title":"Accelerating Communication in Deep Learning Recommendation Model\n Training with Dual-Level Adaptive Lossy Compression","summary":" DLRM is a state-of-the-art recommendation system model that has gained\nwidespread adoption across various industry applications. The large size of\nDLRM models, however, necessitates the use of multiple devices/GPUs for\nefficient training. A significant bottleneck in this process is the\ntime-consuming all-to-all communication required to collect embedding data from\nall devices. To mitigate this, we introduce a method that employs error-bounded\nlossy compression to reduce the communication data size and accelerate DLRM\ntraining. 
We develop a novel error-bounded lossy compression algorithm,\ninformed by an in-depth analysis of embedding data features, to achieve high\ncompression ratios. Moreover, we introduce a dual-level adaptive strategy for\nerror-bound adjustment, spanning both table-wise and iteration-wise aspects, to\nbalance the compression benefits with the potential impacts on accuracy. We\nfurther optimize our compressor for PyTorch tensors on GPUs, minimizing\ncompression overhead. Evaluation shows that our method achieves a 1.38$\\times$\ntraining speedup with a minimal accuracy impact.\n","authors":["Hao Feng","Boyuan Zhang","Fanjiang Ye","Min Si","Ching-Hsiang Chu","Jiannan Tian","Chunxing Yin"," Zhaoxia"," Deng","Yuchen Hao","Pavan Balaji","Tong Geng","Dingwen Tao"],"pdf_url":"https://arxiv.org/pdf/2407.04272v1.pdf","comment":"accepted by SC '24"},{"id":"http://arxiv.org/abs/2407.04271v1","updated":"2024-07-05T05:52:51Z","published":"2024-07-05T05:52:51Z","title":"Variational Partial Group Convolutions for Input-Aware Partial\n Equivariance of Rotations and Color-Shifts","summary":" Group Equivariant CNNs (G-CNNs) have shown promising efficacy in various\ntasks, owing to their ability to capture hierarchical features in an\nequivariant manner. However, their equivariance is fixed to the symmetry of the\nwhole group, limiting adaptability to diverse partial symmetries in real-world\ndatasets, such as limited rotation symmetry of handwritten digit images and\nlimited color-shift symmetry of flower images. Recent efforts address this\nlimitation, one example being Partial G-CNN which restricts the output group\nspace of convolution layers to break full equivariance. However, such an\napproach still fails to adjust equivariance levels across data. In this paper,\nwe propose a novel approach, Variational Partial G-CNN (VP G-CNN), to capture\nvarying levels of partial equivariance specific to each data instance. VP G-CNN\nredesigns the distribution of the output group elements to be conditioned on\ninput data, leveraging variational inference to avoid overfitting. This enables\nthe model to adjust its equivariance levels according to the needs of\nindividual data points. Additionally, we address training instability inherent\nin discrete group equivariance models by redesigning the reparametrizable\ndistribution. We demonstrate the effectiveness of VP G-CNN on both toy and\nreal-world datasets, including MNIST67-180, CIFAR10, ColorMNIST, and\nFlowers102. Our results show robust performance, even in uncertainty metrics.\n","authors":["Hyunsu Kim","Yegon Kim","Hongseok Yang","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2407.04271v1.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2407.04268v1","updated":"2024-07-05T05:45:34Z","published":"2024-07-05T05:45:34Z","title":"NeuFair: Neural Network Fairness Repair with Dropout","summary":" This paper investigates the neural dropout method as a post-processing bias\nmitigation for deep neural networks (DNNs). Neural-driven software solutions\nare increasingly applied in socially critical domains with significant fairness\nimplications. While neural networks are exceptionally good at finding\nstatistical patterns from data, they are notorious for overfitting to the\ntraining datasets that may encode and amplify existing biases from the\nhistorical data. Existing bias mitigation algorithms often require either\nmodifying the input dataset or modifying the learning algorithms. 
We posit that\nthe prevalent dropout methods that prevent over-fitting during training by\nrandomly dropping neurons may be an effective and less intrusive approach to\nimprove fairness of pre-trained DNNs. However, finding the ideal set of neurons\nto drop is a combinatorial problem. We propose NeuFair, a family of\npost-processing randomized algorithms that mitigate unfairness in pre-trained\nDNNs. Our randomized search is guided by an objective to minimize\ndiscrimination while maintaining the model utility. We show that our design of\nrandomized algorithms provides statistical guarantees on finding optimal\nsolutions, and we empirically evaluate the efficacy and efficiency of NeuFair\nin improving fairness, with minimal or no performance degradation. Our results\nshow that NeuFair improves fairness by up to 69% and outperforms\nstate-of-the-art post-processing bias techniques.\n","authors":["Vishnu Asutosh Dasu","Ashish Kumar","Saeid Tizpaz-Niari","Gang Tan"],"pdf_url":"https://arxiv.org/pdf/2407.04268v1.pdf","comment":"Paper accepted at ACM ISSTA 2024"},{"id":"http://arxiv.org/abs/2407.04264v1","updated":"2024-07-05T05:34:10Z","published":"2024-07-05T05:34:10Z","title":"Langevin Dynamics: A Unified Perspective on Optimization via Lyapunov\n Potentials","summary":" We study the problem of non-convex optimization using Stochastic Gradient\nLangevin Dynamics (SGLD). SGLD is a natural and popular variation of stochastic\ngradient descent where at each step, appropriately scaled Gaussian noise is\nadded. To our knowledge, the only strategy for showing global convergence of\nSGLD on the loss function is to show that SGLD can sample from a stationary\ndistribution which assigns larger mass when the function is small (the Gibbs\nmeasure), and then to convert these guarantees to optimization results.\n We employ a new strategy to analyze the convergence of SGLD to global minima,\nbased on Lyapunov potentials and optimization. We convert the same mild\nconditions from previous works on SGLD into geometric properties based on\nLyapunov potentials. This adapts well to the case with a stochastic gradient\noracle, which is natural for machine learning applications where one wants to\nminimize population loss but only has access to stochastic gradients via\nminibatch training samples. Here we provide 1) improved rates in the setting of\nprevious works studying SGLD for optimization, 2) the first finite gradient\ncomplexity guarantee for SGLD where the function is Lipschitz and the Gibbs\nmeasure defined by the function satisfies a Poincar\\'e Inequality, and 3) prove\nif continuous-time Langevin Dynamics succeeds for optimization, then\ndiscrete-time SGLD succeeds under mild regularity assumptions.\n","authors":["August Y. Chen","Ayush Sekhari","Karthik Sridharan"],"pdf_url":"https://arxiv.org/pdf/2407.04264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04259v1","updated":"2024-07-05T05:19:36Z","published":"2024-07-05T05:19:36Z","title":"Robust Q-Learning for finite ambiguity sets","summary":" In this paper we propose a novel $Q$-learning algorithm allowing to solve\ndistributionally robust Markov decision problems for which the ambiguity set of\nprobability measures can be chosen arbitrarily as long as it comprises only a\nfinite amount of measures. 
Therefore, our approach goes beyond the well-studied\ncases involving ambiguity sets of balls around some reference measure with the\ndistance to reference measure being measured with respect to the Wasserstein\ndistance or the Kullback--Leibler divergence. Hence, our approach allows the\napplicant to create ambiguity sets better tailored to her needs and to solve\nthe associated robust Markov decision problem via a $Q$-learning algorithm\nwhose convergence is guaranteed by our main result. Moreover, we showcase in\nseveral numerical experiments the tractability of our approach.\n","authors":["Cécile Decker","Julian Sester"],"pdf_url":"https://arxiv.org/pdf/2407.04259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04258v1","updated":"2024-07-05T05:08:06Z","published":"2024-07-05T05:08:06Z","title":"Unsupervised Video Summarization via Reinforcement Learning and a\n Trained Evaluator","summary":" This paper presents a novel approach for unsupervised video summarization\nusing reinforcement learning. It aims to address the existing limitations of\ncurrent unsupervised methods, including unstable training of adversarial\ngenerator-discriminator architectures and reliance on hand-crafted reward\nfunctions for quality evaluation. The proposed method is based on the concept\nthat a concise and informative summary should result in a reconstructed video\nthat closely resembles the original. The summarizer model assigns an importance\nscore to each frame and generates a video summary. In the proposed scheme,\nreinforcement learning, coupled with a unique reward generation pipeline, is\nemployed to train the summarizer model. The reward generation pipeline trains\nthe summarizer to create summaries that lead to improved reconstructions. It\ncomprises a generator model capable of reconstructing masked frames from a\npartially masked video, along with a reward mechanism that compares the\nreconstructed video from the summary against the original. The video generator\nis trained in a self-supervised manner to reconstruct randomly masked frames,\nenhancing its ability to generate accurate summaries. This training pipeline\nresults in a summarizer model that better mimics human-generated video\nsummaries compared to methods relying on hand-crafted rewards. The training\nprocess consists of two stable and isolated training steps, unlike adversarial\narchitectures. Experimental results demonstrate promising performance, with\nF-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively.\nAdditionally, the inference stage is 300 times faster than our previously\nreported state-of-the-art method.\n","authors":["Mehryar Abbasi","Hadi Hadizadeh","Parvaneh Saeedi"],"pdf_url":"https://arxiv.org/pdf/2407.04258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19320v3","updated":"2024-07-05T04:59:42Z","published":"2024-05-29T17:51:42Z","title":"Value-Incentivized Preference Optimization: A Unified Approach to Online\n and Offline RLHF","summary":" Reinforcement learning from human feedback (RLHF) has demonstrated great\npromise in aligning large language models (LLMs) with human preference.\nDepending on the availability of preference data, both online and offline RLHF\nare active areas of investigation. 
A key bottleneck is understanding how to\nincorporate uncertainty estimation in the reward function learned from the\npreference data for RLHF, regardless of how the preference data is collected.\nWhile the principles of optimism or pessimism under uncertainty are\nwell-established in standard reinforcement learning (RL), a\npractically-implementable and theoretically-grounded form amenable to large\nlanguage models is not yet available, as standard techniques for constructing\nconfidence intervals become intractable under arbitrary policy\nparameterizations.\n In this paper, we introduce a unified approach to online and offline RLHF --\nvalue-incentivized preference optimization (VPO) -- which regularizes the\nmaximum-likelihood estimate of the reward function with the corresponding value\nfunction, modulated by a $\\textit{sign}$ to indicate whether the optimism or\npessimism is chosen. VPO also directly optimizes the policy with implicit\nreward modeling, and therefore shares a simpler RLHF pipeline similar to direct\npreference optimization. Theoretical guarantees of VPO are provided for both\nonline and offline settings, matching the rates of their standard RL\ncounterparts. Moreover, experiments on text summarization and dialog verify the\npracticality and effectiveness of VPO.\n","authors":["Shicong Cen","Jincheng Mei","Katayoon Goshvadi","Hanjun Dai","Tong Yang","Sherry Yang","Dale Schuurmans","Yuejie Chi","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2405.19320v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04251v1","updated":"2024-07-05T04:38:17Z","published":"2024-07-05T04:38:17Z","title":"Unified Interpretation of Smoothing Methods for Negative Sampling Loss\n Functions in Knowledge Graph Embedding","summary":" Knowledge Graphs (KGs) are fundamental resources in knowledge-intensive tasks\nin NLP. Due to the limitation of manually creating KGs, KG Completion (KGC) has\nan important role in automatically completing KGs by scoring their links with\nKG Embedding (KGE). To handle many entities in training, KGE relies on Negative\nSampling (NS) loss that can reduce the computational cost by sampling. Since\nthe appearance frequencies for each link are at most one in KGs, sparsity is an\nessential and inevitable problem. The NS loss is no exception. As a solution,\nthe NS loss in KGE relies on smoothing methods like Self-Adversarial Negative\nSampling (SANS) and subsampling. However, it is uncertain what kind of\nsmoothing method is suitable for this purpose due to the lack of theoretical\nunderstanding. This paper provides theoretical interpretations of the smoothing\nmethods for the NS loss in KGE and induces a new NS loss, Triplet Adaptive\nNegative Sampling (TANS), that can cover the characteristics of the\nconventional smoothing methods. 
Experimental results of TransE, DistMult,\nComplEx, RotatE, HAKE, and HousE on FB15k-237, WN18RR, and YAGO3-10 datasets\nand their sparser subsets show the soundness of our interpretation and\nperformance improvement by our TANS.\n","authors":["Xincan Feng","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.04251v1.pdf","comment":"9 pages, 4 figures, 2 tables; accepted to workshop RepL4NLP held in\n conjunction with ACL 2024"},{"id":"http://arxiv.org/abs/2407.04248v1","updated":"2024-07-05T04:30:41Z","published":"2024-07-05T04:30:41Z","title":"Machine Learning for Complex Systems with Abnormal Pattern by Exception\n Maximization Outlier Detection Method","summary":" This paper proposes a novel fast online methodology for outlier detection\ncalled the exception maximization outlier detection method(EMODM), which\nemploys probabilistic models and statistical algorithms to detect abnormal\npatterns from the outputs of complex systems. The EMODM is based on a two-state\nGaussian mixture model and demonstrates strong performance in probability\nanomaly detection working on real-time raw data rather than using special prior\ndistribution information. We confirm this using the synthetic data from two\nnumerical cases. For the real-world data, we have detected the short circuit\npattern of the circuit system using EMODM by the current and voltage output of\na three-phase inverter. The EMODM also found an abnormal period due to COVID-19\nin the insured unemployment data of 53 regions in the United States from 2000\nto 2024. The application of EMODM to these two real-life datasets demonstrated\nthe effectiveness and accuracy of our algorithm.\n","authors":["Zhikun Zhang","Yiting Duan","Xiangjun Wang","Mingyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.04248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00835v3","updated":"2024-07-05T04:06:23Z","published":"2023-07-03T08:19:00Z","title":"Engression: Extrapolation through the Lens of Distributional Regression","summary":" Distributional regression aims to estimate the full conditional distribution\nof a target variable, given covariates. Popular methods include linear and\ntree-ensemble based quantile regression. We propose a neural network-based\ndistributional regression methodology called `engression'. An engression model\nis generative in the sense that we can sample from the fitted conditional\ndistribution and is also suitable for high-dimensional outcomes. Furthermore,\nwe find that modelling the conditional distribution on training data can\nconstrain the fitted function outside of the training support, which offers a\nnew perspective to the challenging extrapolation problem in nonlinear\nregression. In particular, for `pre-additive noise' models, where noise is\nadded to the covariates before applying a nonlinear transformation, we show\nthat engression can successfully perform extrapolation under some assumptions\nsuch as monotonicity, whereas traditional regression approaches such as\nleast-squares or quantile regression fall short under the same assumptions. Our\nempirical results, from both simulated and real data, validate the\neffectiveness of the engression method and indicate that the pre-additive noise\nmodel is typically suitable for many real-world scenarios. 
The software\nimplementations of engression are available in both R and Python.\n","authors":["Xinwei Shen","Nicolai Meinshausen"],"pdf_url":"https://arxiv.org/pdf/2307.00835v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04240v1","updated":"2024-07-05T03:56:40Z","published":"2024-07-05T03:56:40Z","title":"A Two-Step Minimax Q-learning Algorithm for Two-Player Zero-Sum Markov\n Games","summary":" An interesting iterative procedure is proposed to solve a two-player zero-sum\nMarkov games. First this problem is expressed as a min-max Markov game. Next, a\ntwo-step Q-learning algorithm for solving Markov decision problem (MDP) is\nsuitably modified to solve this Markov game. Under a suitable assumption, the\nboundedness of the proposed iterates is obtained theoretically. Using results\nfrom stochastic approximation, the almost sure convergence of the proposed\ntwo-step minimax Q-learning is obtained theoretically. More specifically, the\nproposed algorithm converges to the game theoretic optimal value with\nprobability one, when the model information is not known. Numerical simulation\nauthenticate that the proposed algorithm is effective and easy to implement.\n","authors":["Shreyas S R","Antony Vijesh"],"pdf_url":"https://arxiv.org/pdf/2407.04240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13451v3","updated":"2024-07-05T03:35:09Z","published":"2023-08-25T15:53:30Z","title":"Gotta match 'em all: Solution diversification in graph matching matched\n filters","summary":" We present a novel approach for finding multiple noisily embedded template\ngraphs in a very large background graph. Our method builds upon the\ngraph-matching-matched-filter technique proposed in Sussman et al., with the\ndiscovery of multiple diverse matchings being achieved by iteratively\npenalizing a suitable node-pair similarity matrix in the matched filter\nalgorithm. In addition, we propose algorithmic speed-ups that greatly enhance\nthe scalability of our matched-filter approach. We present theoretical\njustification of our methodology in the setting of correlated Erdos-Renyi\ngraphs, showing its ability to sequentially discover multiple templates under\nmild model conditions. We additionally demonstrate our method's utility via\nextensive experiments both using simulated models and real-world dataset,\ninclude human brain connectomes and a large transactional knowledge base.\n","authors":["Zhirui Li","Ben Johnson","Daniel L. Sussman","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2308.13451v3.pdf","comment":"27 pages, 12 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.04236v1","updated":"2024-07-05T03:26:37Z","published":"2024-07-05T03:26:37Z","title":"Graph Pooling via Ricci Flow","summary":" Graph Machine Learning often involves the clustering of nodes based on\nsimilarity structure encoded in the graph's topology and the nodes' attributes.\nOn homophilous graphs, the integration of pooling layers has been shown to\nenhance the performance of Graph Neural Networks by accounting for inherent\nmulti-scale structure. 
Here, similar nodes are grouped together to coarsen the\ngraph and reduce the input size in subsequent layers in deeper architectures.\nIn both settings, the underlying clustering approach can be implemented via\ngraph pooling operators, which often rely on classical tools from Graph Theory.\nIn this work, we introduce a graph pooling operator (ORC-Pool), which utilizes\na characterization of the graph's geometry via Ollivier's discrete Ricci\ncurvature and an associated geometric flow. Previous Ricci flow based\nclustering approaches have shown great promise across several domains, but are\nby construction unable to account for similarity structure encoded in the node\nattributes. However, in many ML applications, such information is vital for\ndownstream tasks. ORC-Pool extends such clustering approaches to attributed\ngraphs, allowing for the integration of geometric coarsening into Graph Neural\nNetworks as a pooling layer.\n","authors":["Amy Feng","Melanie Weber"],"pdf_url":"https://arxiv.org/pdf/2407.04236v1.pdf","comment":"32 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.01906v2","updated":"2024-07-05T03:23:59Z","published":"2024-07-02T03:11:13Z","title":"Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for\n Sparse Architectural Large Language Models","summary":" Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large\nLanguage Models (LLMs) with constrained resources. Although there have been\nvarious PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture\nLLMs is still underexplored. In this work, we study the PEFT method for LLMs\nwith the Mixture-of-Experts (MoE) architecture and the contents of this work\nare mainly threefold: (1) We investigate the dispersion degree of the activated\nexperts in customized tasks, and found that the routing distribution for a\nspecific task tends to be highly concentrated, while the distribution of\nactivated experts varies significantly across different tasks. (2) We propose\nExpert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant\nto downstream tasks while freezing the other experts and modules; experimental\nresults demonstrate that our method not only improves the tuning efficiency,\nbut also matches or even surpasses the performance of full-parameter\nfine-tuning. (3) We further analyze the impact of the MoE architecture on\nexpert-specialized fine-tuning. We find that MoE models with finer-grained\nexperts are more advantageous in selecting the combination of experts that are\nmost relevant to downstream tasks, thereby enhancing both the training\nefficiency and effectiveness. Our code is available at\nhttps://github.com/deepseek-ai/ESFT.\n","authors":["Zihan Wang","Deli Chen","Damai Dai","Runxin Xu","Zhuoshu Li","Y. Wu"],"pdf_url":"https://arxiv.org/pdf/2407.01906v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02700v2","updated":"2024-07-05T03:11:17Z","published":"2024-05-04T16:06:50Z","title":"Identification of Novel Modes in Generative Models via Fourier-based\n Differential Clustering","summary":" An interpretable comparison of generative models requires the identification\nof sample types produced more frequently by each of the involved models. While\nseveral quantitative scores have been proposed in the literature to rank\ndifferent generative models, such score-based evaluations do not reveal the\nnuanced differences between the generative models in capturing various sample\ntypes. 
In this work, we attempt to solve a differential clustering problem to\ndetect sample types expressed differently by two generative models. To solve\nthe differential clustering problem, we propose a method called Fourier-based\nIdentification of Novel Clusters (FINC) to identify modes produced by a\ngenerative model with a higher frequency in comparison to a reference\ndistribution. FINC provides a scalable stochastic algorithm based on random\nFourier features to estimate the eigenspace of kernel covariance matrices of\ntwo generative models and utilize the principal eigendirections to detect the\nsample types present more dominantly in each model. We demonstrate the\napplication of the FINC method to large-scale computer vision datasets and\ngenerative model frameworks. Our numerical results suggest the scalability of\nthe developed Fourier-based method in highlighting the sample types produced\nwith different frequencies by widely-used generative models. Code is available\nat \\url{https://github.com/buyeah1109/FINC}\n","authors":["Jingwei Zhang","Mohammad Jalali","Cheuk Ting Li","Farzan Farnia"],"pdf_url":"https://arxiv.org/pdf/2405.02700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18861v2","updated":"2024-07-05T03:03:45Z","published":"2024-06-27T03:16:09Z","title":"Predicting the duration of traffic incidents for Sydney greater\n metropolitan area using machine learning methods","summary":" This research presents a comprehensive approach to predicting the duration of\ntraffic incidents and classifying them as short-term or long-term across the\nSydney Metropolitan Area. Leveraging a dataset that encompasses detailed\nrecords of traffic incidents, road network characteristics, and socio-economic\nindicators, we train and evaluate a variety of advanced machine learning models\nincluding Gradient Boosted Decision Trees (GBDT), Random Forest, LightGBM, and\nXGBoost. The models are assessed using Root Mean Square Error (RMSE) for\nregression tasks and F1 score for classification tasks.\n Our experimental results demonstrate that XGBoost and LightGBM outperform\nconventional models with XGBoost achieving the lowest RMSE of 33.7 for\npredicting incident duration and highest classification F1 score of 0.62 for a\n30-minute duration threshold. For classification, the 30-minute threshold\nbalances performance with 70.84% short-term duration classification accuracy\nand 62.72% long-term duration classification accuracy. Feature importance\nanalysis, employing both tree split counts and SHAP values, identifies the\nnumber of affected lanes, traffic volume, and types of primary and secondary\nvehicles as the most influential features.\n The proposed methodology not only achieves high predictive accuracy but also\nprovides stakeholders with vital insights into factors contributing to incident\ndurations. These insights enable more informed decision-making for traffic\nmanagement and response strategies. 
The code is available by the link:\nhttps://github.com/Future-Mobility-Lab/SydneyIncidents\n","authors":["Artur Grigorev","Sajjad Shafiei","Hanna Grzybowska","Adriana-Simona Mihaita"],"pdf_url":"https://arxiv.org/pdf/2406.18861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01695v2","updated":"2024-07-05T02:49:47Z","published":"2024-01-24T03:11:36Z","title":"Language-Guided World Models: A Model-Based Approach to AI Control","summary":" This paper introduces the concept of Language-Guided World Models (LWMs) --\nprobabilistic models that can simulate environments by reading texts. Agents\nequipped with these models provide humans with more extensive and efficient\ncontrol, allowing them to simultaneously alter agent behaviors in multiple\ntasks via natural verbal communication. In this work, we take initial steps in\ndeveloping robust LWMs that can generalize to compositionally novel language\ndescriptions. We design a challenging world modeling benchmark based on the\ngame of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that\nrequire varying degrees of compositional generalization. Our experiments reveal\nthe lack of generalizability of the state-of-the-art Transformer model, as it\noffers marginal improvements in simulation quality over a no-text baseline. We\ndevise a more robust model by fusing the Transformer with the EMMA attention\nmechanism (Hanjie et al., 2021). Our model substantially outperforms the\nTransformer and approaches the performance of a model with an oracle semantic\nparsing and grounding capability. To demonstrate the practicality of this model\nin improving AI safety and transparency, we simulate a scenario in which the\nmodel enables an agent to present plans to a human before execution, and to\nrevise plans based on their language feedback.\n","authors":["Alex Zhang","Khanh Nguyen","Jens Tuyls","Albert Lin","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2402.01695v2.pdf","comment":"SpLU-RoboNLP workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2405.20579v2","updated":"2024-07-05T02:11:54Z","published":"2024-05-31T02:17:51Z","title":"HOPE: A Reinforcement Learning-based Hybrid Policy Path Planner for\n Diverse Parking Scenarios","summary":" Automated parking stands as a highly anticipated application of autonomous\ndriving technology. However, existing path planning methodologies fall short of\naddressing this need due to their incapability to handle the diverse and\ncomplex parking scenarios in reality. While non-learning methods provide\nreliable planning results, they are vulnerable to intricate occasions, whereas\nlearning-based ones are good at exploration but unstable in converging to\nfeasible solutions. To leverage the strengths of both approaches, we introduce\nHybrid pOlicy Path plannEr (HOPE). This novel solution integrates a\nreinforcement learning agent with Reeds-Shepp curves, enabling effective\nplanning across diverse scenarios. HOPE guides the exploration of the\nreinforcement learning agent by applying an action mask mechanism and employs a\ntransformer to integrate the perceived environmental information with the mask.\nTo facilitate the training and evaluation of the proposed planner, we propose a\ncriterion for categorizing the difficulty level of parking scenarios based on\nspace and obstacle distribution. 
Experimental results demonstrate that our\napproach outperforms typical rule-based algorithms and traditional\nreinforcement learning methods, showing higher planning success rates and\ngeneralization across various scenarios. We also conduct real-world experiments\nto verify the practicability of HOPE. The code for our solution will be openly\navailable on \\href{GitHub}{https://github.com/jiamiya/HOPE}.\n","authors":["Mingyang Jiang","Yueyuan Li","Songan Zhang","Siyuan Chen","Chunxiang Wang","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20579v2.pdf","comment":"10 pages, 6 tables, 5 figures, 4 page appendix"},{"id":"http://arxiv.org/abs/2407.04211v1","updated":"2024-07-05T01:47:20Z","published":"2024-07-05T01:47:20Z","title":"TimeLDM: Latent Diffusion Model for Unconditional Time Series Generation","summary":" Time series generation is a crucial research topic in the area of deep\nlearning, which can be used for data augmentation, imputing missing values, and\nforecasting. Currently, latent diffusion models are ascending to the forefront\nof generative modeling for many important data representations. Being the most\npivotal in the computer vision domain, latent diffusion models have also\nrecently attracted interest in other communities, including NLP, Speech, and\nGeometric Space. In this work, we propose TimeLDM, a novel latent diffusion\nmodel for high-quality time series generation. TimeLDM is composed of a\nvariational autoencoder that encodes time series into an informative and\nsmoothed latent content and a latent diffusion model operating in the latent\nspace to generate latent information. We evaluate the ability of our method to\ngenerate synthetic time series with simulated and realistic datasets, benchmark\nthe performance against existing state-of-the-art methods. Qualitatively and\nquantitatively, we find that the proposed TimeLDM persistently delivers\nhigh-quality generated time series. Sores from Context-FID and Discriminative\nindicate that TimeLDM consistently and significantly outperforms current\nstate-of-the-art benchmarks with an average improvement of 3.4$\\times$ and\n3.8$\\times$, respectively. Further studies demonstrate that our method presents\nbetter performance on different lengths of time series data generation. 
To the\nbest of our knowledge, this is the first study to explore the potential of the\nlatent diffusion model for unconditional time series generation and establish a\nnew baseline for synthetic time series.\n","authors":["Jian Qian","Miao Sun","Sifan Zhou","Biao Wan","Minhao Li","Patrick Chiang"],"pdf_url":"https://arxiv.org/pdf/2407.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16664v2","updated":"2024-07-05T01:18:30Z","published":"2023-08-31T12:12:56Z","title":"What can we learn from quantum convolutional neural networks?","summary":" We can learn from analyzing quantum convolutional neural networks (QCNNs)\nthat: 1) working with quantum data can be perceived as embedding physical\nsystem parameters through a hidden feature map; 2) their high performance for\nquantum phase recognition can be attributed to generation of a very suitable\nbasis set during the ground state embedding, where quantum criticality of spin\nmodels leads to basis functions with rapidly changing features; 3) pooling\nlayers of QCNNs are responsible for picking those basis functions that can\ncontribute to forming a high-performing decision boundary, and the learning\nprocess corresponds to adapting the measurement such that few-qubit operators\nare mapped to full-register observables; 4) generalization of QCNN models\nstrongly depends on the embedding type, and that rotation-based feature maps\nwith the Fourier basis require careful feature engineering; 5) accuracy and\ngeneralization of QCNNs with readout based on a limited number of shots favor\nthe ground state embeddings and associated physics-informed models. We\ndemonstrate these points in simulation, where our results shed light on\nclassification for physical processes, relevant for applications in sensing.\nFinally, we show that QCNNs with properly chosen ground state embeddings can be\nused for fluid dynamics problems, expressing shock wave solutions with good\ngeneralization and proven trainability.\n","authors":["Chukwudubem Umeano","Annie E. Paine","Vincent E. Elfving","Oleksandr Kyriienko"],"pdf_url":"https://arxiv.org/pdf/2308.16664v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.15566v3","updated":"2024-07-05T01:06:33Z","published":"2023-12-24T23:34:01Z","title":"Deep Copula-Based Survival Analysis for Dependent Censoring with\n Identifiability Guarantees","summary":" Censoring is the central problem in survival analysis where either the\ntime-to-event (for instance, death), or the time-tocensoring (such as loss of\nfollow-up) is observed for each sample. The majority of existing machine\nlearning-based survival analysis methods assume that survival is conditionally\nindependent of censoring given a set of covariates; an assumption that cannot\nbe verified since only marginal distributions is available from the data. The\nexistence of dependent censoring, along with the inherent bias in current\nestimators has been demonstrated in a variety of applications, accentuating the\nneed for a more nuanced approach. However, existing methods that adjust for\ndependent censoring require practitioners to specify the ground truth copula.\nThis requirement poses a significant challenge for practical applications, as\nmodel misspecification can lead to substantial bias. In this work, we propose a\nflexible deep learning-based survival analysis method that simultaneously\naccommodate for dependent censoring and eliminates the requirement for\nspecifying the ground truth copula. 
We theoretically prove the identifiability\nof our model under a broad family of copulas and survival distributions.\nExperiments results from a wide range of datasets demonstrate that our approach\nsuccessfully discerns the underlying dependency structure and significantly\nreduces survival estimation bias when compared to existing methods.\n","authors":["Weijia Zhang","Chun Kai Ling","Xuanhui Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.15566v3.pdf","comment":"To appear in AAAI 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.04697v1","updated":"2024-07-05T17:59:02Z","published":"2024-07-05T17:59:02Z","title":"VCoME: Verbal Video Composition with Multimodal Editing Effects","summary":" Verbal videos, featuring voice-overs or text overlays, provide valuable\ncontent but present significant challenges in composition, especially when\nincorporating editing effects to enhance clarity and visual appeal. In this\npaper, we introduce the novel task of verbal video composition with editing\neffects. This task aims to generate coherent and visually appealing verbal\nvideos by integrating multimodal editing effects across textual, visual, and\naudio categories. To achieve this, we curate a large-scale dataset of video\neffects compositions from publicly available sources. We then formulate this\ntask as a generative problem, involving the identification of appropriate\npositions in the verbal content and the recommendation of editing effects for\nthese positions. To address this task, we propose VCoME, a general framework\nthat employs a large multimodal model to generate editing effects for video\ncomposition. Specifically, VCoME takes in the multimodal video context and\nautoregressively outputs where to apply effects within the verbal content and\nwhich effects are most appropriate for each position. VCoME also supports\nprompt-based control of composition density and style, providing substantial\nflexibility for diverse applications. Through extensive quantitative and\nqualitative evaluations, we clearly demonstrate the effectiveness of VCoME. A\ncomprehensive user study shows that our method produces videos of professional\nquality while being 85$\\times$ more efficient than professional editors.\n","authors":["Weibo Gong","Xiaojie Jin","Xin Li","Dongliang He","Xinglong Wu"],"pdf_url":"https://arxiv.org/pdf/2407.04697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18991v2","updated":"2024-07-05T13:01:07Z","published":"2024-05-29T11:11:07Z","title":"EasyAnimate: A High-Performance Long Video Generation Method based on\n Transformer Architecture","summary":" This paper presents EasyAnimate, an advanced method for video generation that\nleverages the power of transformer architecture for high-performance outcomes.\nWe have expanded the DiT framework originally designed for 2D image synthesis\nto accommodate the complexities of 3D video generation by incorporating a\nmotion module block. It is used to capture temporal dynamics, thereby ensuring\nthe production of consistent frames and seamless motion transitions. The motion\nmodule can be adapted to various DiT baseline methods to generate video with\ndifferent styles. It can also generate videos with different frame rates and\nresolutions during both training and inference phases, suitable for both images\nand videos. Moreover, we introduce slice VAE, a novel approach to condense the\ntemporal axis, facilitating the generation of long duration videos. 
Currently,\nEasyAnimate exhibits the proficiency to generate videos with 144 frames. We\nprovide a holistic ecosystem for video production based on DiT, encompassing\naspects such as data pre-processing, VAE training, DiT models training (both\nthe baseline model and LoRA model), and end-to-end video inference. Code is\navailable at: https://github.com/aigc-apps/EasyAnimate. We are continuously\nworking to enhance the performance of our method.\n","authors":["Jiaqi Xu","Xinyi Zou","Kunzhe Huang","Yunkuo Chen","Bo Liu","MengLi Cheng","Xing Shi","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2405.18991v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.04416v1","updated":"2024-07-05T11:07:13Z","published":"2024-07-05T11:07:13Z","title":"Improving Audio Generation with Visual Enhanced Caption","summary":" Generative models have shown significant achievements in audio generation\ntasks. However, existing models struggle with complex and detailed prompts,\nleading to potential performance degradation. We hypothesize that this problem\nstems from the low quality and relatively small quantity of training data. In\nthis work, we aim to create a large-scale audio dataset with rich captions for\nimproving audio generation models. We develop an automated pipeline to generate\ndetailed captions for audio-visual datasets by transforming predicted visual\ncaptions, audio captions, and tagging labels into comprehensive descriptions\nusing a Large Language Model (LLM). We introduce Sound-VECaps, a dataset\ncomprising 1.66M high-quality audio-caption pairs with enriched details\nincluding audio event orders, occurred places and environment information. We\ndemonstrate that training with Sound-VECaps significantly enhances the\ncapability of text-to-audio generation models to comprehend and generate audio\nfrom complex input prompts, improving overall system performance. Furthermore,\nwe conduct ablation studies of Sound-VECaps across several audio-language\ntasks, suggesting its potential in advancing audio-text representation\nlearning. Our dataset and models are available online.\n","authors":["Yi Yuan","Dongya Jia","Xiaobin Zhuang","Yuanzhe Chen","Zhengxi Liu","Zhuo Chen","Yuping Wang","Yuxuan Wang","Xubo Liu","Mark D. Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04416v1.pdf","comment":"5 pages with 1 appendix"},{"id":"http://arxiv.org/abs/2407.04284v1","updated":"2024-07-05T06:32:52Z","published":"2024-07-05T06:32:52Z","title":"TSC-PCAC: Voxel Transformer and Sparse Convolution Based Point Cloud\n Attribute Compression for 3D Broadcasting","summary":" Point cloud has been the mainstream representation for advanced 3D\napplications, such as virtual reality and augmented reality. However, the\nmassive data amounts of point clouds is one of the most challenging issues for\ntransmission and storage. In this paper, we propose an end-to-end voxel\nTransformer and Sparse Convolution based Point Cloud Attribute Compression\n(TSC-PCAC) for 3D broadcasting. Firstly, we present a framework of the\nTSC-PCAC, which include Transformer and Sparse Convolutional Module (TSCM)\nbased variational autoencoder and channel context module. Secondly, we propose\na two-stage TSCM, where the first stage focuses on modeling local dependencies\nand feature representations of the point clouds, and the second stage captures\nglobal features through spatial and channel pooling encompassing larger\nreceptive fields. 
This module effectively extracts global and local interpoint\nrelevance to reduce informational redundancy. Thirdly, we design a TSCM based\nchannel context module to exploit interchannel correlations, which improves the\npredicted probability distribution of quantized latent representations and thus\nreduces the bitrate. Experimental results indicate that the proposed TSC-PCAC\nmethod achieves an average of 38.53%, 21.30%, and 11.19% Bjontegaard Delta\nbitrate reductions compared to the Sparse-PCAC, NF-PCAC, and G-PCC v23 methods,\nrespectively. The encoding/decoding time costs are reduced up to 97.68%/98.78%\non average compared to the Sparse-PCAC. The source code and the trained models\nof the TSC-PCAC are available at https://github.com/igizuxo/TSC-PCAC.\n","authors":["Zixi Guo","Yun Zhang","Linwei Zhu","Hanli Wang","Gangyi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.04284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04258v1","updated":"2024-07-05T05:08:06Z","published":"2024-07-05T05:08:06Z","title":"Unsupervised Video Summarization via Reinforcement Learning and a\n Trained Evaluator","summary":" This paper presents a novel approach for unsupervised video summarization\nusing reinforcement learning. It aims to address the existing limitations of\ncurrent unsupervised methods, including unstable training of adversarial\ngenerator-discriminator architectures and reliance on hand-crafted reward\nfunctions for quality evaluation. The proposed method is based on the concept\nthat a concise and informative summary should result in a reconstructed video\nthat closely resembles the original. The summarizer model assigns an importance\nscore to each frame and generates a video summary. In the proposed scheme,\nreinforcement learning, coupled with a unique reward generation pipeline, is\nemployed to train the summarizer model. The reward generation pipeline trains\nthe summarizer to create summaries that lead to improved reconstructions. It\ncomprises a generator model capable of reconstructing masked frames from a\npartially masked video, along with a reward mechanism that compares the\nreconstructed video from the summary against the original. The video generator\nis trained in a self-supervised manner to reconstruct randomly masked frames,\nenhancing its ability to generate accurate summaries. This training pipeline\nresults in a summarizer model that better mimics human-generated video\nsummaries compared to methods relying on hand-crafted rewards. The training\nprocess consists of two stable and isolated training steps, unlike adversarial\narchitectures. Experimental results demonstrate promising performance, with\nF-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively.\nAdditionally, the inference stage is 300 times faster than our previously\nreported state-of-the-art method.\n","authors":["Mehryar Abbasi","Hadi Hadizadeh","Parvaneh Saeedi"],"pdf_url":"https://arxiv.org/pdf/2407.04258v1.pdf","comment":null}]},"2024-07-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.00371v2","updated":"2024-07-04T23:37:40Z","published":"2024-02-01T06:21:19Z","title":"What Does the Bot Say? Opportunities and Risks of Large Language Models\n in Social Media Bot Detection","summary":" Social media bot detection has always been an arms race between advancements\nin machine learning bot detectors and adversarial bot strategies to evade\ndetection. 
In this work, we bring the arms race to the next level by\ninvestigating the opportunities and risks of state-of-the-art large language\nmodels (LLMs) in social bot detection. To investigate the opportunities, we\ndesign novel LLM-based bot detectors by proposing a\nmixture-of-heterogeneous-experts framework to divide and conquer diverse user\ninformation modalities. To illuminate the risks, we explore the possibility of\nLLM-guided manipulation of user textual and structured information to evade\ndetection. Extensive experiments with three LLMs on two datasets demonstrate\nthat instruction tuning on merely 1,000 annotated examples produces specialized\nLLMs that outperform state-of-the-art baselines by up to 9.1% on both datasets,\nwhile LLM-guided manipulation strategies could significantly bring down the\nperformance of existing bot detectors by up to 29.6% and harm the calibration\nand reliability of bot detection systems.\n","authors":["Shangbin Feng","Herun Wan","Ningnan Wang","Zhaoxuan Tan","Minnan Luo","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2402.00371v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.04185v1","updated":"2024-07-04T23:26:56Z","published":"2024-07-04T23:26:56Z","title":"HAF-RM: A Hybrid Alignment Framework for Reward Model Training","summary":" The reward model has become increasingly important in alignment, assessment,\nand data construction for large language models (LLMs). Most existing\nresearchers focus on enhancing reward models through data improvements,\nfollowing the conventional training framework for reward models that directly\noptimizes the predicted rewards. In this paper, we propose a hybrid alignment\nframework HaF-RM for reward model training by introducing an additional\nconstraint on token-level policy probabilities in addition to the reward score.\nIt can simultaneously supervise the internal preference model at the token\nlevel and optimize the mapping layer of the reward model at the sequence level.\nTheoretical justifications and experiment results on five datasets show the\nvalidity and effectiveness of our proposed hybrid framework for training a\nhigh-quality reward model. By decoupling the reward modeling procedure and\nincorporating hybrid supervision, our HaF-RM framework offers a principled and\neffective approach to enhancing the performance and alignment of reward models,\na critical component in the responsible development of powerful language\nmodels. We release our code at https://haf-rm.github.io.\n","authors":["Shujun Liu","Xiaoyu Shen","Yuhang Lai","Siyuan Wang","Shengbin Yue","Zengfeng Huang","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2407.04185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04183v1","updated":"2024-07-04T23:05:58Z","published":"2024-07-04T23:05:58Z","title":"Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality\n Norms","summary":" Large language models (LLMs) are trained on broad corpora and then used in\ncommunities with specialized norms. Is providing LLMs with community rules\nenough for models to follow these norms? We evaluate LLMs' capacity to detect\n(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's\nNeutral Point of View (NPOV) policy. LLMs struggled with bias detection,\nachieving only 64% accuracy on a balanced dataset. Models exhibited contrasting\nbiases (some under- and others over-predicted bias), suggesting distinct priors\nabout neutrality. 
LLMs performed better at generation, removing 79% of words\nremoved by Wikipedia editors. However, LLMs made additional changes beyond\nWikipedia editors' simpler neutralizations, resulting in high-recall but\nlow-precision editing. Interestingly, crowdworkers rated AI rewrites as more\nneutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative\nanalysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia\neditors but often made extraneous non-NPOV-related changes (such as grammar).\nLLMs may apply rules in ways that resonate with the public but diverge from\ncommunity experts. While potentially effective for generation, LLMs may reduce\neditor agency and increase moderation workload (e.g., verifying additions).\nEven when rules are easy to articulate, having LLMs apply them like community\nmembers may still be difficult.\n","authors":["Joshua Ashkinaze","Ruijia Guan","Laura Kurek","Eytan Adar","Ceren Budak","Eric Gilbert"],"pdf_url":"https://arxiv.org/pdf/2407.04183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04181v1","updated":"2024-07-04T22:55:02Z","published":"2024-07-04T22:55:02Z","title":"Orchestrating LLMs with Different Personalizations","summary":" This paper presents a novel approach to aligning large language models (LLMs)\nwith individual human preferences, sometimes referred to as Reinforcement\nLearning from \\textit{Personalized} Human Feedback (RLPHF). Given stated\npreferences along multiple dimensions, such as helpfulness, conciseness, or\nhumor, the goal is to create an LLM without re-training that best adheres to\nthis specification. Starting from specialized expert LLMs, each trained for one\nsuch particular preference dimension, we propose a black-box method that merges\ntheir outputs on a per-token level. We train a lightweight Preference Control\nModel (PCM) that dynamically translates the preference description and current\ncontext into next-token prediction weights. By combining the expert models'\noutputs at the token level, our approach dynamically generates text that\noptimizes the given preference. Empirical tests show that our method matches or\nsurpasses existing preference merging techniques, providing a scalable,\nefficient alternative to fine-tuning LLMs for individual personalization.\n","authors":["Jin Peng Zhou","Katie Z Luo","Jingwen Gu","Jason Yuan","Kilian Q. Weinberger","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2407.04181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04179v1","updated":"2024-07-04T22:48:57Z","published":"2024-07-04T22:48:57Z","title":"Defense Against Syntactic Textual Backdoor Attacks with Token\n Substitution","summary":" Textual backdoor attacks present a substantial security risk to Large\nLanguage Models (LLM). It embeds carefully chosen triggers into a victim model\nat the training stage, and makes the model erroneously predict inputs\ncontaining the same triggers as a certain class. Prior backdoor defense methods\nprimarily target special token-based triggers, leaving syntax-based triggers\ninsufficiently addressed. To fill this gap, this paper proposes a novel online\ndefense algorithm that effectively counters syntax-based as well as special\ntoken-based backdoor attacks. 
The algorithm replaces semantically meaningful\nwords in sentences with entirely different ones but preserves the syntactic\ntemplates or special tokens, and then compares the predicted labels before and\nafter the substitution to determine whether a sentence contains triggers.\nExperimental results confirm the algorithm's performance against these two\ntypes of triggers, offering a comprehensive defense strategy for model\nintegrity.\n","authors":["Xinglin Li","Xianwen He","Yao Li","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.04179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02514v2","updated":"2024-07-04T21:49:07Z","published":"2024-06-22T12:50:41Z","title":"LOGIC-LM++: Multi-Step Refinement for Symbolic Formulations","summary":" In this paper we examine the limitations of Large Language Models (LLMs) for\ncomplex reasoning tasks. Although recent works have started to employ formal\nlanguages as an intermediate representation for reasoning tasks, they often\nface challenges in accurately generating and refining these formal\nspecifications to ensure correctness. To address these issues, this paper\nproposes Logic-LM++, an improvement on Logic-LM . It uses the ability of LLMs\nto do pairwise comparisons, allowing the evaluation of the refinements\nsuggested by the LLM. The paper demonstrates that Logic-LM++ outperforms\nLogic-LM and other contemporary techniques across natural language reasoning\ntasks on three datasets, FOLIO, ProofWriter and AR-LSAT, with an average\nimprovement of 18.5% on standard prompting, 12.3% on chain of thought prompting\nand 5% on Logic-LM.\n","authors":["Shashank Kirtania","Priyanshu Gupta","Arjun Radhakirshna"],"pdf_url":"https://arxiv.org/pdf/2407.02514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04158v1","updated":"2024-07-04T21:23:18Z","published":"2024-07-04T21:23:18Z","title":"ELCC: the Emergent Language Corpus Collection","summary":" We introduce the Emergent Language Corpus Collection (ELCC): a collection of\ncorpora collected from open source implementations of emergent communication\nsystems across the literature. These systems include a variety of signalling\ngame environments as well as more complex tasks like a social deduction game\nand embodied navigation. Each corpus is annotated with metadata describing the\ncharacteristics of the source system as well as a suite of analyses of the\ncorpus (e.g., size, entropy, average message length). Currently, research\nstudying emergent languages requires directly running different systems which\ntakes time away from actual analyses of such languages, limits the variety of\nlanguages that are studied, and presents a barrier to entry for researchers\nwithout a background in deep learning. 
The availability of a substantial\ncollection of well-documented emergent language corpora, then, will enable new\ndirections of research which focus their purview on the properties of emergent\nlanguages themselves rather than on experimental apparatus.\n","authors":["Brendon Boldt","David Mortensen"],"pdf_url":"https://arxiv.org/pdf/2407.04158v1.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.04151v1","updated":"2024-07-04T20:57:06Z","published":"2024-07-04T20:57:06Z","title":"Securing Multi-turn Conversational Language Models Against Distributed\n Backdoor Triggers","summary":" The security of multi-turn conversational large language models (LLMs) is\nunderstudied despite it being one of the most popular LLM utilization.\nSpecifically, LLMs are vulnerable to data poisoning backdoor attacks, where an\nadversary manipulates the training data to cause the model to output malicious\nresponses to predefined triggers. Specific to the multi-turn dialogue setting,\nLLMs are at the risk of even more harmful and stealthy backdoor attacks where\nthe backdoor triggers may span across multiple utterances, giving lee-way to\ncontext-driven attacks. In this paper, we explore a novel distributed backdoor\ntrigger attack that serves to be an extra tool in an adversary's toolbox that\ncan interface with other single-turn attack strategies in a plug and play\nmanner. Results on two representative defense mechanisms indicate that\ndistributed backdoor triggers are robust against existing defense strategies\nwhich are designed for single-turn user-model interactions, motivating us to\npropose a new defense strategy for the multi-turn dialogue setting that is more\nchallenging. To this end, we also explore a novel contrastive decoding based\ndefense that is able to mitigate the backdoor with a low computational\ntradeoff.\n","authors":["Terry Tong","Jiashu Xu","Qin Liu","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04151v1.pdf","comment":"Submitted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2407.04130v1","updated":"2024-07-04T19:16:44Z","published":"2024-07-04T19:16:44Z","title":"Towards Automating Text Annotation: A Case Study on Semantic Proximity\n Annotation using GPT-4","summary":" This paper explores using GPT-3.5 and GPT-4 to automate the data annotation\nprocess with automatic prompting techniques. The main aim of this paper is to\nreuse human annotation guidelines along with some annotated data to design\nautomatic prompts for LLMs, focusing on the semantic proximity annotation task.\nAutomatic prompts are compared to customized prompts. We further implement the\nprompting strategies into an open-source text annotation tool, enabling easy\nonline use via the OpenAI API. Our study reveals the crucial role of accurate\nprompt design and suggests that prompting GPT-4 with human-like instructions is\nnot straightforwardly possible for the semantic proximity task. 
We show that\nsmall modifications to the human guidelines already improve the performance,\nsuggesting possible ways for future research.\n","authors":["Sachin Yadav","Tejaswi Choppa","Dominik Schlechtweg"],"pdf_url":"https://arxiv.org/pdf/2407.04130v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2407.04125v1","updated":"2024-07-04T18:54:30Z","published":"2024-07-04T18:54:30Z","title":"Query-Guided Self-Supervised Summarization of Nursing Notes","summary":" Nursing notes, an important component of Electronic Health Records (EHRs),\nkeep track of the progression of a patient's health status during a care\nepisode. Distilling the key information in nursing notes through text\nsummarization techniques can improve clinicians' efficiency in understanding\npatients' conditions when reviewing nursing notes. However, existing\nabstractive summarization methods in the clinical setting have often overlooked\nnursing notes and require the creation of reference summaries for supervision\nsignals, which is time-consuming. In this work, we introduce QGSumm, a\nquery-guided self-supervised domain adaptation framework for nursing note\nsummarization. Using patient-related clinical queries as guidance, our approach\ngenerates high-quality, patient-centered summaries without relying on reference\nsummaries for training. Through automatic and manual evaluation by an expert\nclinician, we demonstrate the strengths of our approach compared to the\nstate-of-the-art Large Language Models (LLMs) in both zero-shot and few-shot\nsettings. Ultimately, our approach provides a new perspective on conditional\ntext summarization, tailored to the specific interests of clinical personnel.\n","authors":["Ya Gao","Hans Moen","Saila Koivusalo","Miika Koskinen","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2407.04125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15294v2","updated":"2024-07-04T18:51:43Z","published":"2024-06-21T16:38:22Z","title":"NLP-KG: A System for Exploratory Search of Scientific Literature in\n Natural Language Processing","summary":" Scientific literature searches are often exploratory, whereby users are not\nyet familiar with a particular field or concept but are interested in learning\nmore about it. However, existing systems for scientific literature search are\ntypically tailored to keyword-based lookup searches, limiting the possibilities\nfor exploration. We propose NLP-KG, a feature-rich system designed to support\nthe exploration of research literature in unfamiliar natural language\nprocessing (NLP) fields. In addition to a semantic search, NLP-KG allows users\nto easily find survey papers that provide a quick introduction to a field of\ninterest. Further, a Fields of Study hierarchy graph enables users to\nfamiliarize themselves with a field and its related areas. Finally, a chat\ninterface allows users to ask questions about unfamiliar concepts or specific\narticles in NLP and obtain answers grounded in knowledge retrieved from\nscientific publications. Our system provides users with comprehensive\nexploration possibilities, supporting them in investigating the relationships\nbetween different fields, understanding unfamiliar concepts in NLP, and finding\nrelevant research literature. 
Demo, video, and code are available at:\nhttps://github.com/NLP-Knowledge-Graph/NLP-KG-WebApp.\n","authors":["Tim Schopf","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2406.15294v2.pdf","comment":"Accepted to ACL 2024 System Demonstrations"},{"id":"http://arxiv.org/abs/2407.04121v1","updated":"2024-07-04T18:47:42Z","published":"2024-07-04T18:47:42Z","title":"Hallucination Detection: Robustly Discerning Reliable Answers in Large\n Language Models","summary":" Large Language Models (LLMs) have gained widespread adoption in various\nnatural language processing tasks, including question answering and dialogue\nsystems. However, a major drawback of LLMs is the issue of hallucination, where\nthey generate unfaithful or inconsistent content that deviates from the input\nsource, leading to severe consequences. In this paper, we propose a robust\ndiscriminator named RelD to effectively detect hallucination in LLMs' generated\nanswers. RelD is trained on the constructed RelQA, a bilingual\nquestion-answering dialogue dataset along with answers generated by LLMs and a\ncomprehensive set of metrics. Our experimental results demonstrate that the\nproposed RelD successfully detects hallucination in the answers generated by\ndiverse LLMs. Moreover, it performs well in distinguishing hallucination in\nLLMs' generated answers from both in-distribution and out-of-distribution\ndatasets. Additionally, we also conduct a thorough analysis of the types of\nhallucinations that occur and present valuable insights. This research\nsignificantly contributes to the detection of reliable answers generated by\nLLMs and holds noteworthy implications for mitigating hallucination in the\nfuture work.\n","authors":["Yuyan Chen","Qiang Fu","Yichen Yuan","Zhihao Wen","Ge Fan","Dayiheng Liu","Dongmei Zhang","Zhixu Li","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.04121v1.pdf","comment":"Accepted to CIKM 2023 (Long Paper)"},{"id":"http://arxiv.org/abs/2407.04118v1","updated":"2024-07-04T18:39:59Z","published":"2024-07-04T18:39:59Z","title":"MAPO: Boosting Large Language Model Performance with Model-Adaptive\n Prompt Optimization","summary":" Prompt engineering, as an efficient and effective way to leverage Large\nLanguage Models (LLM), has drawn a lot of attention from the research\ncommunity. The existing research primarily emphasizes the importance of\nadapting prompts to specific tasks, rather than specific LLMs. However, a good\nprompt is not solely defined by its wording, but also binds to the nature of\nthe LLM in question. In this work, we first quantitatively demonstrate that\ndifferent prompts should be adapted to different LLMs to enhance their\ncapabilities across various downstream tasks in NLP. Then we novelly propose a\nmodel-adaptive prompt optimizer (MAPO) method that optimizes the original\nprompts for each specific LLM in downstream tasks. 
Extensive experiments\nindicate that the proposed method can effectively refine prompts for an LLM,\nleading to significant improvements over various downstream tasks.\n","authors":["Yuyan Chen","Zhihao Wen","Ge Fan","Zhengyu Chen","Wei Wu","Dayiheng Liu","Zhixu Li","Bang Liu","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.04118v1.pdf","comment":"Accepted to EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2402.12354v2","updated":"2024-07-04T18:33:00Z","published":"2024-02-19T18:33:49Z","title":"LoRA+: Efficient Low Rank Adaptation of Large Models","summary":" In this paper, we show that Low Rank Adaptation (LoRA) as originally\nintroduced in Hu et al. (2021) leads to suboptimal finetuning of models with\nlarge width (embedding dimension). This is due to the fact that adapter\nmatrices A and B in LoRA are updated with the same learning rate. Using scaling\narguments for large width networks, we demonstrate that using the same learning\nrate for A and B does not allow efficient feature learning. We then show that\nthis suboptimality of LoRA can be corrected simply by setting different\nlearning rates for the LoRA adapter matrices A and B with a well-chosen ratio.\nWe call this proposed algorithm LoRA$+$. In our extensive experiments, LoRA$+$\nimproves performance (1-2 $\\%$ improvements) and finetuning speed (up to $\\sim$\n2X SpeedUp), at the same computational cost as LoRA.\n","authors":["Soufiane Hayou","Nikhil Ghosh","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2402.12354v2.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2407.04106v1","updated":"2024-07-04T18:21:10Z","published":"2024-07-04T18:21:10Z","title":"MiniGPT-Med: Large Language Model as a General Interface for Radiology\n Diagnosis","summary":" Recent advancements in artificial intelligence (AI) have precipitated\nsignificant breakthroughs in healthcare, particularly in refining diagnostic\nprocedures. However, previous studies have often been constrained to limited\nfunctionalities. This study introduces MiniGPT-Med, a vision-language model\nderived from large-scale language models and tailored for medical applications.\nMiniGPT-Med demonstrates remarkable versatility across various imaging\nmodalities, including X-rays, CT scans, and MRIs, enhancing its utility. The\nmodel is capable of performing tasks such as medical report generation, visual\nquestion answering (VQA), and disease identification within medical imagery.\nIts integrated processing of both image and textual clinical data markedly\nimproves diagnostic accuracy. Our empirical assessments confirm MiniGPT-Med's\nsuperior performance in disease grounding, medical report generation, and VQA\nbenchmarks, representing a significant step towards reducing the gap in\nassisting radiology practice. Furthermore, it achieves state-of-the-art\nperformance on medical report generation, higher than the previous best model\nby 19\\% accuracy. 
MiniGPT-Med promises to become a general interface for\nradiology diagnoses, enhancing diagnostic efficiency across a wide range of\nmedical imaging applications.\n","authors":["Asma Alkhaldi","Raneem Alnajim","Layan Alabdullatef","Rawan Alyahya","Jun Chen","Deyao Zhu","Ahmed Alsinan","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2407.04106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04105v1","updated":"2024-07-04T18:13:38Z","published":"2024-07-04T18:13:38Z","title":"Can Pre-trained Language Models Understand Chinese Humor?","summary":" Humor understanding is an important and challenging research in natural\nlanguage processing. As the popularity of pre-trained language models (PLMs),\nsome recent work makes preliminary attempts to adopt PLMs for humor recognition\nand generation. However, these simple attempts do not substantially answer the\nquestion: {\\em whether PLMs are capable of humor understanding?} This paper is\nthe first work that systematically investigates the humor understanding ability\nof PLMs. For this purpose, a comprehensive framework with three evaluation\nsteps and four evaluation tasks is designed. We also construct a comprehensive\nChinese humor dataset, which can fully meet all the data requirements of the\nproposed evaluation framework. Our empirical study on the Chinese humor dataset\nyields some valuable observations, which are of great guiding value for future\noptimization of PLMs in humor understanding and generation.\n","authors":["Yuyan Chen","Zhixu Li","Jiaqing Liang","Yanghua Xiao","Bang Liu","Yunwen Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04105v1.pdf","comment":"Accepted to WSDM 2022"},{"id":"http://arxiv.org/abs/2401.17169v2","updated":"2024-07-04T18:12:25Z","published":"2024-01-30T16:56:54Z","title":"Conditional and Modal Reasoning in Large Language Models","summary":" The reasoning abilities of large language models (LLMs) are the topic of a\ngrowing body of research in AI and cognitive science. In this paper, we probe\nthe extent to which twenty-five LLMs are able to distinguish logically correct\ninferences from logically fallacious ones. We focus on inference patterns\ninvolving conditionals (e.g., 'If Ann has a queen, then Bob has a jack') and\nepistemic modals (e.g., 'Ann might have an ace', 'Bob must have a king'). These\ninferences have been of special interest to logicians, philosophers, and\nlinguists, since they play a central role in the fundamental human ability to\nreason about distal possibilities. Assessing LLMs on these inferences is thus\nhighly relevant to the question of how much the reasoning abilities of LLMs\nmatch those of humans. Among the LLMs we tested, all but the GPT-4 model family\noften make basic mistakes with conditionals, though zero-shot chain-of-thought\nprompting helps them make fewer mistakes. Moreover, even the GPT-4 family\ndisplays logically inconsistent judgments across inference patterns involving\nepistemic modals, and almost all models give answers to certain complex\nconditional inferences widely discussed in the literature that do not match\nhuman judgments. These results highlight gaps in basic logical reasoning in\ntoday's LLMs.\n","authors":["Wesley H. Holliday","Matthew Mandelkern","Cedegao E. 
Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17169v2.pdf","comment":"Updated version with results from 25 LLMs, additional few-shot and\n chain-of-thought prompts, additional inference patterns, and correlations\n with other benchmarks"},{"id":"http://arxiv.org/abs/2407.04093v1","updated":"2024-07-04T17:59:41Z","published":"2024-07-04T17:59:41Z","title":"Stephanie: Step-by-Step Dialogues for Mimicking Human Interactions in\n Social Conversations","summary":" In the rapidly evolving field of natural language processing, dialogue\nsystems primarily employ a single-step dialogue paradigm. Although this\nparadigm is efficient, it lacks the depth and fluidity of human interactions\nand does not appear natural. We introduce a novel \\textbf{Step}-by-Step\nDialogue Paradigm (Stephanie), designed to mimic the ongoing dynamic nature of\nhuman conversations. By employing a dual learning strategy and a further-split\npost-editing method, we generated and utilized a high-quality step-by-step\ndialogue dataset to fine-tune existing large language models, enabling them to\nperform step-by-step dialogues. We thoroughly present Stephanie. Tailored\nautomatic and human evaluations are conducted to assess its effectiveness\ncompared to the traditional single-step dialogue paradigm. We will release\ncode, Stephanie datasets, and Stephanie LLMs to facilitate the future of\nchatbot eras.\n","authors":["Hao Yang","Hongyuan Lu","Xinhua Zeng","Yang Liu","Xiang Zhang","Haoran Yang","Yumeng Zhang","Yiran Wei","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2407.04093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04079v1","updated":"2024-07-04T17:41:32Z","published":"2024-07-04T17:41:32Z","title":"AXOLOTL'24 Shared Task on Multilingual Explainable Semantic Change\n Modeling","summary":" This paper describes the organization and findings of AXOLOTL'24, the first\nmultilingual explainable semantic change modeling shared task. We present new\nsense-annotated diachronic semantic change datasets for Finnish and Russian\nwhich were employed in the shared task, along with a surprise test-only German\ndataset borrowed from an existing source. The setup of AXOLOTL'24 is new to the\nsemantic change modeling field, and involves subtasks of identifying unknown\n(novel) senses and providing dictionary-like definitions to these senses. The\nmethods of the winning teams are described and compared, thus paving a path\ntowards explainability in computational approaches to historical change of\nmeaning.\n","authors":["Mariia Fedorova","Timothee Mickus","Niko Partanen","Janine Siewert","Elena Spaziani","Andrey Kutuzov"],"pdf_url":"https://arxiv.org/pdf/2407.04079v1.pdf","comment":"Proceedings of the 5th Workshop on Computational Approaches to\n Historical Language Change (ACL'24)"},{"id":"http://arxiv.org/abs/2407.04078v1","updated":"2024-07-04T17:39:16Z","published":"2024-07-04T17:39:16Z","title":"DotaMath: Decomposition of Thought with Code Assistance and\n Self-correction for Mathematical Reasoning","summary":" Large language models (LLMs) have made impressive progress in handling simple\nmath problems, yet they still struggle with more challenging and complex\nmathematical tasks. In this paper, we introduce a series of LLMs that employs\nthe Decomposition of thought with code assistance and self-correction for\nmathematical reasoning, dubbed as DotaMath. 
DotaMath models tackle complex\nmathematical tasks by decomposing them into simpler logical subtasks,\nleveraging code to solve these subtasks, obtaining fine-grained feedback from\nthe code interpreter, and engaging in self-reflection and correction. By\nannotating diverse interactive tool-use trajectories and employing query\nevolution on GSM8K and MATH datasets, we generate an instruction fine-tuning\ndataset called DotaMathQA with 574K query-response pairs. We train a series of\nbase LLMs using imitation learning on DotaMathQA, resulting in DotaMath models\nthat achieve remarkable performance compared to open-source LLMs across various\nin-domain and out-of-domain benchmarks. Notably, DotaMath-deepseek-7B showcases\nan outstanding performance of 64.8% on the competitive MATH dataset and 86.7%\non GSM8K. Besides, DotaMath-deepseek-7B maintains strong competitiveness on a\nseries of in-domain and out-of-domain benchmarks (Avg. 80.1%). Looking forward,\nwe anticipate that the DotaMath paradigm will open new pathways for addressing\nintricate mathematical problems. Our code is publicly available at\nhttps://github.com/ChengpengLi1003/DotaMath.\n","authors":["Chengpeng Li","Guanting Dong","Mingfeng Xue","Ru Peng","Xiang Wang","Dayiheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.04078v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2403.00791v2","updated":"2024-07-04T17:21:48Z","published":"2024-02-22T20:11:24Z","title":"L+M-24: Building a Dataset for Language + Molecules @ ACL 2024","summary":" Language-molecule models have emerged as an exciting direction for molecular\ndiscovery and understanding. However, training these models is challenging due\nto the scarcity of molecule-language pair datasets. At this point, datasets\nhave been released which are 1) small and scraped from existing databases, 2)\nlarge but noisy and constructed by performing entity linking on the scientific\nliterature, and 3) built by converting property prediction datasets to natural\nlanguage using templates. In this document, we detail the $\\textit{L+M-24}$\ndataset, which has been created for the Language + Molecules Workshop shared\ntask at ACL 2024. In particular, $\\textit{L+M-24}$ is designed to focus on\nthree key benefits of natural language in molecule design: compositionality,\nfunctionality, and abstraction.\n","authors":["Carl Edwards","Qingyun Wang","Lawrence Zhao","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.00791v2.pdf","comment":"The dataset, finetuned baselines, and evaluation code are released\n publicly at https://github.com/language-plus-molecules/LPM-24-Dataset through\n https://huggingface.co/language-plus-molecules"},{"id":"http://arxiv.org/abs/2407.04069v1","updated":"2024-07-04T17:15:37Z","published":"2024-07-04T17:15:37Z","title":"A Systematic Survey and Critical Review on Evaluating Large Language\n Models: Challenges, Limitations, and Recommendations","summary":" Large Language Models (LLMs) have recently gained significant attention due\nto their remarkable capabilities in performing diverse tasks across various\ndomains. However, a thorough evaluation of these models is crucial before\ndeploying them in real-world applications to ensure they produce reliable\nperformance. Despite the well-established importance of evaluating LLMs in the\ncommunity, the complexity of the evaluation process has led to varied\nevaluation setups, causing inconsistencies in findings and interpretations. 
To\naddress this, we systematically review the primary challenges and limitations\ncausing these inconsistencies and unreliable evaluations in various steps of\nLLM evaluation. Based on our critical review, we present our perspectives and\nrecommendations to ensure LLM evaluations are reproducible, reliable, and\nrobust.\n","authors":["Md Tahmid Rahman Laskar","Sawsan Alqahtani","M Saiful Bari","Mizanur Rahman","Mohammad Abdullah Matin Khan","Haidar Khan","Israt Jahan","Amran Bhuiyan","Chee Wei Tan","Md Rizwan Parvez","Enamul Hoque","Shafiq Joty","Jimmy Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13481v2","updated":"2024-07-04T17:14:24Z","published":"2024-01-24T14:29:39Z","title":"How AI Ideas Affect the Creativity, Diversity, and Evolution of Human\n Ideas: Evidence From a Large, Dynamic Experiment","summary":" Exposure to large language model output is rapidly increasing. How will\nseeing AI-generated ideas affect human ideas? We conducted an experiment (800+\nparticipants, 40+ countries) where participants viewed creative ideas that were\nfrom ChatGPT or prior experimental participants and then brainstormed their own\nidea. We varied the number of AI-generated examples (none, low, or high\nexposure) and if the examples were labeled as 'AI' (disclosure). Our dynamic\nexperiment design -- ideas from prior participants in an experimental condition\nare used as stimuli for future participants in the same experimental condition\n-- speaks to the interdependent process of cultural creation: creative ideas\nare built upon prior ideas. Hence, we capture the compounding effects of having\nLLMs 'in the culture loop'. We find that high AI exposure (but not low AI\nexposure) did not affect the creativity of individual ideas but did increase\nthe average amount and rate of change of collective idea diversity. AI made\nideas different, not better. There were no main effects of disclosure. We also\nfound that self-reported creative people were less influenced by knowing an\nidea was from AI and that participants may knowingly adopt AI ideas when the\ntask is difficult. Our findings suggest that introducing AI ideas may increase\ncollective diversity but not individual creativity.\n","authors":["Joshua Ashkinaze","Julia Mendelsohn","Li Qiwei","Ceren Budak","Eric Gilbert"],"pdf_url":"https://arxiv.org/pdf/2401.13481v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04067v1","updated":"2024-07-04T17:13:38Z","published":"2024-07-04T17:13:38Z","title":"Semantic Graphs for Syntactic Simplification: A Revisit from the Age of\n LLM","summary":" Symbolic sentence meaning representations, such as AMR (Abstract Meaning\nRepresentation) provide expressive and structured semantic graphs that act as\nintermediates that simplify downstream NLP tasks. However, the\ninstruction-following capability of large language models (LLMs) offers a\nshortcut to effectively solve NLP tasks, questioning the utility of semantic\ngraphs. Meanwhile, recent work has also shown the difficulty of using meaning\nrepresentations merely as a helpful auxiliary for LLMs. We revisit the position\nof semantic graphs in syntactic simplification, the task of simplifying\nsentence structures while preserving their meaning, which requires semantic\nunderstanding, and evaluate it on a new complex and natural dataset. 
The\nAMR-based method that we propose, AMRS$^3$, demonstrates that state-of-the-art\nmeaning representations can lead to easy-to-implement simplification methods\nwith competitive performance and unique advantages in cost, interpretability,\nand generalization. With AMRS$^3$ as an anchor, we discover that syntactic\nsimplification is a task where semantic graphs are helpful in LLM prompting. We\npropose AMRCoC prompting that guides LLMs to emulate graph algorithms for\nexplicit symbolic reasoning on AMR graphs, and show its potential for improving\nLLM on semantic-centered tasks like syntactic simplification.\n","authors":["Peiran Yao","Kostyantyn Guzhva","Denilson Barbosa"],"pdf_url":"https://arxiv.org/pdf/2407.04067v1.pdf","comment":"Accepted at TextGraphs-17 @ ACL 2024"},{"id":"http://arxiv.org/abs/2407.03236v2","updated":"2024-07-04T17:06:33Z","published":"2024-07-03T16:05:20Z","title":"CATT: Character-based Arabic Tashkeel Transformer","summary":" Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the\ncomprehension of Arabic text by removing ambiguity and minimizing the risk of\nmisinterpretations caused by its absence. It plays a crucial role in improving\nArabic text processing, particularly in applications such as text-to-speech and\nmachine translation. This paper introduces a new approach to training ATD\nmodels. First, we finetuned two transformers, encoder-only and encoder-decoder,\nthat were initialized from a pretrained character-based BERT. Then, we applied\nthe Noisy-Student approach to boost the performance of the best model. We\nevaluated our models alongside 11 commercial and open-source models using two\nmanually labeled benchmark datasets: WikiNews and our CATT dataset. Our\nfindings show that our top model surpasses all evaluated models by relative\nDiacritic Error Rates (DERs) of 30.83\\% and 35.21\\% on WikiNews and CATT,\nrespectively, achieving state-of-the-art in ATD. In addition, we show that our\nmodel outperforms GPT-4-turbo on CATT dataset by a relative DER of 9.36\\%. We\nopen-source our CATT models and benchmark dataset for the research\ncommunity\\footnote{https://github.com/abjadai/catt}.\n","authors":["Faris Alasmary","Orjuwan Zaafarani","Ahmad Ghannam"],"pdf_url":"https://arxiv.org/pdf/2407.03236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04050v1","updated":"2024-07-04T16:48:14Z","published":"2024-07-04T16:48:14Z","title":"Deep Content Understanding Toward Entity and Aspect Target Sentiment\n Analysis on Foundation Models","summary":" Introducing Entity-Aspect Sentiment Triplet Extraction (EASTE), a novel\nAspect-Based Sentiment Analysis (ABSA) task which extends\nTarget-Aspect-Sentiment Detection (TASD) by separating aspect categories (e.g.,\nfood#quality) into pre-defined entities (e.g., meal, drink) and aspects (e.g.,\ntaste, freshness) which add a fine-gainer level of complexity, yet help\nexposing true sentiment of chained aspect to its entity. We explore the task of\nEASTE solving capabilities of language models based on transformers\narchitecture from our proposed unified-loss approach via token classification\ntask using BERT architecture to text generative models such as Flan-T5,\nFlan-Ul2 to Llama2, Llama3 and Mixtral employing different alignment techniques\nsuch as zero/few-shot learning, Parameter Efficient Fine Tuning (PEFT) such as\nLow-Rank Adaptation (LoRA). The model performances are evaluated on the\nSamEval-2016 benchmark dataset representing the fair comparison to existing\nworks. 
Our research not only aims to achieve high performance on the EASTE task\nbut also investigates the impact of model size, type, and adaptation techniques\non task performance. Ultimately, we provide detailed insights and achieving\nstate-of-the-art results in complex sentiment analysis.\n","authors":["Vorakit Vorakitphan","Milos Basic","Guilhaume Leroy Meline"],"pdf_url":"https://arxiv.org/pdf/2407.04050v1.pdf","comment":"Proceedings of the 41 st International Conference on Machine\n Learning, Vienna, Austria. PMLR 235, 2024. Copyright 2024 by the author(s)"},{"id":"http://arxiv.org/abs/2407.04047v1","updated":"2024-07-04T16:42:24Z","published":"2024-07-04T16:42:24Z","title":"Improving Accented Speech Recognition using Data Augmentation based on\n Unsupervised Text-to-Speech Synthesis","summary":" This paper investigates the use of unsupervised text-to-speech synthesis\n(TTS) as a data augmentation method to improve accented speech recognition. TTS\nsystems are trained with a small amount of accented speech training data and\ntheir pseudo-labels rather than manual transcriptions, and hence unsupervised.\nThis approach enables the use of accented speech data without manual\ntranscriptions to perform data augmentation for accented speech recognition.\nSynthetic accented speech data, generated from text prompts by using the TTS\nsystems, are then combined with available non-accented speech data to train\nautomatic speech recognition (ASR) systems. ASR experiments are performed in a\nself-supervised learning framework using a Wav2vec2.0 model which was\npre-trained on large amount of unsupervised accented speech data. The accented\nspeech data for training the unsupervised TTS are read speech, selected from\nL2-ARCTIC and British Isles corpora, while spontaneous conversational speech\nfrom the Edinburgh international accents of English corpus are used as the\nevaluation data. Experimental results show that Wav2vec2.0 models which are\nfine-tuned to downstream ASR task with synthetic accented speech data,\ngenerated by the unsupervised TTS, yield up to 6.1% relative word error rate\nreductions compared to a Wav2vec2.0 baseline which is fine-tuned with the\nnon-accented speech data from Librispeech corpus.\n","authors":["Cong-Thanh Do","Shuhei Imai","Rama Doddipatla","Thomas Hain"],"pdf_url":"https://arxiv.org/pdf/2407.04047v1.pdf","comment":"Accepted to EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2407.04046v1","updated":"2024-07-04T16:41:08Z","published":"2024-07-04T16:41:08Z","title":"Systematic Task Exploration with LLMs: A Study in Citation Text\n Generation","summary":" Large language models (LLMs) bring unprecedented flexibility in defining and\nexecuting complex, creative natural language generation (NLG) tasks. Yet, this\nflexibility brings new challenges, as it introduces new degrees of freedom in\nformulating the task inputs and instructions and in evaluating model\nperformance. To facilitate the exploration of creative NLG tasks, we propose a\nthree-component research framework that consists of systematic input\nmanipulation, reference data, and output measurement. We use this framework to\nexplore citation text generation -- a popular scholarly NLP task that lacks\nconsensus on the task definition and evaluation metric and has not yet been\ntackled within the LLM paradigm. 
Our results highlight the importance of\nsystematically investigating both task instruction and input configuration when\nprompting LLMs, and reveal non-trivial relationships between different\nevaluation metrics used for citation text generation. Additional human\ngeneration and human evaluation experiments provide new qualitative insights\ninto the task to guide future research in citation text generation. We make our\ncode and data publicly available.\n","authors":["Furkan Şahinuç","Ilia Kuznetsov","Yufang Hou","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2407.04046v1.pdf","comment":"Accepted to ACL 2024 (Main)"},{"id":"http://arxiv.org/abs/2406.00656v2","updated":"2024-07-04T16:23:35Z","published":"2024-06-02T07:57:45Z","title":"Presence or Absence: Are Unknown Word Usages in Dictionaries?","summary":" There has been a surge of interest in computational modeling of semantic\nchange. The foci of previous works are on detecting and interpreting word\nsenses gained over time; however, it remains unclear whether the gained senses\nare covered by dictionaries. In this work, we aim to fill this research gap by\ncomparing detected word senses with dictionary sense inventories in order to\nbridge between the communities of lexical semantic change detection and\nlexicography. We evaluate our system in the AXOLOTL-24 shared task for Finnish,\nRussian and German languages \\cite{fedorova-etal-2024-axolotl}. Our system is\nfully unsupervised. It leverages a graph-based clustering approach to predict\nmappings between unknown word usages and dictionary entries for Subtask 1, and\ngenerates dictionary-like definitions for those novel word usages through the\nstate-of-the-art Large Language Models such as GPT-4 and LLaMA-3 for Subtask 2.\nIn Subtask 1, our system outperforms the baseline system by a large margin, and\nit offers interpretability for the mapping results by distinguishing between\nmatched and unmatched (novel) word usages through our graph-based clustering\napproach. Our system ranks first in Finnish and German, and ranks second in\nRussian on the Subtask 2 test-phase leaderboard. These results show the\npotential of our system in managing dictionary entries, particularly for\nupdating dictionaries to include novel sense entries. Our code and data are\nmade publicly\navailable\\footnote{\\url{https://github.com/xiaohemaikoo/axolotl24-ABDN-NLP}}.\n","authors":["Xianghe Ma","Dominik Schlechtweg","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00656v2.pdf","comment":"LChange24 Camera Ready"},{"id":"http://arxiv.org/abs/2407.04020v1","updated":"2024-07-04T15:55:13Z","published":"2024-07-04T15:55:13Z","title":"LLMAEL: Large Language Models are Good Context Augmenters for Entity\n Linking","summary":" Entity Linking (EL) models are well-trained at mapping mentions to their\ncorresponding entities according to a given context. However, EL models\nstruggle to disambiguate long-tail entities due to their limited training data.\nMeanwhile, large language models (LLMs) are more robust at interpreting\nuncommon mentions. Yet, due to a lack of specialized training, LLMs suffer at\ngenerating correct entity IDs. Furthermore, training an LLM to perform EL is\ncost-intensive. Building upon these insights, we introduce LLM-Augmented Entity\nLinking LLMAEL, a plug-and-play approach to enhance entity linking through LLM\ndata augmentation. 
We leverage LLMs as knowledgeable context augmenters,\ngenerating mention-centered descriptions as additional input, while preserving\ntraditional EL models for task specific processing. Experiments on 6 standard\ndatasets show that the vanilla LLMAEL outperforms baseline EL models in most\ncases, while the fine-tuned LLMAEL set the new state-of-the-art results across\nall 6 benchmarks.\n","authors":["Amy Xin","Yunjia Qi","Zijun Yao","Fangwei Zhu","Kaisheng Zeng","Xu Bin","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04010v1","updated":"2024-07-04T15:38:38Z","published":"2024-07-04T15:38:38Z","title":"Exploring Diachronic and Diatopic Changes in Dialect Continua: Tasks,\n Datasets and Challenges","summary":" Everlasting contact between language communities leads to constant changes in\nlanguages over time, and gives rise to language varieties and dialects.\nHowever, the communities speaking non-standard language are often overlooked by\nnon-inclusive NLP technologies. Recently, there has been a surge of interest in\nstudying diatopic and diachronic changes in dialect NLP, but there is currently\nno research exploring the intersection of both. Our work aims to fill this gap\nby systematically reviewing diachronic and diatopic papers from a unified\nperspective. In this work, we critically assess nine tasks and datasets across\nfive dialects from three language families (Slavic, Romance, and Germanic) in\nboth spoken and written modalities. The tasks covered are diverse, including\ncorpus construction, dialect distance estimation, and dialect geolocation\nprediction, among others. Moreover, we outline five open challenges regarding\nchanges in dialect use over time, the reliability of dialect datasets, the\nimportance of speaker characteristics, limited coverage of dialects, and\nethical considerations in data collection. We hope that our work sheds light on\nfuture research towards inclusive computational methods and datasets for\nlanguage varieties and dialects.\n","authors":["Melis Çelikkol","Lydia Körber","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.04010v1.pdf","comment":"LChange24 Camera Ready"},{"id":"http://arxiv.org/abs/2407.03994v1","updated":"2024-07-04T15:14:17Z","published":"2024-07-04T15:14:17Z","title":"Unlocking the Potential of Model Merging for Low-Resource Languages","summary":" Adapting large language models (LLMs) to new languages typically involves\ncontinual pre-training (CT) followed by supervised fine-tuning (SFT). However,\nthis CT-then-SFT approach struggles with limited data in the context of\nlow-resource languages, failing to balance language modeling and task-solving\ncapabilities. We thus propose model merging as an alternative for low-resource\nlanguages, combining models with distinct capabilities into a single model\nwithout additional training. We use model merging to develop task-solving LLMs\nfor low-resource languages without SFT data in the target languages. Our\nexperiments based on Llama-2-7B demonstrate that model merging effectively\nendows LLMs for low-resource languages with task-solving abilities,\noutperforming CT-then-SFT in scenarios with extremely scarce data. Observing\nperformance saturation in model merging with more training tokens, we further\nanalyze the merging process and introduce a slack variable to the model merging\nalgorithm to mitigate the loss of important parameters, thereby enhancing\nperformance. 
We hope that model merging can benefit more human languages\nsuffering from data scarcity with its higher data efficiency.\n","authors":["Mingxu Tao","Chen Zhang","Quzhe Huang","Tianyao Ma","Songfang Huang","Dongyan Zhao","Yansong Feng"],"pdf_url":"https://arxiv.org/pdf/2407.03994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03993v1","updated":"2024-07-04T15:13:59Z","published":"2024-07-04T15:13:59Z","title":"A Survey on Natural Language Counterfactual Generation","summary":" Natural Language Counterfactual generation aims to minimally modify a given\ntext such that the modified text will be classified into a different class. The\ngenerated counterfactuals provide insight into the reasoning behind a model's\npredictions by highlighting which words significantly influence the outcomes.\nAdditionally, they can be used to detect model fairness issues or augment the\ntraining data to enhance the model's robustness. A substantial amount of\nresearch has been conducted to generate counterfactuals for various NLP tasks,\nemploying different models and methodologies. With the rapid growth of studies\nin this field, a systematic review is crucial to guide future researchers and\ndevelopers. To bridge this gap, this survey comprehensively overview textual\ncounterfactual generation methods, particularly including those based on Large\nLanguage Models. We propose a new taxonomy that categorizes the generation\nmethods into four groups and systematically summarize the metrics for\nevaluating the generation quality. Finally, we discuss ongoing research\nchallenges and outline promising directions for future work.\n","authors":["Yongjie Wang","Xiaoqi Qiu","Yu Yue","Xu Guo","Zhiwei Zeng","Yuhong Feng","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2407.03993v1.pdf","comment":"A survey paper"},{"id":"http://arxiv.org/abs/2407.02751v2","updated":"2024-07-04T15:13:24Z","published":"2024-07-03T01:56:00Z","title":"Emotion and Intent Joint Understanding in Multimodal Conversation: A\n Benchmarking Dataset","summary":" Emotion and Intent Joint Understanding in Multimodal Conversation (MC-EIU)\naims to decode the semantic information manifested in a multimodal\nconversational history, while inferring the emotions and intents simultaneously\nfor the current utterance. MC-EIU is enabling technology for many\nhuman-computer interfaces. However, there is a lack of available datasets in\nterms of annotation, modality, language diversity, and accessibility. In this\nwork, we propose an MC-EIU dataset, which features 7 emotion categories, 9\nintent categories, 3 modalities, i.e., textual, acoustic, and visual content,\nand two languages, i.e., English and Mandarin. Furthermore, it is completely\nopen-source for free access. To our knowledge, MC-EIU is the first\ncomprehensive and rich emotion and intent joint understanding dataset for\nmultimodal conversation. Together with the release of the dataset, we also\ndevelop an Emotion and Intent Interaction (EI$^2$) network as a reference\nsystem by modeling the deep correlation between emotion and intent in the\nmultimodal conversation. With comparative experiments and ablation studies, we\ndemonstrate the effectiveness of the proposed EI$^2$ method on the MC-EIU\ndataset. The dataset and codes will be made available at:\nhttps://github.com/MC-EIU/MC-EIU.\n","authors":["Rui Liu","Haolin Zuo","Zheng Lian","Xiaofen Xing","Björn W. 
Schuller","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2407.02751v2.pdf","comment":"26 pages, 8 figures, 12 tables, NeurIPS 2024 Dataset and Benchmark\n Track"},{"id":"http://arxiv.org/abs/2407.03978v1","updated":"2024-07-04T14:50:45Z","published":"2024-07-04T14:50:45Z","title":"Benchmarking Complex Instruction-Following with Multiple Constraints\n Composition","summary":" Instruction following is one of the fundamental capabilities of large\nlanguage models (LLMs). As the ability of LLMs is constantly improving, they\nhave been increasingly applied to deal with complex human instructions in\nreal-world scenarios. Therefore, how to evaluate the ability of complex\ninstruction-following of LLMs has become a critical research problem. Existing\nbenchmarks mainly focus on modeling different types of constraints in human\ninstructions while neglecting the composition of different constraints, which\nis an indispensable constituent in complex instructions. To this end, we\npropose ComplexBench, a benchmark for comprehensively evaluating the ability of\nLLMs to follow complex instructions composed of multiple constraints. We\npropose a hierarchical taxonomy for complex instructions, including 4\nconstraint types, 19 constraint dimensions, and 4 composition types, and\nmanually collect a high-quality dataset accordingly. To make the evaluation\nreliable, we augment LLM-based evaluators with rules to effectively verify\nwhether generated texts can satisfy each constraint and composition.\nFurthermore, we obtain the final evaluation score based on the dependency\nstructure determined by different composition types. ComplexBench identifies\nsignificant deficiencies in existing LLMs when dealing with complex\ninstructions with multiple constraints composition.\n","authors":["Bosi Wen","Pei Ke","Xiaotao Gu","Lindong Wu","Hao Huang","Jinfeng Zhou","Wenchuang Li","Binxin Hu","Wendy Gao","Jiaxin Xu","Yiming Liu","Jie Tang","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2407.03978v1.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.03974v1","updated":"2024-07-04T14:49:46Z","published":"2024-07-04T14:49:46Z","title":"LLM Roleplay: Simulating Human-Chatbot Interaction","summary":" The development of chatbots requires collecting a large number of\nhuman-chatbot dialogues to reflect the breadth of users' sociodemographic\nbackgrounds and conversational goals. However, the resource requirements to\nconduct the respective user studies can be prohibitively high and often only\nallow for a narrow analysis of specific dialogue goals and participant\ndemographics. In this paper, we propose LLM-Roleplay: a goal-oriented,\npersona-based method to automatically generate diverse multi-turn dialogues\nsimulating human-chatbot interaction. LLM-Roleplay can be applied to generate\ndialogues with any type of chatbot and uses large language models (LLMs) to\nplay the role of textually described personas. To validate our method we\ncollect natural human-chatbot dialogues from different sociodemographic groups\nand conduct a human evaluation to compare real human-chatbot dialogues with our\ngenerated dialogues. 
We compare the abilities of state-of-the-art LLMs in\nembodying personas and holding a conversation and find that our method can\nsimulate human-chatbot dialogues with a high indistinguishability rate.\n","authors":["Hovhannes Tamoyan","Hendrik Schuff","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2407.03974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03967v1","updated":"2024-07-04T14:36:49Z","published":"2024-07-04T14:36:49Z","title":"Investigating the Role of Instruction Variety and Task Difficulty in\n Robotic Manipulation Tasks","summary":" Evaluating the generalisation capabilities of multimodal models based solely\non their performance on out-of-distribution data fails to capture their true\nrobustness. This work introduces a comprehensive evaluation framework that\nsystematically examines the role of instructions and inputs in the\ngeneralisation abilities of such models, considering architectural design,\ninput perturbations across language and vision modalities, and increased task\ncomplexity. The proposed framework uncovers the resilience of multimodal models\nto extreme instruction perturbations and their vulnerability to observational\nchanges, raising concerns about overfitting to spurious correlations. By\nemploying this evaluation framework on current Transformer-based multimodal\nmodels for robotic manipulation tasks, we uncover limitations and suggest\nfuture advancements should focus on architectural and training innovations that\nbetter integrate multimodal inputs, enhancing a model's generalisation prowess\nby prioritising sensitivity to input content over incidental correlations.\n","authors":["Amit Parekh","Nikolas Vitsakis","Alessandro Suglia","Ioannis Konstas"],"pdf_url":"https://arxiv.org/pdf/2407.03967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03964v1","updated":"2024-07-04T14:33:47Z","published":"2024-07-04T14:33:47Z","title":"Improving Sample Efficiency of Reinforcement Learning with Background\n Knowledge from Large Language Models","summary":" Low sample efficiency is an enduring challenge of reinforcement learning\n(RL). With the advent of versatile large language models (LLMs), recent works\nimpart common-sense knowledge to accelerate policy learning for RL processes.\nHowever, we note that such guidance is often tailored for one specific task but\nloses generalizability. In this paper, we introduce a framework that harnesses\nLLMs to extract background knowledge of an environment, which contains general\nunderstandings of the entire environment, making various downstream RL tasks\nbenefit from one-time knowledge representation. We ground LLMs by feeding a few\npre-collected experiences and requesting them to delineate background knowledge\nof the environment. Afterward, we represent the output knowledge as potential\nfunctions for potential-based reward shaping, which has a good property for\nmaintaining policy optimality from task rewards. We instantiate three variants\nto prompt LLMs for background knowledge, including writing code, annotating\npreferences, and assigning goals. 
Our experiments show that these methods\nachieve significant sample efficiency improvements in a spectrum of downstream\ntasks from Minigrid and Crafter domains.\n","authors":["Fuxiang Zhang","Junyou Li","Yi-Chen Li","Zongzhang Zhang","Yang Yu","Deheng Ye"],"pdf_url":"https://arxiv.org/pdf/2407.03964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03963v1","updated":"2024-07-04T14:33:03Z","published":"2024-07-04T14:33:03Z","title":"LLM-jp: A Cross-organizational Project for the Research and Development\n of Fully Open Japanese LLMs","summary":" This paper introduces LLM-jp, a cross-organizational project for the research\nand development of Japanese large language models (LLMs). LLM-jp aims to\ndevelop open-source and strong Japanese LLMs, and as of this writing, more than\n1,500 participants from academia and industry are working together for this\npurpose. This paper presents the background of the establishment of LLM-jp,\nsummaries of its activities, and technical reports on the LLMs developed by\nLLM-jp. For the latest activities, visit https://llm-jp.nii.ac.jp/en/.\n","authors":[" LLM-jp"," :","Akiko Aizawa","Eiji Aramaki","Bowen Chen","Fei Cheng","Hiroyuki Deguchi","Rintaro Enomoto","Kazuki Fujii","Kensuke Fukumoto","Takuya Fukushima","Namgi Han","Yuto Harada","Chikara Hashimoto","Tatsuya Hiraoka","Shohei Hisada","Sosuke Hosokawa","Lu Jie","Keisuke Kamata","Teruhito Kanazawa","Hiroki Kanezashi","Hiroshi Kataoka","Satoru Katsumata","Daisuke Kawahara","Seiya Kawano","Atsushi Keyaki","Keisuke Kiryu","Hirokazu Kiyomaru","Takashi Kodama","Takahiro Kubo","Yohei Kuga","Ryoma Kumon","Shuhei Kurita","Sadao Kurohashi","Conglong Li","Taiki Maekawa","Hiroshi Matsuda","Yusuke Miyao","Kentaro Mizuki","Sakae Mizuki","Yugo Murawaki","Ryo Nakamura","Taishi Nakamura","Kouta Nakayama","Tomoka Nakazato","Takuro Niitsuma","Jiro Nishitoba","Yusuke Oda","Hayato Ogawa","Takumi Okamoto","Naoaki Okazaki","Yohei Oseki","Shintaro Ozaki","Koki Ryu","Rafal Rzepka","Keisuke Sakaguchi","Shota Sasaki","Satoshi Sekine","Kohei Suda","Saku Sugawara","Issa Sugiura","Hiroaki Sugiyama","Hisami Suzuki","Jun Suzuki","Toyotaro Suzumura","Kensuke Tachibana","Yu Takagi","Kyosuke Takami","Koichi Takeda","Masashi Takeshita","Masahiro Tanaka","Kenjiro Taura","Arseny Tolmachev","Nobuhiro Ueda","Zhen Wan","Shuntaro Yada","Sakiko Yahata","Yuya Yamamoto","Yusuke Yamauchi","Hitomi Yanaka","Rio Yokota","Koichiro Yoshino"],"pdf_url":"https://arxiv.org/pdf/2407.03963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03958v1","updated":"2024-07-04T14:26:49Z","published":"2024-07-04T14:26:49Z","title":"Stark: Social Long-Term Multi-Modal Conversation with Persona\n Commonsense Knowledge","summary":" Humans share a wide variety of images related to their personal experiences\nwithin conversations via instant messaging tools. However, existing works focus\non (1) image-sharing behavior in singular sessions, leading to limited\nlong-term social interaction, and (2) a lack of personalized image-sharing\nbehavior. In this work, we introduce Stark, a large-scale long-term multi-modal\nconversation dataset that covers a wide range of social personas in a\nmulti-modality format, time intervals, and images. To construct Stark\nautomatically, we propose a novel multi-modal contextualization framework, Mcu,\nthat generates long-term multi-modal dialogue distilled from ChatGPT and our\nproposed Plan-and-Execute image aligner. 
Using our Stark, we train a\nmulti-modal conversation model, Ultron 7B, which demonstrates impressive visual\nimagination ability. Furthermore, we demonstrate the effectiveness of our\ndataset in human evaluation. We make our source code and dataset publicly\navailable.\n","authors":["Young-Jun Lee","Dokyong Lee","Junyoung Youn","Kyeongjin Oh","Byungsoo Ko","Jonghwan Hyeon","Ho-Jin Choi"],"pdf_url":"https://arxiv.org/pdf/2407.03958v1.pdf","comment":"Project website: https://stark-dataset.github.io"},{"id":"http://arxiv.org/abs/2407.03956v1","updated":"2024-07-04T14:22:25Z","published":"2024-07-04T14:22:25Z","title":"Solving Zebra Puzzles Using Constraint-Guided Multi-Agent Systems","summary":" Prior research has enhanced the ability of Large Language Models (LLMs) to\nsolve logic puzzles using techniques such as chain-of-thought prompting or\nintroducing a symbolic representation. These frameworks are still usually\ninsufficient to solve complicated logical problems, such as Zebra puzzles, due\nto the inherent complexity of translating natural language clues into logical\nstatements. We introduce a multi-agent system, ZPS, that integrates LLMs with\nan off the shelf theorem prover. This system tackles the complex puzzle-solving\ntask by breaking down the problem into smaller, manageable parts, generating\nSMT (Satisfiability Modulo Theories) code to solve them with a theorem prover,\nand using feedback between the agents to repeatedly improve their answers. We\nalso introduce an automated grid puzzle grader to assess the correctness of our\npuzzle solutions and show that the automated grader is reliable by evaluating\nit in a user-study. Our approach shows improvement in all three LLMs we tested,\nwith GPT-4 showing 166% improvement in the number of fully correct solutions.\n","authors":["Shmuel Berman","Baishakhi Ray","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2407.03956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01549v2","updated":"2024-07-04T14:21:39Z","published":"2024-06-03T17:31:06Z","title":"An Information Bottleneck Perspective for Effective Noise Filtering on\n Retrieval-Augmented Generation","summary":" Retrieval-augmented generation integrates the capabilities of large language\nmodels with relevant information retrieved from an extensive corpus, yet\nencounters challenges when confronted with real-world noisy data. One recent\nsolution is to train a filter module to find relevant content but only achieve\nsuboptimal noise compression. In this paper, we propose to introduce the\ninformation bottleneck theory into retrieval-augmented generation. Our approach\ninvolves the filtration of noise by simultaneously maximizing the mutual\ninformation between compression and ground output, while minimizing the mutual\ninformation between compression and retrieved passage. In addition, we derive\nthe formula of information bottleneck to facilitate its application in novel\ncomprehensive evaluations, the selection of supervised fine-tuning data, and\nthe construction of reinforcement learning rewards. 
Experimental results\ndemonstrate that our approach achieves significant improvements across various\nquestion answering datasets, not only in terms of the correctness of answer\ngeneration but also in the conciseness with $2.5\\%$ compression rate.\n","authors":["Kun Zhu","Xiaocheng Feng","Xiyuan Du","Yuxuan Gu","Weijiang Yu","Haotian Wang","Qianglong Chen","Zheng Chu","Jingchang Chen","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2406.01549v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2407.03955v1","updated":"2024-07-04T14:20:12Z","published":"2024-07-04T14:20:12Z","title":"Meta-prompting Optimized Retrieval-augmented Generation","summary":" Retrieval-augmented generation resorts to content retrieved from external\nsources in order to leverage the performance of large language models in\ndownstream tasks. The excessive volume of retrieved content, the possible\ndispersion of its parts, or their out of focus range may happen nevertheless to\neventually have a detrimental rather than an incremental effect. To mitigate\nthis issue and improve retrieval-augmented generation, we propose a method to\nrefine the retrieved content before it is included in the prompt by resorting\nto meta-prompting optimization. Put to empirical test with the demanding\nmulti-hop question answering task from the StrategyQA dataset, the evaluation\nresults indicate that this method outperforms a similar retrieval-augmented\nsystem but without this method by over 30%.\n","authors":["João Rodrigues","António Branco"],"pdf_url":"https://arxiv.org/pdf/2407.03955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13627v2","updated":"2024-07-04T14:14:38Z","published":"2024-04-21T11:51:13Z","title":"NegotiationToM: A Benchmark for Stress-testing Machine Theory of Mind on\n Negotiation Surrounding","summary":" Large Language Models (LLMs) have sparked substantial interest and debate\nconcerning their potential emergence of Theory of Mind (ToM) ability. Theory of\nmind evaluations currently focuses on testing models using machine-generated\ndata or game settings prone to shortcuts and spurious correlations, which lacks\nevaluation of machine ToM ability in real-world human interaction scenarios.\nThis poses a pressing demand to develop new real-world scenario benchmarks. We\nintroduce NegotiationToM, a new benchmark designed to stress-test machine ToM\nin real-world negotiation surrounding covered multi-dimensional mental states\n(i.e., desires, beliefs, and intentions). Our benchmark builds upon the\nBelief-Desire-Intention (BDI) agent modeling theory and conducts the necessary\nempirical experiments to evaluate large language models. Our findings\ndemonstrate that NegotiationToM is challenging for state-of-the-art LLMs, as\nthey consistently perform significantly worse than humans, even when employing\nthe chain-of-thought (CoT) method.\n","authors":["Chunkit Chan","Cheng Jiayang","Yauwai Yim","Zheye Deng","Wei Fan","Haoran Li","Xin Liu","Hongming Zhang","Weiqi Wang","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2404.13627v2.pdf","comment":"Dataset: https://github.com/HKUST-KnowComp/NegotiationToM"},{"id":"http://arxiv.org/abs/2407.03952v1","updated":"2024-07-04T14:13:57Z","published":"2024-07-04T14:13:57Z","title":"A framework for annotating and modelling intentions behind metaphor use","summary":" Metaphors are part of everyday language and shape the way in which we\nconceptualize the world. 
Moreover, they play a multifaceted role in\ncommunication, making their understanding and generation a challenging task for\nlanguage models (LMs). While there has been extensive work in the literature\nlinking metaphor to the fulfilment of individual intentions, no comprehensive\ntaxonomy of such intentions, suitable for natural language processing (NLP)\napplications, is available to present day. In this paper, we propose a novel\ntaxonomy of intentions commonly attributed to metaphor, which comprises 9\ncategories. We also release the first dataset annotated for intentions behind\nmetaphor use. Finally, we use this dataset to test the capability of large\nlanguage models (LLMs) in inferring the intentions behind metaphor use, in\nzero- and in-context few-shot settings. Our experiments show that this is still\na challenge for LLMs.\n","authors":["Gianluca Michelli","Xiaoyu Tong","Ekaterina Shutova"],"pdf_url":"https://arxiv.org/pdf/2407.03952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14804v2","updated":"2024-07-04T13:55:33Z","published":"2023-10-23T10:59:21Z","title":"Large Language Models can Share Images, Too!","summary":" This paper explores the image-sharing capability of Large Language Models\n(LLMs), such as GPT-4 and LLaMA 2, in a zero-shot setting. To facilitate a\ncomprehensive evaluation of LLMs, we introduce the PhotoChat++ dataset, which\nincludes enriched annotations (i.e., intent, triggering sentence, image\ndescription, and salient information). Furthermore, we present the\ngradient-free and extensible Decide, Describe, and Retrieve (DribeR) framework.\nWith extensive experiments, we unlock the image-sharing capability of DribeR\nequipped with LLMs in zero-shot prompting, with ChatGPT achieving the best\nperformance. Our findings also reveal the emergent image-sharing ability in\nLLMs under zero-shot conditions, validating the effectiveness of DribeR. We use\nthis framework to demonstrate its practicality and effectiveness in two\nreal-world scenarios: (1) human-bot interaction and (2) dataset augmentation.\nTo the best of our knowledge, this is the first study to assess the\nimage-sharing ability of various LLMs in a zero-shot setting. We make our\nsource code and dataset publicly available at\nhttps://github.com/passing2961/DribeR.\n","authors":["Young-Jun Lee","Dokyong Lee","Joo Won Sung","Jonghwan Hyeon","Ho-Jin Choi"],"pdf_url":"https://arxiv.org/pdf/2310.14804v2.pdf","comment":"ACL 2024 Findings; Code is available in\n https://github.com/passing2961/DribeR"},{"id":"http://arxiv.org/abs/2402.04854v5","updated":"2024-07-04T13:54:25Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":" Research surveys have always posed a challenge for beginner researchers who\nlack of research training. These researchers struggle to understand the\ndirections within their research topic, and the discovery of new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs(KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. 
They may fail to grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v5.pdf","comment":"This paper has been accepted by 'The 18TH International Conference on\n INnovations in Intelligent SysTems and Applications (INISTA 2024)'"},{"id":"http://arxiv.org/abs/2407.03941v1","updated":"2024-07-04T13:54:24Z","published":"2024-07-04T13:54:24Z","title":"Narrow Transformer: Starcoder-Based Java-LM For Desktop","summary":" This paper presents NT-Java-1.1B, an open-source specialized code language\nmodel built on StarCoderBase-1.1B, designed for coding tasks in Java\nprogramming. NT-Java-1.1B achieves state-of-the-art performance, surpassing its\nbase model and the majority of other models of similar size on the MultiPL-E Java code\nbenchmark. While there have been studies on extending large, generic\npre-trained models to improve proficiency in specific programming languages\nlike Python, similar investigations on small code models for other programming\nlanguages are lacking. Large code models require specialized hardware like GPUs\nfor inference, highlighting the need for research into building small code\nmodels that can be deployed on developer desktops. This paper addresses this\nresearch gap by focusing on the development of a small Java code model,\nNT-Java-1.1B, and its quantized versions, which perform comparably to open\nmodels around 1.1B on MultiPL-E Java code benchmarks, making them ideal for\ndesktop deployment. This paper establishes the foundation for specialized\nmodels across languages and sizes for a family of NT Models.\n","authors":["Kamalkumar Rathinasamy","Balaji A J","Ankush Kumar","Gagan Gayari","Harshini K","Rajab Ali Mondal","Sreenivasa Raghavan K S","Swayam Singh"],"pdf_url":"https://arxiv.org/pdf/2407.03941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03937v1","updated":"2024-07-04T13:52:23Z","published":"2024-07-04T13:52:23Z","title":"TongGu: Mastering Classical Chinese Understanding with\n Knowledge-Grounded Large Language Models","summary":" Classical Chinese is a gateway to the rich heritage and wisdom of ancient\nChina, yet its complexities pose formidable comprehension barriers for most\nmodern people without specialized knowledge. While Large Language Models (LLMs)\nhave shown remarkable capabilities in Natural Language Processing (NLP), they\nstruggle with Classical Chinese Understanding (CCU), especially in\ndata-demanding and knowledge-intensive tasks. In response to this dilemma, we\npropose \\textbf{TongGu} (meaning \"understanding ancient and modern\"), the first\nCCU-specific LLM, underpinned by three core contributions. First, we construct\na two-stage instruction-tuning dataset ACCN-INS derived from rich classical\nChinese corpora, aiming to unlock the full CCU potential of LLMs. Second, we\npropose Redundancy-Aware Tuning (RAT) to prevent catastrophic forgetting,\nenabling TongGu to acquire new capabilities while preserving its foundational\nknowledge. Third, we present a CCU Retrieval-Augmented Generation (CCU-RAG)\ntechnique to reduce hallucinations based on knowledge-grounding. 
Extensive\nexperiments across 24 diverse CCU tasks validate TongGu's superior ability,\nunderscoring the effectiveness of RAT and CCU-RAG. The model and dataset will\nbe publicly available.\n","authors":["Jiahuan Cao","Dezhi Peng","Peirong Zhang","Yongxin Shi","Yang Liu","Kai Ding","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2407.03937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03916v1","updated":"2024-07-04T13:21:07Z","published":"2024-07-04T13:21:07Z","title":"Entity-Level Sentiment: More than the Sum of Its Parts","summary":" In sentiment analysis of longer texts, there may be a variety of topics\ndiscussed, of entities mentioned, and of sentiments expressed regarding each\nentity. We find a lack of studies exploring how such texts express their\nsentiment towards each entity of interest, and how these sentiments can be\nmodelled. In order to better understand how sentiment regarding persons and\norganizations (each entity in our scope) is expressed in longer texts, we have\ncollected a dataset of expert annotations where the overall sentiment regarding\neach entity is identified, together with the sentence-level sentiment for these\nentities separately. We show that the reader's perceived sentiment regarding an\nentity often differs from an arithmetic aggregation of sentiments at the\nsentence level. Only 70\\% of the positive and 55\\% of the negative entities\nreceive a correct overall sentiment label when we aggregate the\n(human-annotated) sentiment labels for the sentences where the entity is\nmentioned. Our dataset reveals the complexity of entity-specific sentiment in\nlonger texts, and allows for more precise modelling and evaluation of such\nsentiment expressions.\n","authors":["Egil Rønningstad","Roman Klinger","Erik Velldal","Lilja Øvrelid"],"pdf_url":"https://arxiv.org/pdf/2407.03916v1.pdf","comment":"14th Workshop on Computational Approaches to Subjectivity, Sentiment\n & Social Media Analysis (WASSA 2024)"},{"id":"http://arxiv.org/abs/2403.06412v4","updated":"2024-07-04T13:08:19Z","published":"2024-03-11T03:54:33Z","title":"CLIcK: A Benchmark Dataset of Cultural and Linguistic Intelligence in\n Korean","summary":" Despite the rapid development of large language models (LLMs) for the Korean\nlanguage, there remains an obvious lack of benchmark datasets that test the\nrequisite Korean cultural and linguistic knowledge. Because many existing\nKorean benchmark datasets are derived from their English counterparts through\ntranslation, they often overlook the different cultural contexts. For the few\nbenchmark datasets that are sourced from Korean data capturing cultural\nknowledge, only narrow tasks such as bias and hate speech detection are\noffered. To address this gap, we introduce a benchmark of Cultural and\nLinguistic Intelligence in Korean (CLIcK), a dataset comprising 1,995 QA pairs.\nCLIcK sources its data from official Korean exams and textbooks, partitioning\nthe questions into eleven categories under the two main categories of language\nand culture. For each instance in CLIcK, we provide fine-grained annotation of\nwhich cultural and linguistic knowledge is required to answer the question\ncorrectly. Using CLIcK, we test 13 language models to assess their performance.\nOur evaluation uncovers insights into their performances across the categories,\nas well as the diverse factors affecting their comprehension. 
CLIcK offers the\nfirst large-scale comprehensive Korean-centric analysis of LLMs' proficiency in\nKorean culture and language.\n","authors":["Eunsu Kim","Juyoung Suk","Philhoon Oh","Haneul Yoo","James Thorne","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2403.06412v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02834v2","updated":"2024-07-04T12:57:37Z","published":"2024-07-03T06:21:07Z","title":"Aspect-Based Sentiment Analysis Techniques: A Comparative Study","summary":" Since the dawn of the digitalisation era, customer feedback and online\nreviews are unequivocally major sources of insights for businesses.\nConsequently, conducting comparative analyses of such sources has become the de\nfacto modus operandi of any business that wishes to give itself a competitive\nedge over its peers and improve customer loyalty. Sentiment analysis is one\nsuch method instrumental in gauging public interest, exposing market trends,\nand analysing competitors. While traditional sentiment analysis focuses on\noverall sentiment, as the needs advance with time, it has become important to\nexplore public opinions and sentiments on various specific subjects, products\nand services mentioned in the reviews on a finer-granular level. To this end,\nAspect-based Sentiment Analysis (ABSA), supported by advances in Artificial\nIntelligence (AI) techniques which have contributed to a paradigm shift from\nsimple word-level analysis to tone and context-aware analyses, focuses on\nidentifying specific aspects within the text and determining the sentiment\nassociated with each aspect. In this study, we compare several deep-NN methods\nfor ABSA on two benchmark datasets (Restaurant14 and Laptop-14) and found that\nFAST LSA obtains the best overall results of 87.6% and 82.6% accuracy but does\nnot pass LSA+DeBERTa which reports 90.33% and 86.21% accuracy respectively.\n","authors":["Dineth Jayakody","Koshila Isuranda","A V A Malkith","Nisansa de Silva","Sachintha Rajith Ponnamperuma","G G N Sandamali","K L K Sudheera"],"pdf_url":"https://arxiv.org/pdf/2407.02834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14499v2","updated":"2024-07-04T12:51:29Z","published":"2024-02-22T12:47:33Z","title":"\"My Answer is C\": First-Token Probabilities Do Not Match Text Answers in\n Instruction-Tuned Language Models","summary":" The open-ended nature of language generation makes the evaluation of\nautoregressive large language models (LLMs) challenging. One common evaluation\napproach uses multiple-choice questions (MCQ) to limit the response space. The\nmodel is then evaluated by ranking the candidate answers by the log probability\nof the first token prediction. However, first-tokens may not consistently\nreflect the final response output, due to model's diverse response styles such\nas starting with \"Sure\" or refusing to answer. Consequently, MCQ evaluation is\nnot indicative of model behaviour when interacting with users. But by how much?\nWe evaluate how aligned first-token evaluation is with the text output along\nseveral dimensions, namely final option choice, refusal rate, choice\ndistribution and robustness under prompt perturbation. Our results show that\nthe two approaches are severely misaligned on all dimensions, reaching mismatch\nrates over 60%. Models heavily fine-tuned on conversational or safety data are\nespecially impacted. Crucially, models remain misaligned even when we\nincreasingly constrain prompts, i.e., force them to start with an option letter\nor example template. 
Our findings i) underscore the importance of inspecting\nthe text output as well and ii) caution against relying solely on first-token\nevaluation.\n","authors":["Xinpeng Wang","Bolei Ma","Chengzhi Hu","Leon Weber-Genzel","Paul Röttger","Frauke Kreuter","Dirk Hovy","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2402.14499v2.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2407.03895v1","updated":"2024-07-04T12:40:35Z","published":"2024-07-04T12:40:35Z","title":"Scoping Review of Active Learning Strategies and their Evaluation\n Environments for Entity Recognition Tasks","summary":" We conducted a scoping review for active learning in the domain of natural\nlanguage processing (NLP), which we summarize in accordance with the PRISMA-ScR\nguidelines as follows:\n Objective: Identify active learning strategies that were proposed for entity\nrecognition and their evaluation environments (datasets, metrics, hardware,\nexecution time). Design: We used Scopus and ACM as our search engines. We\ncompared the results with two literature surveys to assess the search quality.\nWe included peer-reviewed English publications introducing or comparing active\nlearning strategies for entity recognition. Results: We analyzed 62 relevant\npapers and identified 106 active learning strategies. We grouped them into\nthree categories: exploitation-based (60x), exploration-based (14x), and hybrid\nstrategies (32x). We found that all studies used the F1-score as an evaluation\nmetric. Information about hardware (6x) and execution time (13x) was only\noccasionally included. The 62 papers used 57 different datasets to evaluate\ntheir respective strategies. Most datasets contained newspaper articles or\nbiomedical/medical data. Our analysis revealed that 26 out of 57 datasets are\npublicly accessible.\n Conclusion: Numerous active learning strategies have been identified, along\nwith significant open questions that still need to be addressed. Researchers\nand practitioners face difficulties when making data-driven decisions about\nwhich active learning strategy to adopt. Conducting comprehensive empirical\ncomparisons using the evaluation environment proposed in this study could help\nestablish best practices in the domain.\n","authors":["Philipp Kohl","Yoka Krämer","Claudia Fohry","Bodo Kraft"],"pdf_url":"https://arxiv.org/pdf/2407.03895v1.pdf","comment":"The Version of Record of this contribution is published in Deep\n Learning Theory and Applications 5th International Conference, DeLTA 2024\n Proceedings, and will be available after the conference"},{"id":"http://arxiv.org/abs/2402.14016v2","updated":"2024-07-04T12:34:44Z","published":"2024-02-21T18:55:20Z","title":"Is LLM-as-a-Judge Robust? Investigating Universal Adversarial Attacks on\n Zero-shot LLM Assessment","summary":" Large Language Models (LLMs) are powerful zero-shot assessors used in\nreal-world situations such as assessing written exams and benchmarking systems.\nDespite these critical applications, no existing work has analyzed the\nvulnerability of judge-LLMs to adversarial manipulation. This work presents the\nfirst study on the adversarial robustness of assessment LLMs, where we\ndemonstrate that short universal adversarial phrases can be concatenated to\ndeceive judge LLMs to predict inflated scores. Since adversaries may not know\nor have access to the judge-LLMs, we propose a simple surrogate attack where a\nsurrogate model is first attacked, and the learned attack phrase then\ntransferred to unknown judge-LLMs. 
We propose a practical algorithm to\ndetermine the short universal attack phrases and demonstrate that when\ntransferred to unseen models, scores can be drastically inflated such that\nirrespective of the assessed text, maximum scores are predicted. It is found\nthat judge-LLMs are significantly more susceptible to these adversarial attacks\nwhen used for absolute scoring, as opposed to comparative assessment. Our\nfindings raise concerns about the reliability of LLM-as-a-judge methods, and\nemphasize the importance of addressing vulnerabilities in LLM assessment\nmethods before deployment in high-stakes real-world scenarios.\n","authors":["Vyas Raina","Adian Liusie","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2402.14016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03884v1","updated":"2024-07-04T12:23:02Z","published":"2024-07-04T12:23:02Z","title":"Planning with Large Language Models for Conversational Agents","summary":" Controllability and proactivity are crucial properties of autonomous\nconversational agents (CAs). Controllability requires the CAs to follow the\nstandard operating procedures (SOPs), such as verifying identity before\nactivating credit cards. Proactivity requires the CAs to guide the conversation\ntowards the goal during user uncooperation, such as persuasive dialogue.\nExisting research cannot be unified with controllability, proactivity, and low\nmanual annotation. To bridge this gap, we propose a new framework for\nplanning-based conversational agents (PCA) powered by large language models\n(LLMs), which only requires humans to define tasks and goals for the LLMs.\nBefore conversation, the LLM plans the core and necessary SOP for dialogue offline.\nDuring the conversation, the LLM plans the best action path online referring to the\nSOP, and generates responses to achieve process controllability. Subsequently,\nwe propose a semi-automatic dialogue data creation framework and curate a\nhigh-quality dialogue dataset (PCA-D). Meanwhile, we develop multiple variants\nand evaluation metrics for PCA, e.g., planning with Monte Carlo Tree Search\n(PCA-M), which searches for the optimal dialogue action while satisfying SOP\nconstraints and achieving the proactivity of the dialogue. Experimental results\nshow that LLMs finetuned on PCA-D can significantly improve the performance and\ngeneralize to unseen domains. PCA-M outperforms other CoT and ToT baselines in\nterms of conversation controllability, proactivity, task success rate, and\noverall logical coherence, and is applicable in industry dialogue scenarios.\nThe dataset and codes are available at XXXX.\n","authors":["Zhigen Li","Jianxiang Peng","Yanmeng Wang","Tianhao Shen","Minghui Zhang","Linxi Su","Shang Wu","Yihang Wu","Yuqian Wang","Ye Wang","Wei Hu","Jianfeng Li","Shaojun Wang","Jing Xiao","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2407.03884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03876v1","updated":"2024-07-04T12:14:27Z","published":"2024-07-04T12:14:27Z","title":"DART: Deep Adversarial Automated Red Teaming for LLM Safety","summary":" Manual red teaming is a commonly-used method to identify vulnerabilities in\nlarge language models (LLMs), which is costly and unscalable. In contrast,\nautomated red teaming uses a Red LLM to automatically generate adversarial\nprompts to the Target LLM, offering a scalable way for safety vulnerability\ndetection. 
However, the difficulty of building a powerful automated Red LLM\nlies in the fact that the safety vulnerabilities of the Target LLM are\ndynamically changing with the evolution of the Target LLM. To mitigate this\nissue, we propose a Deep Adversarial Automated Red Teaming (DART) framework in\nwhich the Red LLM and Target LLM are deeply and dynamically interacting with\neach other in an iterative manner. In each iteration, in order to generate\nas many successful attacks as possible, the Red LLM not only takes into account\nthe responses from the Target LLM, but also adversarially adjusts its attacking\ndirections by monitoring the global diversity of generated attacks across\nmultiple iterations. Simultaneously, to explore dynamically changing safety\nvulnerabilities of the Target LLM, we allow the Target LLM to enhance its\nsafety via an active learning based data selection mechanism. Experimental\nresults demonstrate that DART significantly reduces the safety risk of the\ntarget LLM. For human evaluation on the Anthropic Harmless dataset, compared to the\ninstruction-tuned target LLM, DART eliminates the violation risks by 53.4\\%.\nWe will release the datasets and codes of DART soon.\n","authors":["Bojian Jiang","Yi Jing","Tianhao Shen","Qing Yang","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2407.03876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11225v2","updated":"2024-07-04T11:52:11Z","published":"2024-04-17T10:19:15Z","title":"In-Context Learning State Vector with Inner and Momentum Optimization","summary":" Large Language Models (LLMs) have exhibited an impressive ability to perform\nIn-Context Learning (ICL) from only a few examples. Recent works have indicated\nthat the functions learned by ICL can be represented through compressed vectors\nderived from the transformer. However, the working mechanisms and optimization\nof these vectors are yet to be thoroughly explored. In this paper, we address\nthis gap by presenting a comprehensive analysis of these compressed vectors,\ndrawing parallels to the parameters trained with gradient descent, and\nintroduce the concept of state vector. Inspired by the works on model soup and\nmomentum-based gradient descent, we propose inner and momentum optimization\nmethods that are applied to refine the state vector progressively as test-time\nadaptation. Moreover, we simulate state vector aggregation in the multiple\nexample setting, where demonstrations comprising numerous examples are usually\ntoo lengthy for regular ICL, and further propose a divide-and-conquer\naggregation method to address this challenge. We conduct extensive experiments\nusing Llama-2 and GPT-J in both zero-shot setting and few-shot setting. The\nexperimental results show that our optimization method effectively enhances the\nstate vector and achieves the state-of-the-art performance on diverse tasks.\nCode is available at https://github.com/HITsz-TMG/ICL-State-Vector\n","authors":["Dongfang Li","Zhenyu Liu","Xinshuo Hu","Zetian Sun","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.11225v2.pdf","comment":"17 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.03861v1","updated":"2024-07-04T11:46:39Z","published":"2024-07-04T11:46:39Z","title":"TartuNLP @ AXOLOTL-24: Leveraging Classifier Output for New Sense\n Detection in Lexical Semantics","summary":" We present our submission to the AXOLOTL-24 shared task. 
The shared task\ncomprises two subtasks: identifying new senses that words gain with time (when\ncomparing newer and older time periods) and producing the definitions for the\nidentified new senses. We implemented a conceptually simple and computationally\ninexpensive solution to both subtasks. We trained adapter-based binary\nclassification models to match glosses with usage examples and leveraged the\nprobability output of the models to identify novel senses. The same models were\nused to match examples of novel sense usages with Wiktionary definitions. Our\nsubmission attained third place on the first subtask and the first place on the\nsecond subtask.\n","authors":["Aleksei Dorkin","Kairit Sirts"],"pdf_url":"https://arxiv.org/pdf/2407.03861v1.pdf","comment":"Accepted to the 5th International Workshop on Computational\n Approaches to Historical Language Change 2024 (LChange'24)"},{"id":"http://arxiv.org/abs/2407.03859v1","updated":"2024-07-04T11:44:28Z","published":"2024-07-04T11:44:28Z","title":"Anthropocentric bias and the possibility of artificial cognition","summary":" Evaluating the cognitive capacities of large language models (LLMs) requires\novercoming not only anthropomorphic but also anthropocentric biases. This\narticle identifies two types of anthropocentric bias that have been neglected:\noverlooking how auxiliary factors can impede LLM performance despite competence\n(Type-I), and dismissing LLM mechanistic strategies that differ from those of\nhumans as not genuinely competent (Type-II). Mitigating these biases\nnecessitates an empirically-driven, iterative approach to mapping cognitive\ntasks to LLM-specific capacities and mechanisms, which can be done by\nsupplementing carefully designed behavioral experiments with mechanistic\nstudies.\n","authors":["Raphaël Millière","Charles Rathkopf"],"pdf_url":"https://arxiv.org/pdf/2407.03859v1.pdf","comment":"Accepted for ICML 2024 (Workshop on Large Language Models and\n Cognition)"},{"id":"http://arxiv.org/abs/2407.03850v1","updated":"2024-07-04T11:33:54Z","published":"2024-07-04T11:33:54Z","title":"HYBRINFOX at CheckThat! 2024 -- Task 1: Enhancing Language Models with\n Structured Information for Check-Worthiness Estimation","summary":" This paper summarizes the experiments and results of the HYBRINFOX team for\nthe CheckThat! 2024 - Task 1 competition. We propose an approach enriching\nLanguage Models such as RoBERTa with embeddings produced by triples (subject ;\npredicate ; object) extracted from the text sentences. Our analysis of the\ndevelopmental data shows that this method improves the performance of Language\nModels alone. On the evaluation data, its best performance was in English,\nwhere it achieved an F1 score of 71.1 and ranked 12th out of 27 candidates. On\nthe other languages (Dutch and Arabic), it obtained more mixed results. 
Future\nresearch tracks are identified toward adapting this processing pipeline to more\nrecent Large Language Models.\n","authors":["Géraud Faye","Morgane Casanova","Benjamin Icard","Julien Chanson","Guillaume Gadek","Guillaume Gravier","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2407.03850v1.pdf","comment":"Paper to appear in the Proceedings of the Conference and Labs of the\n Evaluation Forum (CLEF 2024 CheckThat!)"},{"id":"http://arxiv.org/abs/2407.03841v1","updated":"2024-07-04T11:14:47Z","published":"2024-07-04T11:14:47Z","title":"On the Benchmarking of LLMs for Open-Domain Dialogue Evaluation","summary":" Large Language Models (LLMs) have showcased remarkable capabilities in\nvarious Natural Language Processing tasks. For automatic open-domain dialogue\nevaluation in particular, LLMs have been seamlessly integrated into evaluation\nframeworks, and together with human evaluation, compose the backbone of most\nevaluations. However, existing evaluation benchmarks often rely on outdated\ndatasets and evaluate aspects like Fluency and Relevance, which fail to\nadequately capture the capabilities and limitations of state-of-the-art chatbot\nmodels.\n This paper critically examines current evaluation benchmarks, highlighting\nthat the use of older response generators and quality aspects fail to\naccurately reflect modern chatbot capabilities. A small annotation experiment\non a recent LLM-generated dataset (SODA) reveals that LLM evaluators such as\nGPT-4 struggle to detect actual deficiencies in dialogues generated by current\nLLM chatbots.\n","authors":["John Mendonça","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2407.03841v1.pdf","comment":"Accepted to the 6th NLP for Conversational AI workshop at ACL"},{"id":"http://arxiv.org/abs/2407.03818v1","updated":"2024-07-04T10:44:59Z","published":"2024-07-04T10:44:59Z","title":"ConText at WASSA 2024 Empathy and Personality Shared Task:\n History-Dependent Embedding Utterance Representations for Empathy and Emotion\n Prediction in Conversations","summary":" Empathy and emotion prediction are key components in the development of\neffective and empathetic agents, amongst several other applications. The WASSA\nshared task on empathy and emotion prediction in interactions presents an\nopportunity to benchmark approaches to these tasks. Appropriately selecting and\nrepresenting the historical context is crucial in the modelling of empathy and\nemotion in conversations. In our submissions, we model empathy, emotion\npolarity and emotion intensity of each utterance in a conversation by feeding\nthe utterance to be classified together with its conversational context, i.e.,\na certain number of previous conversational turns, as input to an encoder\nPre-trained Language Model, to which we append a regression head for\nprediction. We also model perceived counterparty empathy of each interlocutor\nby feeding all utterances from the conversation and a token identifying the\ninterlocutor for which we are predicting the empathy. 
Our system officially\nranked $1^{st}$ at the CONV-turn track and $2^{nd}$ at the CONV-dialog track.\n","authors":["Patrícia Pereira","Helena Moniz","Joao Paulo Carvalho"],"pdf_url":"https://arxiv.org/pdf/2407.03818v1.pdf","comment":"WASSA'24"},{"id":"http://arxiv.org/abs/2407.03809v1","updated":"2024-07-04T10:33:12Z","published":"2024-07-04T10:33:12Z","title":"Finetuning End-to-End Models for Estonian Conversational Spoken Language\n Translation","summary":" This paper investigates the finetuning of end-to-end models for bidirectional\nEstonian-English and Estonian-Russian conversational speech-to-text\ntranslation. Due to the limited availability of speech translation data for\nEstonian, we created additional training data by web scraping and synthesizing\ndata from speech recognition datasets using machine translation. We evaluated\nthree publicly available end-to-end models: Whisper, OWSM 3.1, and SeamlessM4T.\nOur results indicate that fine-tuning with synthetic data enhances translation\naccuracy by a large margin, with SeamlessM4T matching or surpassing cascaded\nspeech translation systems that use state-of-the-art speech recognition and\nmachine translation models.\n","authors":["Tiia Sildam","Andra Velve","Tanel Alumäe"],"pdf_url":"https://arxiv.org/pdf/2407.03809v1.pdf","comment":"Accepted to LoResMT 2024 (ACL workshop)"},{"id":"http://arxiv.org/abs/2407.03805v1","updated":"2024-07-04T10:28:48Z","published":"2024-07-04T10:28:48Z","title":"Cognitive Modeling with Scaffolded LLMs: A Case Study of Referential\n Expression Generation","summary":" To what extent can LLMs be used as part of a cognitive model of language\ngeneration? In this paper, we approach this question by exploring a\nneuro-symbolic implementation of an algorithmic cognitive model of referential\nexpression generation by Dale & Reiter (1995). The symbolic task analysis\nimplements the generation as an iterative procedure that scaffolds symbolic and\ngpt-3.5-turbo-based modules. We compare this implementation to an ablated model\nand a one-shot LLM-only baseline on the A3DS dataset (Tsvilodub & Franke,\n2023). We find that our hybrid approach is cognitively plausible and performs\nwell in complex contexts, while allowing for more open-ended modeling of\nlanguage generation in a larger domain.\n","authors":["Polina Tsvilodub","Michael Franke","Fausto Carcassi"],"pdf_url":"https://arxiv.org/pdf/2407.03805v1.pdf","comment":"11 pages, 3 figures, 2 algorithms, to appear at the ICML 2024\n workshop on Large Language Models and Cognition"},{"id":"http://arxiv.org/abs/2311.09132v2","updated":"2024-07-04T10:16:35Z","published":"2023-11-15T17:21:58Z","title":"Aligning Neural Machine Translation Models: Human Feedback in Training\n and Inference","summary":" Reinforcement learning from human feedback (RLHF) is a recent technique to\nimprove the quality of the text generated by a language model, making it closer\nto what humans would generate. A core ingredient in RLHF's success in aligning\nand improving large language models (LLMs) is its reward model, trained using\nhuman feedback on model outputs. In machine translation (MT), where metrics\ntrained from human annotations can readily be used as reward models, recent\nmethods using minimum Bayes risk decoding and reranking have succeeded in\nimproving the final quality of translation. In this study, we comprehensively\nexplore and compare techniques for integrating quality metrics as reward models\ninto the MT pipeline. 
This includes using the reward model for data filtering,\nduring the training phase through RL, and at inference time by employing\nreranking techniques, and we assess the effects of combining these in a unified\napproach. Our experimental results, conducted across multiple translation\ntasks, underscore the crucial role of effective data filtering, based on\nestimated quality, in harnessing the full potential of RL in enhancing MT\nquality. Furthermore, our findings demonstrate the effectiveness of combining\nRL training with reranking techniques, showcasing substantial improvements in\ntranslation quality.\n","authors":["Miguel Moura Ramos","Patrick Fernandes","António Farinhas","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2311.09132v2.pdf","comment":"EAMT 2024"},{"id":"http://arxiv.org/abs/2403.02130v3","updated":"2024-07-04T10:12:38Z","published":"2024-03-04T15:39:59Z","title":"Using LLMs for the Extraction and Normalization of Product Attribute\n Values","summary":" Product offers on e-commerce websites often consist of a product title and a\ntextual product description. In order to enable features such as faceted\nproduct search or to generate product comparison tables, it is necessary to\nextract structured attribute-value pairs from the unstructured product titles\nand descriptions and to normalize the extracted values to a single, unified\nscale for each attribute. This paper explores the potential of using large\nlanguage models (LLMs), such as GPT-3.5 and GPT-4, to extract and normalize\nattribute values from product titles and descriptions. We experiment with\ndifferent zero-shot and few-shot prompt templates for instructing LLMs to\nextract and normalize attribute-value pairs. We introduce the Web Data Commons\n- Product Attribute Value Extraction (WDC-PAVE) benchmark dataset for our\nexperiments. WDC-PAVE consists of product offers from 59 different websites\nwhich provide schema.org annotations. The offers belong to five different\nproduct categories, each with a specific set of attributes. The dataset\nprovides manually verified attribute-value pairs in two forms: (i) directly\nextracted values and (ii) normalized attribute values. The normalization of the\nattribute values requires systems to perform the following types of operations:\nname expansion, generalization, unit of measurement conversion, and string\nwrangling. Our experiments demonstrate that GPT-4 outperforms the PLM-based\nextraction methods SU-OpenTag, AVEQA, and MAVEQA by 10%, achieving an F1-score\nof 91%. For the extraction and normalization of product attribute values, GPT-4\nachieves a similar performance to the extraction scenario, while being\nparticularly strong at string wrangling and name expansion.\n","authors":["Alexander Brinkmann","Nick Baumann","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2403.02130v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03791v1","updated":"2024-07-04T09:55:04Z","published":"2024-07-04T09:55:04Z","title":"M$\\mathbf5$ -- A Diverse Benchmark to Assess the Performance of Large\n Multimodal Models Across Multilingual and Multicultural Vision-Language Tasks","summary":" Since the release of ChatGPT, the field of Natural Language Processing has\nexperienced rapid advancements, particularly in Large Language Models (LLMs)\nand their multimodal counterparts, Large Multimodal Models (LMMs). 
Despite\ntheir impressive capabilities, LLMs often exhibit significant performance\ndisparities across different languages and cultural contexts, as demonstrated\nby various text-only benchmarks. However, current research lacks such\nbenchmarks for multimodal visio-linguistic settings. This work fills this gap\nby introducing M5, the first comprehensive benchmark designed to evaluate LMMs\non diverse vision-language tasks within a multilingual and multicultural\ncontext. M5 includes eight datasets covering five tasks and $41$ languages,\nwith a focus on underrepresented languages and culturally diverse images.\nFurthermore, we introduce two novel datasets, M5-VGR and M5-VLOD, including a\nnew Visio-Linguistic Outlier Detection task, in which all evaluated open-source\nmodels fail to significantly surpass the random baseline. Through extensive\nevaluation and analyses, we highlight substantial task-agnostic performance\ndisparities between high- and low-resource languages. Moreover, we show that\nlarger models do not necessarily outperform smaller ones in a multilingual\nsetting.\n","authors":["Florian Schneider","Sunayana Sitaram"],"pdf_url":"https://arxiv.org/pdf/2407.03791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03788v1","updated":"2024-07-04T09:52:17Z","published":"2024-07-04T09:52:17Z","title":"Meta-optimized Angular Margin Contrastive Framework for Video-Language\n Representation Learning","summary":" Data quality stands at the forefront of deciding the effectiveness of\nvideo-language representation learning. However, video-text pairs in previous\ndata typically do not align perfectly with each other, which might lead to\nvideo-language representations that do not accurately reflect cross-modal\nsemantics. Moreover, previous data also possess an uneven distribution of\nconcepts, thereby hampering the downstream performance across unpopular\nsubjects. To address these problems, we propose a contrastive objective with a\nsubtractive angular margin to regularize cross-modal representations in their\neffort to reach perfect similarity. Furthermore, to adapt to the non-uniform\nconcept distribution, we propose a multi-layer perceptron (MLP)-parameterized\nweighting function that maps loss values to sample weights which enable dynamic\nadjustment of the model's focus throughout the training. With the training\nguided by a small amount of unbiased meta-data and augmented by video-text data\ngenerated by large vision-language model, we improve video-language\nrepresentations and achieve superior performances on commonly used video\nquestion answering and text-video retrieval datasets.\n","authors":["Thong Nguyen","Yi Bin","Xiaobao Wu","Xinshuai Dong","Zhiyuan Hu","Khoi Le","Cong-Duy Nguyen","See-Kiong Ng","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2407.03788v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2407.03779v1","updated":"2024-07-04T09:42:25Z","published":"2024-07-04T09:42:25Z","title":"Functional Faithfulness in the Wild: Circuit Discovery with\n Differentiable Computation Graph Pruning","summary":" In this paper, we introduce a comprehensive reformulation of the task known\nas Circuit Discovery, along with DiscoGP, a novel and effective algorithm based\non differentiable masking for discovering circuits. 
Circuit discovery is the\ntask of interpreting the computational mechanisms of language models (LMs) by\ndissecting their functions and capabilities into sparse subnetworks (circuits).\nWe identified two major limitations in existing circuit discovery efforts: (1)\na dichotomy between weight-based and connection-edge-based approaches forces\nresearchers to choose between pruning connections or weights, thereby limiting\nthe scope of mechanistic interpretation of LMs; (2) algorithms based on\nactivation patching tend to identify circuits that are neither functionally\nfaithful nor complete. The performance of these identified circuits is\nsubstantially reduced, often resulting in near-random performance in isolation.\nFurthermore, the complement of the circuit -- i.e., the original LM with the\nidentified circuit removed -- still retains adequate performance, indicating\nthat essential components of a complete circuit are missed by existing\nmethods.\n DiscoGP successfully addresses the two aforementioned issues and demonstrates\nstate-of-the-art faithfulness, completeness, and sparsity. The effectiveness of\nthe algorithm and its novel structure open up new avenues for gathering new\ninsights into the internal workings of generative AI.\n","authors":["Lei Yu","Jingcheng Niu","Zining Zhu","Gerald Penn"],"pdf_url":"https://arxiv.org/pdf/2407.03779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03770v1","updated":"2024-07-04T09:29:19Z","published":"2024-07-04T09:29:19Z","title":"HYBRINFOX at CheckThat! 2024 -- Task 2: Enriching BERT Models with the\n Expert System VAGO for Subjectivity Detection","summary":" This paper presents the HYBRINFOX method used to solve Task 2 of Subjectivity\ndetection of the CLEF 2024 CheckThat! competition. The specificity of the\nmethod is to use a hybrid system, combining a RoBERTa model, fine-tuned for\nsubjectivity detection, a frozen sentence-BERT (sBERT) model to capture\nsemantics, and several scores calculated by the English version of the expert\nsystem VAGO, developed independently of this task to measure vagueness and\nsubjectivity in texts based on the lexicon. In English, the HYBRINFOX method\nranked 1st with a macro F1 score of 0.7442 on the evaluation data. For the\nother languages, the method used a translation step into English, producing\nmore mixed results (ranking 1st in Multilingual and 2nd in Italian over the\nbaseline, but under the baseline in Bulgarian, German, and Arabic). We explain\nthe principles of our hybrid approach, and outline ways in which the method\ncould be improved for other languages besides English.\n","authors":["Morgane Casanova","Julien Chanson","Benjamin Icard","Géraud Faye","Guillaume Gadek","Guillaume Gravier","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2407.03770v1.pdf","comment":"To appear in the Proceedings of the Conference and Labs of the\n Evaluation Forum (CLEF 2024 CheckThat!)"},{"id":"http://arxiv.org/abs/2404.06709v2","updated":"2024-07-04T09:17:15Z","published":"2024-04-10T03:30:01Z","title":"CQIL: Inference Latency Optimization with Concurrent Computation of\n Quasi-Independent Layers","summary":" The fast-growing large-scale language models are delivering unprecedented\nperformance on almost all natural language processing tasks. However, the\neffectiveness of large language models is reliant on an exponentially\nincreasing number of parameters. The overwhelming computation complexity incurs\na high inference latency that negatively affects user experience. 
Existing\nmethods to improve inference efficiency, such as tensor parallelism and\nquantization, target to reduce per-layer computing latency, yet overlook the\ncumulative latency due to the number of layers. Recent works on reducing the\ncumulative latency through layer removing, however, lead to significant\nperformance drop. Motivated by the similarity of inputs among adjacent layers,\nwe propose to identify quasi-independent layers, which can be concurrently\ncomputed to significantly decrease inference latency. We also introduce a\nbypassing technique to mitigate the effect of information loss. Empirical\nexperiments of the proposed approach on the LLaMA models confirm that\nConcurrent Computation of Quasi-Independent Layers (CQIL) can reduce latency by\nup to 48.3% on LLaMA-33B, while maintaining a close level of performance.\n","authors":["Longwei Zou","Qingyang Wang","Han Zhao","Jiangang Kong","Yi Yang","Yangdong Deng"],"pdf_url":"https://arxiv.org/pdf/2404.06709v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2303.16166v5","updated":"2024-07-04T09:16:22Z","published":"2023-03-28T17:28:52Z","title":"When Good and Reproducible Results are a Giant with Feet of Clay: The\n Importance of Software Quality in NLP","summary":" Despite its crucial role in research experiments, code correctness is often\npresumed only on the basis of the perceived quality of results. This assumption\ncomes with the risk of erroneous outcomes and potentially misleading findings.\nTo address this issue, we posit that the current focus on reproducibility\nshould go hand in hand with the emphasis on software quality. We present a case\nstudy in which we identify and fix three bugs in widely used implementations of\nthe state-of-the-art Conformer architecture. Through experiments on speech\nrecognition and translation in various languages, we demonstrate that the\npresence of bugs does not prevent the achievement of good and reproducible\nresults, which however can lead to incorrect conclusions that potentially\nmisguide future research. As a countermeasure, we propose a Code-quality\nChecklist and release pangoliNN, a library dedicated to testing neural models,\nwith the goal of promoting coding best practices and improving research\nsoftware quality within the NLP community.\n","authors":["Sara Papi","Marco Gaido","Andrea Pilzer","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2303.16166v5.pdf","comment":"Accepted at ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2402.06341v2","updated":"2024-07-04T09:10:17Z","published":"2024-02-09T11:34:16Z","title":"RareBench: Can LLMs Serve as Rare Diseases Specialists?","summary":" Generalist Large Language Models (LLMs), such as GPT-4, have shown\nconsiderable promise in various domains, including medical diagnosis. Rare\ndiseases, affecting approximately 300 million people worldwide, often have\nunsatisfactory clinical diagnosis rates primarily due to a lack of experienced\nphysicians and the complexity of differentiating among many rare diseases. In\nthis context, recent news such as \"ChatGPT correctly diagnosed a 4-year-old's\nrare disease after 17 doctors failed\" underscore LLMs' potential, yet\nunderexplored, role in clinically diagnosing rare diseases. To bridge this\nresearch gap, we introduce RareBench, a pioneering benchmark designed to\nsystematically evaluate the capabilities of LLMs on 4 critical dimensions\nwithin the realm of rare diseases. 
Meanwhile, we have compiled the largest\nopen-source dataset on rare disease patients, establishing a benchmark for\nfuture studies in this domain. To facilitate differential diagnosis of rare\ndiseases, we develop a dynamic few-shot prompt methodology, leveraging a\ncomprehensive rare disease knowledge graph synthesized from multiple knowledge\nbases, significantly enhancing LLMs' diagnostic performance. Moreover, we\npresent an exhaustive comparative study of GPT-4's diagnostic capabilities\nagainst those of specialist physicians. Our experimental findings underscore\nthe promising potential of integrating LLMs into the clinical diagnostic\nprocess for rare diseases. This paves the way for exciting possibilities in\nfuture advancements in this field.\n","authors":["Xuanzhong Chen","Xiaohao Mao","Qihan Guo","Lun Wang","Shuyang Zhang","Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2402.06341v2.pdf","comment":"KDD2024"},{"id":"http://arxiv.org/abs/2407.02395v2","updated":"2024-07-04T08:59:31Z","published":"2024-07-02T16:13:21Z","title":"Is Your AI-Generated Code Really Safe? Evaluating Large Language Models\n on Secure Code Generation with CodeSecEval","summary":" Large language models (LLMs) have brought significant advancements to code\ngeneration and code repair, benefiting both novice and experienced developers.\nHowever, their training using unsanitized data from open-source repositories,\nlike GitHub, raises the risk of inadvertently propagating security\nvulnerabilities. Despite numerous studies investigating the safety of code\nLLMs, there remains a gap in comprehensively addressing their security\nfeatures. In this work, we aim to present a comprehensive study aimed at\nprecisely evaluating and enhancing the security aspects of code LLMs. To\nsupport our research, we introduce CodeSecEval, a meticulously curated dataset\ndesigned to address 44 critical vulnerability types with 180 distinct samples.\nCodeSecEval serves as the foundation for the automatic evaluation of code\nmodels in two crucial tasks: code generation and code repair, with a strong\nemphasis on security. Our experimental results reveal that current models\nfrequently overlook security issues during both code generation and repair\nprocesses, resulting in the creation of vulnerable code. In response, we\npropose different strategies that leverage vulnerability-aware information and\ninsecure code explanations to mitigate these security vulnerabilities.\nFurthermore, our findings highlight that certain vulnerability types\nparticularly challenge model performance, influencing their effectiveness in\nreal-world applications. Based on these findings, we believe our study will\nhave a positive impact on the software engineering community, inspiring the\ndevelopment of improved methods for training and utilizing LLMs, thereby\nleading to safer and more trustworthy model deployment.\n","authors":["Jiexin Wang","Xitong Luo","Liuwen Cao","Hongkui He","Hailin Huang","Jiayuan Xie","Adam Jatowt","Yi Cai"],"pdf_url":"https://arxiv.org/pdf/2407.02395v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.16263"},{"id":"http://arxiv.org/abs/2407.03748v1","updated":"2024-07-04T08:59:17Z","published":"2024-07-04T08:59:17Z","title":"Argument Mining in Data Scarce Settings: Cross-lingual Transfer and\n Few-shot Techniques","summary":" Recent research on sequence labelling has been exploring different strategies\nto mitigate the lack of manually annotated data for the large majority of the\nworld languages. 
Among others, the most successful approaches have been based\non (i) the cross-lingual transfer capabilities of multilingual pre-trained\nlanguage models (model-transfer), (ii) data translation and label projection\n(data-transfer) and (iii), prompt-based learning by reusing the mask objective\nto exploit the few-shot capabilities of pre-trained language models (few-shot).\nPrevious work seems to conclude that model-transfer outperforms data-transfer\nmethods and that few-shot techniques based on prompting are superior to\nupdating the model's weights via fine-tuning. In this paper, we empirically\ndemonstrate that, for Argument Mining, a sequence labelling task which requires\nthe detection of long and complex discourse structures, previous insights on\ncross-lingual transfer or few-shot learning do not apply. Contrary to previous\nwork, we show that for Argument Mining data transfer obtains better results\nthan model-transfer and that fine-tuning outperforms few-shot methods.\nRegarding the former, the domain of the dataset used for data-transfer seems to\nbe a deciding factor, while, for few-shot, the type of task (length and\ncomplexity of the sequence spans) and sampling method prove to be crucial.\n","authors":["Anar Yeginbergen","Maite Oronoz","Rodrigo Agerri"],"pdf_url":"https://arxiv.org/pdf/2407.03748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03734v1","updated":"2024-07-04T08:33:52Z","published":"2024-07-04T08:33:52Z","title":"Improving Self-supervised Pre-training using Accent-Specific Codebooks","summary":" Speech accents present a serious challenge to the performance of\nstate-of-the-art end-to-end Automatic Speech Recognition (ASR) systems. Even\nwith self-supervised learning and pre-training of ASR models, accent invariance\nis seldom achieved. In this work, we propose an accent-aware adaptation\ntechnique for self-supervised learning that introduces a trainable set of\naccent-specific codebooks to the self-supervised architecture. These learnable\ncodebooks enable the model to capture accent specific information during\npre-training, that is further refined during ASR finetuning. On the Mozilla\nCommon Voice dataset, our proposed approach outperforms all other\naccent-adaptation approaches on both seen and unseen English accents, with up\nto 9% relative reduction in word error rate (WER).\n","authors":["Darshan Prabhu","Abhishek Gupta","Omkar Nitsure","Preethi Jyothi","Sriram Ganapathy"],"pdf_url":"https://arxiv.org/pdf/2407.03734v1.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2407.03720v1","updated":"2024-07-04T08:08:33Z","published":"2024-07-04T08:08:33Z","title":"Query-oriented Data Augmentation for Session Search","summary":" Modeling contextual information in a search session has drawn more and more\nattention when understanding complex user intents. Recent methods are all\ndata-driven, i.e., they train different models on large-scale search log data\nto identify the relevance between search contexts and candidate documents. The\ncommon training paradigm is to pair the search context with different candidate\ndocuments and train the model to rank the clicked documents higher than the\nunclicked ones. However, this paradigm neglects the symmetric nature of the\nrelevance between the session context and document, i.e., the clicked documents\ncan also be paired with different search contexts when training. In this work,\nwe propose query-oriented data augmentation to enrich search logs and empower\nthe modeling. 
We generate supplemental training pairs by altering the most\nimportant part of a search context, i.e., the current query, and train our\nmodel to rank the generated sequence along with the original sequence. This\napproach enables models to learn that the relevance of a document may vary as\nthe session context changes, leading to a better understanding of users' search\npatterns. We develop several strategies to alter the current query, resulting\nin new training data with varying degrees of difficulty. Through\nexperimentation on two extensive public search logs, we have successfully\ndemonstrated the effectiveness of our model.\n","authors":["Haonan Chen","Zhicheng Dou","Yutao Zhu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2407.03720v1.pdf","comment":"TKDE 2024"},{"id":"http://arxiv.org/abs/2407.03718v1","updated":"2024-07-04T08:08:12Z","published":"2024-07-04T08:08:12Z","title":"Multi-Convformer: Extending Conformer with Multiple Convolution Kernels","summary":" Convolutions have become essential in state-of-the-art end-to-end Automatic\nSpeech Recognition~(ASR) systems due to their efficient modelling of local\ncontext. Notably, its use in Conformers has led to superior performance\ncompared to vanilla Transformer-based ASR systems. While components other than\nthe convolution module in the Conformer have been reexamined, altering the\nconvolution module itself has been far less explored. Towards this, we\nintroduce Multi-Convformer that uses multiple convolution kernels within the\nconvolution module of the Conformer in conjunction with gating. This helps in\nimproved modeling of local dependencies at varying granularities. Our model\nrivals existing Conformer variants such as CgMLP and E-Branchformer in\nperformance, while being more parameter efficient. We empirically compare our\napproach with Conformer and its variants across four different datasets and\nthree different modelling paradigms and show up to 8% relative word error\nrate~(WER) improvements.\n","authors":["Darshan Prabhu","Yifan Peng","Preethi Jyothi","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2407.03718v1.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2403.17104v3","updated":"2024-07-04T08:07:58Z","published":"2024-03-25T18:41:47Z","title":"Attribute First, then Generate: Locally-attributable Grounded Text\n Generation","summary":" Recent efforts to address hallucinations in Large Language Models (LLMs) have\nfocused on attributed text generation, which supplements generated texts with\ncitations of supporting sources for post-generation fact-checking and\ncorrections. Yet, these citations often point to entire documents or\nparagraphs, burdening users with extensive verification work. In this paper, we\nintroduce a locally-attributable text generation approach, prioritizing concise\nattributions. Our method, named \"Attribute First, then Generate\", breaks down\nthe conventional end-to-end generation process into three intuitive steps:\ncontent selection, sentence planning, and sequential sentence generation. By\ninitially identifying relevant source segments (\"select first\") and then\nconditioning the generation process on them (\"then generate\"), we ensure these\nsegments also act as the output's fine-grained attributions (\"select\" becomes\n\"attribute\"). 
Tested on Multi-document Summarization and Long-form\nQuestion-answering, our method not only yields more concise citations than the\nbaselines but also maintains - and in some cases enhances - both generation\nquality and attribution accuracy. Furthermore, it significantly reduces the\ntime required for fact verification by human assessors.\n","authors":["Aviv Slobodkin","Eran Hirsch","Arie Cattan","Tal Schuster","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2403.17104v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.05881v3","updated":"2024-07-04T07:45:07Z","published":"2024-03-09T11:23:38Z","title":"KG-Rank: Enhancing Large Language Models for Medical QA with Knowledge\n Graphs and Ranking Techniques","summary":" Large language models (LLMs) have demonstrated impressive generative\ncapabilities with the potential to innovate in medicine. However, the\napplication of LLMs in real clinical settings remains challenging due to the\nlack of factual consistency in the generated content. In this work, we develop\nan augmented LLM framework, KG-Rank, which leverages a medical knowledge graph\n(KG) along with ranking and re-ranking techniques, to improve the factuality of\nlong-form question answering (QA) in the medical domain. Specifically, when\nreceiving a question, KG-Rank automatically identifies medical entities within\nthe question and retrieves the related triples from the medical KG to gather\nfactual information. Subsequently, KG-Rank innovatively applies multiple\nranking techniques to refine the ordering of these triples, providing more\nrelevant and precise information for LLM inference. To the best of our\nknowledge, KG-Rank is the first application of KG combined with ranking models\nin medical QA specifically for generating long answers. Evaluation on four\nselected medical QA datasets demonstrates that KG-Rank achieves an improvement\nof over 18% in ROUGE-L score. Additionally, we extend KG-Rank to open domains,\nincluding law, business, music, and history, where it realizes a 14%\nimprovement in ROUGE-L score, indicating the effectiveness and great potential\nof KG-Rank.\n","authors":["Rui Yang","Haoran Liu","Edison Marrese-Taylor","Qingcheng Zeng","Yu He Ke","Wanxin Li","Lechao Cheng","Qingyu Chen","James Caverlee","Yutaka Matsuo","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2403.05881v3.pdf","comment":"12 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2404.13846v3","updated":"2024-07-04T07:40:53Z","published":"2024-04-22T03:05:19Z","title":"Filtered Direct Preference Optimization","summary":" Reinforcement learning from human feedback (RLHF) plays a crucial role in\naligning language models with human preferences. While the significance of\ndataset quality is generally recognized, explicit investigations into its\nimpact within the RLHF framework, to our knowledge, have been limited. This\npaper addresses the issue of text quality within the preference dataset by\nfocusing on direct preference optimization (DPO), an increasingly adopted\nreward-model-free RLHF method. We confirm that text quality significantly\ninfluences the performance of models optimized with DPO more than those\noptimized with reward-model-based RLHF. Building on this new insight, we\npropose an extension of DPO, termed filtered direct preference optimization\n(fDPO). fDPO uses a trained reward model to monitor the quality of texts within\nthe preference dataset during DPO training. 
Samples of lower quality are\ndiscarded based on comparisons with texts generated by the model being\noptimized, resulting in a more accurate dataset. Experimental results\ndemonstrate that fDPO enhances the final model performance. Our code is\navailable at https://github.com/CyberAgentAILab/filtered-dpo.\n","authors":["Tetsuro Morimura","Mitsuki Sakamoto","Yuu Jinnai","Kenshi Abe","Kaito Ariu"],"pdf_url":"https://arxiv.org/pdf/2404.13846v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03689v1","updated":"2024-07-04T07:21:38Z","published":"2024-07-04T07:21:38Z","title":"Text2TimeSeries: Enhancing Financial Forecasting through Time Series\n Prediction Updates with Event-Driven Insights from Large Language Models","summary":" Time series models, typically trained on numerical data, are designed to\nforecast future values. These models often rely on weighted averaging\ntechniques over time intervals. However, real-world time series data is seldom\nisolated and is frequently influenced by non-numeric factors. For instance,\nstock price fluctuations are impacted by daily random events in the broader\nworld, with each event exerting a unique influence on price signals.\nPreviously, forecasts in financial markets have been approached in two main\nways: either as time-series problems over price sequence or sentiment analysis\ntasks. The sentiment analysis tasks aim to determine whether news events will\nhave a positive or negative impact on stock prices, often categorizing them\ninto discrete labels. Recognizing the need for a more comprehensive approach to\naccurately model time series prediction, we propose a collaborative modeling\nframework that incorporates textual information about relevant events for\npredictions. Specifically, we leverage the intuition of large language models\nabout future changes to update real number time series predictions. We\nevaluated the effectiveness of our approach on financial market data.\n","authors":["Litton Jose Kurisinkel","Pruthwik Mishra","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.03689v1.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.03687v1","updated":"2024-07-04T07:17:53Z","published":"2024-07-04T07:17:53Z","title":"STOC-TOT: Stochastic Tree-of-Thought with Constrained Decoding for\n Complex Reasoning in Multi-Hop Question Answering","summary":" Multi-hop question answering (MHQA) requires a model to retrieve and\nintegrate information from multiple passages to answer a complex question.\nRecent systems leverage the power of large language models and integrate\nevidence retrieval with reasoning prompts (e.g., chain-of-thought reasoning)\nfor the MHQA task. However, the complexities in the question types (bridge v.s.\ncomparison questions) and the reasoning types (sequential v.s. parallel\nreasonings) require more novel and fine-grained prompting methods to enhance\nthe performance of MHQA under the zero-shot setting. In this paper, we propose\nSTOC-TOT, a stochastic tree-of-thought reasoning prompting method with\nconstrained decoding for MHQA and conduct a detailed comparison with other\nreasoning prompts on different question types and reasoning types.\nSpecifically, we construct a tree-like reasoning structure by prompting the\nmodel to break down the original question into smaller sub-questions to form\ndifferent reasoning paths. In addition, we prompt the model to provide a\nprobability estimation for each reasoning path at each reasoning step. 
At\nanswer time, we conduct constrained decoding on the model to generate more\ngrounded answers and reduce hallucination. Experiments comparing STOC-TOT with\ntwo MHQA datasets and five large language models showed that our framework\noutperforms other reasoning prompts by a significant margin.\n","authors":["Zhenyu Bi","Daniel Hajialigol","Zhongkai Sun","Jie Hao","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2407.03687v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.04691v4","updated":"2024-07-04T07:05:48Z","published":"2023-12-07T20:42:05Z","title":"Simul-LLM: A Framework for Exploring High-Quality Simultaneous\n Translation with Large Language Models","summary":" Large language models (LLMs) with billions of parameters and pretrained on\nmassive amounts of data are now capable of near or better than state-of-the-art\nperformance in a variety of downstream natural language processing tasks.\nNeural machine translation (NMT) is one such task that LLMs have been applied\nto with great success. However, little research has focused on applying LLMs to\nthe more difficult subset of NMT called simultaneous translation (SimulMT),\nwhere translation begins before the entire source context is available to the\nmodel. In this paper, we address key challenges facing LLMs fine-tuned for\nSimulMT, validate classical SimulMT concepts and practices in the context of\nLLMs, explore adapting LLMs that are fine-tuned for NMT to the task of SimulMT,\nand introduce Simul-LLM, the first open-source fine-tuning and evaluation\npipeline development framework for LLMs focused on SimulMT.\n","authors":["Victor Agostinelli","Max Wild","Matthew Raffel","Kazi Ahmed Asif Fuad","Lizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04691v4.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.03678v1","updated":"2024-07-04T06:52:48Z","published":"2024-07-04T06:52:48Z","title":"Improving Self Consistency in LLMs through Probabilistic Tokenization","summary":" Prior research has demonstrated noticeable performance gains through the use\nof probabilistic tokenizations, an approach that involves employing multiple\ntokenizations of the same input string during the training phase of a language\nmodel. Despite these promising findings, modern large language models (LLMs)\nhave yet to be trained using probabilistic tokenizations. Interestingly, while\nthe tokenizers of these contemporary LLMs have the capability to generate\nmultiple tokenizations, this property remains underutilized.\n In this work, we propose a novel method to leverage the multiple tokenization\ncapabilities of modern LLM tokenizers, aiming to enhance the self-consistency\nof LLMs in reasoning tasks. 
Our experiments indicate that when utilizing\nprobabilistic tokenizations, LLMs generate logically diverse reasoning paths,\nmoving beyond mere surface-level linguistic diversity. We carefully study\nprobabilistic tokenization and offer insights to explain the self-consistency\nimprovements it brings through extensive experimentation on 5 LLM families and\n4 reasoning benchmarks.\n","authors":["Ashutosh Sathe","Divyanshu Aggarwal","Sunayana Sitaram"],"pdf_url":"https://arxiv.org/pdf/2407.03678v1.pdf","comment":"ICML 2024 Workshop on LLMs and Cognition"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.14587v2","updated":"2024-07-04T23:42:09Z","published":"2024-01-26T01:29:37Z","title":"CNG-SFDA: Clean-and-Noisy Region Guided Online-Offline Source-Free\n Domain Adaptation","summary":" Domain shift occurs when training (source) and test (target) data diverge in\ntheir distribution. Source-Free Domain Adaptation (SFDA) addresses this domain\nshift problem, aiming to adapt a model trained on the source domain to the\ntarget domain in a scenario where only a well-trained source model and\nunlabeled target data are available. In this scenario, handling false labels in\nthe target domain is crucial because they negatively impact the model\nperformance. To deal with this problem, we propose to update cluster prototypes\n(i.e., centroid of each sample cluster) and their structure in the target\ndomain formulated by the source model in an online manner. In the feature space,\nsamples in different regions have different pseudo-label distribution\ncharacteristics affected by the cluster prototypes, and we adopt distinct\ntraining strategies for these samples by defining clean and noisy regions: we\nselectively train the target with clean pseudo-labels in the clean region,\nwhereas we introduce mix-up inputs representing intermediate features between\nclean and noisy regions to increase the compactness of the cluster. We\nconducted extensive experiments on multiple datasets in online/offline SFDA\nsettings, whose results demonstrate that our method, CNG-SFDA, achieves\nstate-of-the-art performance in most cases.\n","authors":["Hyeonwoo Cho","Chanmin Park","Donghee Kim","Jinyoung Kim","Won Hwa Kim"],"pdf_url":"https://arxiv.org/pdf/2401.14587v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.04184v1","updated":"2024-07-04T23:13:06Z","published":"2024-07-04T23:13:06Z","title":"QueryMamba: A Mamba-Based Encoder-Decoder Architecture with a\n Statistical Verb-Noun Interaction Module for Video Action Forecasting @ Ego4D\n Long-Term Action Anticipation Challenge 2024","summary":" This report presents a novel Mamba-based encoder-decoder architecture,\nQueryMamba, featuring an integrated verb-noun interaction module that utilizes\na statistical verb-noun co-occurrence matrix to enhance video action\nforecasting. This architecture not only predicts verbs and nouns likely to\noccur based on historical data but also considers their joint occurrence to\nimprove forecast accuracy. 
The efficacy of this approach is substantiated by\nexperimental results, with the method achieving second place in the Ego4D LTA\nchallenge and ranking first in noun prediction accuracy.\n","authors":["Zeyun Zhong","Manuel Martin","Frederik Diederichs","Juergen Beyerer"],"pdf_url":"https://arxiv.org/pdf/2407.04184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04180v1","updated":"2024-07-04T22:52:25Z","published":"2024-07-04T22:52:25Z","title":"Slice-100K: A Multimodal Dataset for Extrusion-based 3D Printing","summary":" G-code (Geometric code) or RS-274 is the most widely used computer numerical\ncontrol (CNC) and 3D printing programming language. G-code provides machine\ninstructions for the movement of the 3D printer, especially for the nozzle,\nstage, and extrusion of material for extrusion-based additive manufacturing.\nCurrently there does not exist a large repository of curated CAD models along\nwith their corresponding G-code files for additive manufacturing. To address\nthis issue, we present SLICE-100K, a first-of-its-kind dataset of over 100,000\nG-code files, along with their tessellated CAD model, LVIS (Large Vocabulary\nInstance Segmentation) categories, geometric properties, and renderings. We\nbuild our dataset from triangulated meshes derived from Objaverse-XL and\nThingi10K datasets. We demonstrate the utility of this dataset by finetuning\nGPT-2 on a subset of the dataset for G-code translation from a legacy G-code\nformat (Sailfish) to a more modern, widely used format (Marlin). SLICE-100K\nwill be the first step in developing a multimodal foundation model for digital\nmanufacturing.\n","authors":["Anushrut Jignasu","Kelly O. Marshall","Ankush Kumar Mishra","Lucas Nerone Rillo","Baskar Ganapathysubramanian","Aditya Balu","Chinmay Hegde","Adarsh Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2407.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04170v1","updated":"2024-07-04T22:09:01Z","published":"2024-07-04T22:09:01Z","title":"Attention Normalization Impacts Cardinality Generalization in Slot\n Attention","summary":" Object-centric scene decompositions are important representations for\ndownstream tasks in fields such as computer vision and robotics. The recently\nproposed Slot Attention module, already leveraged by several derivative works\nfor image segmentation and object tracking in videos, is a deep learning\ncomponent which performs unsupervised object-centric scene decomposition on\ninput images. It is based on an attention architecture, in which latent slot\nvectors, which hold compressed information on objects, attend to localized\nperceptual features from the input image. In this paper, we show that design\ndecisions on normalizing the aggregated values in the attention architecture\nhave considerable impact on the capabilities of Slot Attention to generalize to\na higher number of slots and objects as seen during training. We argue that the\noriginal Slot Attention normalization scheme discards information on the prior\nassignment probability of pixels to slots, which impairs its generalization\ncapabilities. 
Based on these findings, we propose and investigate alternative\nnormalization approaches which increase the generalization capabilities of Slot\nAttention to varying slot and object counts, resulting in performance gains on\nthe task of unsupervised image segmentation.\n","authors":["Markus Krimmel","Jan Achterhold","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2407.04170v1.pdf","comment":"24 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2407.04169v1","updated":"2024-07-04T22:01:21Z","published":"2024-07-04T22:01:21Z","title":"Solutions to Deepfakes: Can Camera Hardware, Cryptography, and Deep\n Learning Verify Real Images?","summary":" The exponential progress in generative AI poses serious implications for the\ncredibility of all real images and videos. There will exist a point in the\nfuture where 1) digital content produced by generative AI will be\nindistinguishable from that created by cameras, 2) high-quality generative\nalgorithms will be accessible to anyone, and 3) the ratio of all synthetic to\nreal images will be large. It is imperative to establish methods that can\nseparate real data from synthetic data with high confidence. We define real\nimages as those that were produced by the camera hardware, capturing a\nreal-world scene. Any synthetic generation of an image or alteration of a real\nimage through generative AI or computer graphics techniques is labeled as a\nsynthetic image. To this end, this document aims to: present known strategies\nin detection and cryptography that can be employed to verify which images are\nreal, weigh the strengths and weaknesses of these strategies, and suggest\nadditional improvements to alleviate shortcomings.\n","authors":["Alexander Vilesov","Yuan Tian","Nader Sehatbakhsh","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2407.04169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10175v2","updated":"2024-07-04T21:01:48Z","published":"2023-12-15T19:57:07Z","title":"ALOHA: from Attention to Likes -- a unified mOdel for understanding\n HumAn responses to diverse visual content","summary":" Progress in human behavior modeling involves understanding both implicit,\nearly-stage perceptual behavior such as human attention and explicit,\nlater-stage behavior such as subjective preferences/likes. Yet, most prior\nresearch has focused on modeling implicit and explicit human behavior in\nisolation, and is often limited to a specific type of visual content. Can we build\na unified model of human attention and preference behavior that works reliably\nacross diverse types of visual content? Such a model would enable predicting\nsubjective feedback such as satisfaction or aesthetic quality, along with the\nunderlying human attention or interaction heatmaps and viewing order, enabling\ndesigners and content-creation models to optimize their creation for\nhuman-centric improvements. In this paper, we propose ALOHA -- a unified model\nfor understanding human responses from attention to likes, across diverse\nvisual content. ALOHA leverages a multimodal transformer featuring distinct\nprediction heads for each facet, and predicts different human responses such as\nattention heatmaps, scanpath or viewing order, as well as subjective\nrating/preference. 
We train ALOHA on diverse public datasets spanning natural\nimages, webpages and graphic designs, and achieve SOTA performance on multiple\nbenchmarks across different image domains and various behavior modeling tasks.\nPotential applications include providing instant feedback on the effectiveness\nof UIs/designs/images, and serving as a reward model to further optimize\nvisual-content creation.\n","authors":["Peizhao Li","Junfeng He","Gang Li","Rachit Bhargava","Shaolei Shen","Nachiappan Valliappan","Youwei Liang","Hongxiang Gu","Venky Ramachandran","Golnaz Farhadi","Yang Li","Kai J Kohlhoff","Vidhya Navalpakkam"],"pdf_url":"https://arxiv.org/pdf/2312.10175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04149v1","updated":"2024-07-04T20:53:19Z","published":"2024-07-04T20:53:19Z","title":"SineKAN: Kolmogorov-Arnold Networks Using Sinusoidal Activation\n Functions","summary":" Recent work has established an alternative to traditional multi-layer\nperceptron neural networks in the form of Kolmogorov-Arnold Networks (KAN). The\ngeneral KAN framework uses learnable activation functions on the edges of the\ncomputational graph followed by summation on nodes. The learnable edge\nactivation functions in the original implementation are basis spline functions\n(B-Spline). Here, we present a model in which learnable grids of B-Spline\nactivation functions can be replaced by grids of re-weighted sine functions. We\nshow that this leads to better or comparable numerical performance to B-Spline\nKAN models on the MNIST benchmark, while also providing a substantial speed\nincrease on the order of 4-9 times.\n","authors":["Eric A. F. Reinhardt","Sergei Gleyzer"],"pdf_url":"https://arxiv.org/pdf/2407.04149v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2210.15701v2","updated":"2024-07-04T20:25:36Z","published":"2022-10-27T18:03:37Z","title":"Do Pre-trained Models Benefit Equally in Continual Learning?","summary":" Existing work on continual learning (CL) is primarily devoted to developing\nalgorithms for models trained from scratch. Despite their encouraging\nperformance on contrived benchmarks, these algorithms show dramatic performance\ndrops in real-world scenarios. Therefore, this paper advocates the systematic\nintroduction of pre-training to CL, which is a general recipe for transferring\nknowledge to downstream tasks but is substantially missing in the CL community.\nOur investigation reveals the multifaceted complexity of exploiting pre-trained\nmodels for CL, along three different axes, pre-trained models, CL algorithms,\nand CL scenarios. Perhaps most intriguingly, improvements in CL algorithms from\npre-training are very inconsistent an underperforming algorithm could become\ncompetitive and even state-of-the-art when all algorithms start from a\npre-trained model. This indicates that the current paradigm, where all CL\nmethods are compared in from-scratch training, is not well reflective of the\ntrue CL objective and desired progress. In addition, we make several other\nimportant observations, including that CL algorithms that exert less\nregularization benefit more from a pre-trained model; and that a stronger\npre-trained model such as CLIP does not guarantee a better improvement. Based\non these findings, we introduce a simple yet effective baseline that employs\nminimum regularization and leverages the more beneficial pre-trained model,\ncoupled with a two-stage training pipeline. 
We recommend including this strong\nbaseline in the future development of CL algorithms, due to its demonstrated\nstate-of-the-art performance.\n","authors":["Kuan-Ying Lee","Yuanyi Zhong","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2210.15701v2.pdf","comment":"Accepted to WACV 2023. Project page:\n https://kylee5.web.illinois.edu/publication/WACV23/"},{"id":"http://arxiv.org/abs/2407.04127v1","updated":"2024-07-04T19:00:34Z","published":"2024-07-04T19:00:34Z","title":"Biometric Authentication Based on Enhanced Remote Photoplethysmography\n Signal Morphology","summary":" Remote photoplethysmography (rPPG) is a non-contact method for measuring\ncardiac signals from facial videos, offering a convenient alternative to\ncontact photoplethysmography (cPPG) obtained from contact sensors. Recent\nstudies have shown that each individual possesses a unique cPPG signal\nmorphology that can be utilized as a biometric identifier, which has inspired\nus to utilize the morphology of rPPG signals extracted from facial videos for\nperson authentication. Since the facial appearance and rPPG are mixed in the\nfacial videos, we first de-identify facial videos to remove facial appearance\nwhile preserving the rPPG information, which protects facial privacy and\nguarantees that only rPPG is used for authentication. The de-identified videos\nare fed into an rPPG model to get the rPPG signal morphology for\nauthentication. In the first training stage, unsupervised rPPG training is\nperformed to get coarse rPPG signals. In the second training stage, an\nrPPG-cPPG hybrid training is performed by incorporating external cPPG datasets\nto achieve rPPG biometric authentication and enhance rPPG signal morphology.\nOur approach needs only de-identified facial videos with subject IDs to train\nrPPG authentication models. The experimental results demonstrate that rPPG\nsignal morphology hidden in facial videos can be used for biometric\nauthentication. The code is available at\nhttps://github.com/zhaodongsun/rppg_biometrics.\n","authors":["Zhaodong Sun","Xiaobai Li","Jukka Komulainen","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.04127v1.pdf","comment":"accepted by IJCB 2024"},{"id":"http://arxiv.org/abs/2311.03355v2","updated":"2024-07-04T18:59:18Z","published":"2023-11-06T18:59:57Z","title":"SegGen: Supercharging Segmentation Models with Text2Mask and Mask2Img\n Synthesis","summary":" We propose SegGen, a highly-effective training data generation method for\nimage segmentation, which pushes the performance limits of state-of-the-art\nsegmentation models to a significant extent. SegGen designs and integrates two\ndata generation strategies: MaskSyn and ImgSyn. (i) MaskSyn synthesizes new\nmask-image pairs via our proposed text-to-mask generation model and\nmask-to-image generation model, greatly improving the diversity in segmentation\nmasks for model supervision; (ii) ImgSyn synthesizes new images based on\nexisting masks using the mask-to-image generation model, strongly improving\nimage diversity for model inputs. On the highly competitive ADE20K and COCO\nbenchmarks, our data generation method markedly improves the performance of\nstate-of-the-art segmentation models in semantic segmentation, panoptic\nsegmentation, and instance segmentation. Notably, in terms of the ADE20K mIoU,\nMask2Former R50 is largely boosted from 47.2 to 49.9 (+2.7); Mask2Former Swin-L\nis also significantly increased from 56.1 to 57.4 (+1.3). 
These promising\nresults strongly suggest the effectiveness of our SegGen even when abundant\nhuman-annotated training data is utilized. Moreover, training with our\nsynthetic data makes the segmentation models more robust towards unseen\ndomains. Project website: https://seggenerator.github.io\n","authors":["Hanrong Ye","Jason Kuen","Qing Liu","Zhe Lin","Brian Price","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2311.03355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04119v1","updated":"2024-07-04T18:40:50Z","published":"2024-07-04T18:40:50Z","title":"An Autoencoder Architecture for L-band Passive Microwave Retrieval of\n Landscape Freeze-Thaw Cycle","summary":" Estimating the landscape and soil freeze-thaw (FT) dynamics in the Northern\nHemisphere is crucial for understanding permafrost response to global warming\nand changes in regional and global carbon budgets. A new framework is presented\nfor surface FT-cycle retrievals using L-band microwave radiometry based on a\ndeep convolutional autoencoder neural network. This framework defines the\nlandscape FT-cycle retrieval as a time series anomaly detection problem\nconsidering the frozen states as normal and thawed states as anomalies. The\nautoencoder retrieves the FT-cycle probabilistically through supervised\nreconstruction of the brightness temperature (TB) time series using a\ncontrastive loss function that minimizes (maximizes) the reconstruction error\nfor the peak winter (summer). Using the data provided by the Soil Moisture\nActive Passive (SMAP) satellite, it is demonstrated that the framework learns\nto isolate the landscape FT states over different land surface types with\nvarying complexities related to the radiometric characteristics of snow cover,\nlake-ice phenology, and vegetation canopy. The consistency of the retrievals is\nevaluated over Alaska, against in situ ground-based observations, showing\nreduced uncertainties compared to the traditional methods that use thresholding\nof the normalized polarization ratio.\n","authors":["Divya Kumawat","Ardeshir Ebtehaj","Xiaolan Xu","Andreas Colliander","Vipin Kumar"],"pdf_url":"https://arxiv.org/pdf/2407.04119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04106v1","updated":"2024-07-04T18:21:10Z","published":"2024-07-04T18:21:10Z","title":"MiniGPT-Med: Large Language Model as a General Interface for Radiology\n Diagnosis","summary":" Recent advancements in artificial intelligence (AI) have precipitated\nsignificant breakthroughs in healthcare, particularly in refining diagnostic\nprocedures. However, previous studies have often been constrained to limited\nfunctionalities. This study introduces MiniGPT-Med, a vision-language model\nderived from large-scale language models and tailored for medical applications.\nMiniGPT-Med demonstrates remarkable versatility across various imaging\nmodalities, including X-rays, CT scans, and MRIs, enhancing its utility. The\nmodel is capable of performing tasks such as medical report generation, visual\nquestion answering (VQA), and disease identification within medical imagery.\nIts integrated processing of both image and textual clinical data markedly\nimproves diagnostic accuracy. Our empirical assessments confirm MiniGPT-Med's\nsuperior performance in disease grounding, medical report generation, and VQA\nbenchmarks, representing a significant step towards reducing the gap in\nassisting radiology practice. 
Furthermore, it achieves state-of-the-art\nperformance on medical report generation, higher than the previous best model\nby 19\\% accuracy. MiniGPT-Med promises to become a general interface for\nradiology diagnoses, enhancing diagnostic efficiency across a wide range of\nmedical imaging applications.\n","authors":["Asma Alkhaldi","Raneem Alnajim","Layan Alabdullatef","Rawan Alyahya","Jun Chen","Deyao Zhu","Ahmed Alsinan","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2407.04106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04103v1","updated":"2024-07-04T18:06:48Z","published":"2024-07-04T18:06:48Z","title":"Advances in Diffusion Models for Image Data Augmentation: A Review of\n Methods, Models, Evaluation Metrics and Future Research Directions","summary":" Image data augmentation constitutes a critical methodology in modern computer\nvision tasks, since it can facilitate towards enhancing the diversity and\nquality of training datasets; thereby, improving the performance and robustness\nof machine learning models in downstream tasks. In parallel, augmentation\napproaches can also be used for editing/modifying a given image in a context-\nand semantics-aware way. Diffusion Models (DMs), which comprise one of the most\nrecent and highly promising classes of methods in the field of generative\nArtificial Intelligence (AI), have emerged as a powerful tool for image data\naugmentation, capable of generating realistic and diverse images by learning\nthe underlying data distribution. The current study realizes a systematic,\ncomprehensive and in-depth review of DM-based approaches for image\naugmentation, covering a wide range of strategies, tasks and applications. In\nparticular, a comprehensive analysis of the fundamental principles, model\narchitectures and training strategies of DMs is initially performed.\nSubsequently, a taxonomy of the relevant image augmentation methods is\nintroduced, focusing on techniques regarding semantic manipulation,\npersonalization and adaptation, and application-specific augmentation tasks.\nThen, performance assessment methodologies and respective evaluation metrics\nare analyzed. Finally, current challenges and future research directions in the\nfield are discussed.\n","authors":["Panagiotis Alimisis","Ioannis Mademlis","Panagiotis Radoglou-Grammatikis","Panagiotis Sarigiannidis","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2407.04103v1.pdf","comment":"53 pages, 15 figures"},{"id":"http://arxiv.org/abs/2407.04100v1","updated":"2024-07-04T18:03:45Z","published":"2024-07-04T18:03:45Z","title":"C$^3$DG: Conditional Domain Generalization for Hyperspectral Imagery\n Classification with Convergence and Constrained-risk Theories","summary":" Hyperspectral imagery (HSI) classification may suffer the challenge of\nhyperspectral-monospectra, where different classes present similar spectra.\nJoint spatial-spectral feature extraction is a popular solution for the\nproblem, but this strategy tends to inflate accuracy since test pixels may\nexist in training patches. Domain generalization methods show promising\npotential, but they still fail to distinguish similar spectra across varying\ndomains, in addition, the theoretical support is usually ignored. In this\npaper, we only rely on spectral information to solve the\nhyperspectral-monospectra problem, and propose a Convergence and\nError-Constrained Conditional Domain Generalization method for Hyperspectral\nImagery Classification (C$^3$DG). 
The major contributions of this paper include\ntwo aspects: the Conditional Revising Inference Block (CRIB), and the\ncorresponding theories for model convergence and generalization errors. CRIB is\nthe kernel structure of the proposed method, which employs a shared encoder and\nmulti-branch decoders to fully leverage the conditional distribution during\ntraining, achieving a decoupling that aligns with the generation mechanisms of\nHSI. Moreover, to ensure model convergence and maintain controllable error, we\npropose the optimization convergence theorem and risk upper bound theorem. In\nthe optimization convergence theorem, we ensure the model convergence by\ndemonstrating that the gradients of the loss terms are not contradictory. In\nthe risk upper bound theorem, our theoretical analysis explores the\nrelationship between test-time training and recent related work to establish a\nconcrete bound for error. Experimental results on three benchmark datasets\nindicate the superiority of C$^3$DG.\n","authors":["Zhe Gao","Bin Pan","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2407.04100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04092v1","updated":"2024-07-04T17:59:26Z","published":"2024-07-04T17:59:26Z","title":"Looking for Tiny Defects via Forward-Backward Feature Transfer","summary":" Motivated by efficiency requirements, most anomaly detection and segmentation\n(AD&S) methods focus on processing low-resolution images, e.g., $224\\times 224$\npixels, obtained by downsampling the original input images. In this setting,\ndownsampling is typically applied also to the provided ground-truth defect\nmasks. Yet, as numerous industrial applications demand identification of both\nlarge and tiny defects, the above-described protocol may fall short in\nproviding a realistic picture of the actual performance attainable by current\nmethods. Hence, in this work, we introduce a novel benchmark that evaluates\nmethods on the original, high-resolution image and ground-truth masks, focusing\non segmentation performance as a function of the size of anomalies. Our\nbenchmark includes a metric that captures robustness with respect to defect\nsize, i.e., the ability of a method to preserve good localization from large\nanomalies to tiny ones. Furthermore, we introduce an AD&S approach based on a\nnovel Teacher-Student paradigm which relies on two shallow MLPs (the Students)\nthat learn to transfer patch features across the layers of a frozen vision\ntransformer (the Teacher). By means of our benchmark, we evaluate our proposal\nand other recent AD&S methods on high-resolution inputs containing large and\ntiny defects. Our proposal features the highest robustness to defect size, runs\nat the fastest speed, yields state-of-the-art performance on the MVTec AD\ndataset and state-of-the-art segmentation performance on the VisA dataset.\n","authors":["Alex Costanzino","Pierluigi Zama Ramirez","Giuseppe Lisanti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2407.04092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04086v1","updated":"2024-07-04T17:56:04Z","published":"2024-07-04T17:56:04Z","title":"Certifiably Robust Image Watermark","summary":" Generative AI raises many societal concerns such as boosting disinformation\nand propaganda campaigns. Watermarking AI-generated content is a key technology\nto address these concerns and has been widely deployed in industry. However,\nwatermarking is vulnerable to removal attacks and forgery attacks. 
In this\nwork, we propose the first image watermarks with certified robustness\nguarantees against removal and forgery attacks. Our method leverages randomized\nsmoothing, a popular technique to build certifiably robust classifiers and\nregression models. Our major technical contributions include extending\nrandomized smoothing to watermarking by considering its unique characteristics,\nderiving the certified robustness guarantees, and designing algorithms to\nestimate them. Moreover, we extensively evaluate our image watermarks in terms\nof both certified and empirical robustness. Our code is available at\n\\url{https://github.com/zhengyuan-jiang/Watermark-Library}.\n","authors":["Zhengyuan Jiang","Moyang Guo","Yuepeng Hu","Jinyuan Jia","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2407.04086v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2407.04085v1","updated":"2024-07-04T17:53:37Z","published":"2024-07-04T17:53:37Z","title":"FIPGNet:Pyramid grafting network with feature interaction strategies","summary":" Salient object detection is designed to identify the objects in an image that\nattract the most visual attention. Currently, the most advanced methods of\nsalient object detection adopt a pyramid grafting network\narchitecture. However, the pyramid grafting network architecture still has the\nproblem of failing to accurately locate salient targets. We observe that this is\nmainly due to the fact that current salient object detection methods simply\naggregate different scale features, ignoring the correlation between different\nscale features. To overcome these problems, we propose a new salient object\ndetection framework (FIPGNet), which is a pyramid grafting network with feature\ninteraction strategies. Specifically, we propose an attention-mechanism based\nfeature interaction strategy (FIA) that innovatively introduces spatial agent\nCross Attention (SACA) to achieve multi-level feature interaction, highlighting\nimportant spatial regions from a spatial perspective, thereby enhancing salient\nregions. We also design the channel proxy Cross Attention Module (CCM), which is\nused to effectively connect the features extracted by the backbone network and the\nfeatures processed using the spatial proxy cross attention module, eliminating\ninconsistencies. Finally, under the action of these two modules, the salient\ntarget localization problem in the current pyramid grafting network model is\nsolved. Experimental results on six challenging datasets show that the proposed\nmethod outperforms 12 current salient object detection methods on four\nevaluation metrics.\n","authors":["Ziyi Ding","Like Xin"],"pdf_url":"https://arxiv.org/pdf/2407.04085v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2309.08365 by other authors"},{"id":"http://arxiv.org/abs/2312.09181v3","updated":"2024-07-04T17:52:58Z","published":"2023-12-14T17:48:09Z","title":"Improving Efficiency of Diffusion Models via Multi-Stage Framework and\n Tailored Multi-Decoder Architectures","summary":" Diffusion models, emerging as powerful deep generative tools, excel in\nvarious applications. They operate through a two-step process: introducing\nnoise into training samples and then employing a model to convert random noise\ninto new samples (e.g., images). However, their remarkable generative\nperformance is hindered by slow training and sampling. 
This is due to the\nnecessity of tracking extensive forward and reverse diffusion trajectories, and\nemploying a large model with numerous parameters across multiple timesteps\n(i.e., noise levels). To tackle these challenges, we present a multi-stage\nframework inspired by our empirical findings. These observations indicate the\nadvantages of employing distinct parameters tailored to each timestep while\nretaining universal parameters shared across all time steps. Our approach\ninvolves segmenting the time interval into multiple stages where we employ\ncustom multi-decoder U-net architecture that blends time-dependent models with\na universally shared encoder. Our framework enables the efficient distribution\nof computational resources and mitigates inter-stage interference, which\nsubstantially improves training efficiency. Extensive numerical experiments\naffirm the effectiveness of our framework, showcasing significant training and\nsampling efficiency enhancements on three state-of-the-art diffusion models,\nincluding large-scale latent diffusion models. Furthermore, our ablation\nstudies illustrate the impact of two important components in our framework: (i)\na novel timestep clustering algorithm for stage division, and (ii) an\ninnovative multi-decoder U-net architecture, seamlessly integrating universal\nand customized hyperparameters.\n","authors":["Huijie Zhang","Yifu Lu","Ismail Alkhouri","Saiprasad Ravishankar","Dogyoon Song","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2312.09181v3.pdf","comment":"The IEEE/CVF Conference on Computer Vision and Pattern Recognition\n (CVPR) 2024"},{"id":"http://arxiv.org/abs/2407.04068v1","updated":"2024-07-04T17:14:18Z","published":"2024-07-04T17:14:18Z","title":"CLIP-DR: Textual Knowledge-Guided Diabetic Retinopathy Grading with\n Ranking-aware Prompting","summary":" Diabetic retinopathy (DR) is a complication of diabetes and usually takes\ndecades to reach sight-threatening levels. Accurate and robust detection of DR\nseverity is critical for the timely management and treatment of diabetes.\nHowever, most current DR grading methods suffer from insufficient robustness to\ndata variability (\\textit{e.g.} colour fundus images), posing a significant\ndifficulty for accurate and robust grading. In this work, we propose a novel DR\ngrading framework CLIP-DR based on three observations: 1) Recent pre-trained\nvisual language models, such as CLIP, showcase a notable capacity for\ngeneralisation across various downstream tasks, serving as effective baseline\nmodels. 2) The grading of image-text pairs for DR often adheres to a\ndiscernible natural sequence, yet most existing DR grading methods have\nprimarily overlooked this aspect. 3) A long-tailed distribution among DR\nseverity levels complicates the grading process. This work proposes a novel\nranking-aware prompting strategy to help the CLIP model exploit the ordinal\ninformation. Specifically, we sequentially design learnable prompts between\nneighbouring text-image pairs in two different ranking directions.\nAdditionally, we introduce a Similarity Matrix Smooth module into the structure\nof CLIP to balance the class distribution. Finally, we perform extensive\ncomparisons with several state-of-the-art methods on the GDRBench benchmark,\ndemonstrating our CLIP-DR's robustness and superior performance. 
The\nimplementation code is available\n\\footnote{\\url{https://github.com/Qinkaiyu/CLIP-DR}}.\n","authors":["Qinkai Yu","Jianyang Xie","Anh Nguyen","He Zhao","Jiong Zhang","Huazhu Fu","Yitian Zhao","Yalin Zheng","Yanda Meng"],"pdf_url":"https://arxiv.org/pdf/2407.04068v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.04066v1","updated":"2024-07-04T17:13:06Z","published":"2024-07-04T17:13:06Z","title":"EMPL: A novel Efficient Meta Prompt Learning Framework for Few-shot\n Unsupervised Domain Adaptation","summary":" Few-shot unsupervised domain adaptation (FS-UDA) utilizes few-shot labeled\nsource domain data to realize effective classification in the unlabeled target\ndomain. However, current FS-UDA methods still suffer from two issues: 1)\nthe data from different domains cannot be effectively aligned by few-shot\nlabeled data due to the large domain gaps, 2) it is unstable and time-consuming\nto generalize to new FS-UDA tasks. To address these issues, we put forward a novel\nEfficient Meta Prompt Learning Framework for FS-UDA. Within this framework, we\nuse the pre-trained CLIP model as the feature learning base model. First, we design\ndomain-shared prompt learning vectors composed of virtual tokens, which mainly\nlearn the meta knowledge from a large number of meta tasks to mitigate domain\ngaps. Secondly, we also design a task-shared prompt learning network to\nadaptively learn specific prompt vectors for each task, which aims to realize\nfast adaptation and task generalization. Thirdly, we learn a task-specific\ncross-domain alignment projection and a task-specific classifier with\nclosed-form solutions for each meta task, which can efficiently adapt the model\nto new tasks in one step. The whole learning process is formulated as a bilevel\noptimization problem, and a good initialization of model parameters is learned\nthrough meta-learning. Extensive experimental study demonstrates the promising\nperformance of our framework on benchmark datasets. Our method achieves a large\nimprovement of at least 15.4% on 5-way 1-shot and 8.7% on 5-way 5-shot,\ncompared with the state-of-the-art methods. Also, the performance of our method\non all the test tasks is more stable than that of the other methods.\n","authors":["Wanqi Yang","Haoran Wang","Lei Wang","Ge Song","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2407.04066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04061v1","updated":"2024-07-04T17:06:16Z","published":"2024-07-04T17:06:16Z","title":"Detect Closer Surfaces that can be Seen: New Modeling and Evaluation in\n Cross-domain 3D Object Detection","summary":" The performance of domain adaptation technologies has not yet reached an\nideal level in the current 3D object detection field for autonomous driving,\nwhich is mainly due to significant differences in the size of vehicles, as well\nas the environments they operate in when applied across domains. These factors\ntogether hinder the effective transfer and application of knowledge learned\nfrom specific datasets. Since the existing evaluation metrics are initially\ndesigned for evaluation on a single domain by calculating the 2D or 3D overlap\nbetween the prediction and ground-truth bounding boxes, they often suffer from\nthe overfitting problem caused by the size differences among datasets. 
This\nraises a fundamental question related to the evaluation of the 3D object\ndetection models' cross-domain performance: Do we really need models to\nmaintain excellent performance in their original 3D bounding boxes after being\napplied across domains? From a practical application perspective, one of our\nmain focuses is actually on preventing collisions between vehicles and other\nobstacles, especially in cross-domain scenarios where correctly predicting the\nsize of vehicles is much more difficult. In other words, as long as a model can\naccurately identify the closest surfaces to the ego vehicle, it is sufficient\nto effectively avoid obstacles. In this paper, we propose two metrics to\nmeasure 3D object detection models' ability of detecting the closer surfaces to\nthe sensor on the ego vehicle, which can be used to evaluate their cross-domain\nperformance more comprehensively and reasonably. Furthermore, we propose a\nrefinement head, named EdgeHead, to guide models to focus more on the learnable\ncloser surfaces, which can greatly improve the cross-domain performance of\nexisting models not only under our new metrics, but even also under the\noriginal BEV/3D metrics.\n","authors":["Ruixiao Zhang","Yihong Wu","Juheon Lee","Adam Prugel-Bennett","Xiaohao Cai"],"pdf_url":"https://arxiv.org/pdf/2407.04061v1.pdf","comment":"Accepted by the 27th European Conference on Artificial Intelligence\n (ECAI 2024)"},{"id":"http://arxiv.org/abs/2407.04049v1","updated":"2024-07-04T16:46:22Z","published":"2024-07-04T16:46:22Z","title":"Occupancy as Set of Points","summary":" In this paper, we explore a novel point representation for 3D occupancy\nprediction from multi-view images, which is named Occupancy as Set of Points.\nExisting camera-based methods tend to exploit dense volume-based representation\nto predict the occupancy of the whole scene, making it hard to focus on the\nspecial areas or areas out of the perception range. In comparison, we present\nthe Points of Interest (PoIs) to represent the scene and propose OSP, a novel\nframework for point-based 3D occupancy prediction. Owing to the inherent\nflexibility of the point-based representation, OSP achieves strong performance\ncompared with existing methods and excels in terms of training and inference\nadaptability. It extends beyond traditional perception boundaries and can be\nseamlessly integrated with volume-based methods to significantly enhance their\neffectiveness. Experiments on the Occ3D nuScenes occupancy benchmark show that\nOSP has strong performance and flexibility. Code and models are available at\n\\url{https://github.com/hustvl/osp}.\n","authors":["Yiang Shi","Tianheng Cheng","Qian Zhang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.04049v1.pdf","comment":"Accepted by ECCV 2024. Code and models: https://github.com/hustvl/osp"},{"id":"http://arxiv.org/abs/2407.04041v1","updated":"2024-07-04T16:29:05Z","published":"2024-07-04T16:29:05Z","title":"Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation","summary":" Depth estimation is a cornerstone for autonomous driving, yet acquiring\nper-pixel depth ground truth for supervised learning is challenging.\nSelf-Supervised Surround Depth Estimation (SSSDE) from consecutive images\noffers an economical alternative. While previous SSSDE methods have proposed\ndifferent mechanisms to fuse information across images, few of them explicitly\nconsider the cross-view constraints, leading to inferior performance,\nparticularly in overlapping regions. 
This paper proposes an efficient and\nconsistent pose estimation design and two loss functions to enhance cross-view\nconsistency for SSSDE. For pose estimation, we propose to use only front-view\nimages to reduce training memory and sustain pose estimation consistency. The\nfirst loss function is the dense depth consistency loss, which penalizes the\ndifference between predicted depths in overlapping regions. The second one is\nthe multi-view reconstruction consistency loss, which aims to maintain\nconsistency between reconstruction from spatial and spatial-temporal contexts.\nAdditionally, we introduce a novel flipping augmentation to improve the\nperformance further. Our techniques enable a simple neural model to achieve\nstate-of-the-art performance on the DDAD and nuScenes datasets. Last but not\nleast, our proposed techniques can be easily applied to other methods. The code\nwill be made public.\n","authors":["Laiyan Ding","Hualie Jiang","Jie Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04036v1","updated":"2024-07-04T16:21:29Z","published":"2024-07-04T16:21:29Z","title":"Beyond Pixels: Semi-Supervised Semantic Segmentation with a Multi-scale\n Patch-based Multi-Label Classifier","summary":" Incorporating pixel contextual information is critical for accurate\nsegmentation. In this paper, we show that an effective way to incorporate\ncontextual information is through a patch-based classifier. This patch\nclassifier is trained to identify classes present within an image region, which\nfacilitates the elimination of distractors and enhances the classification of\nsmall object segments. Specifically, we introduce Multi-scale Patch-based\nMulti-label Classifier (MPMC), a novel plug-in module designed for existing\nsemi-supervised segmentation (SSS) frameworks. MPMC offers patch-level\nsupervision, enabling the discrimination of pixel regions of different classes\nwithin a patch. Furthermore, MPMC learns an adaptive pseudo-label weight, using\npatch-level classification to alleviate the impact of the teacher's noisy\npseudo-label supervision the student. This lightweight module can be integrated\ninto any SSS framework, significantly enhancing their performance. We\ndemonstrate the efficacy of our proposed MPMC by integrating it into four SSS\nmethodologies and improving them across two natural image and one medical\nsegmentation dataset, notably improving the segmentation results of the\nbaselines across all the three datasets.\n","authors":["Prantik Howlader","Srijan Das","Hieu Le","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2407.04036v1.pdf","comment":"to be published in ECCV24"},{"id":"http://arxiv.org/abs/2308.09908v3","updated":"2024-07-04T16:18:04Z","published":"2023-08-19T05:15:02Z","title":"LEGO: Learning and Graph-Optimized Modular Tracker for Online\n Multi-Object Tracking with Point Clouds","summary":" Online multi-object tracking (MOT) plays a pivotal role in autonomous\nsystems. The state-of-the-art approaches usually employ a tracking-by-detection\nmethod, and data association plays a critical role. This paper proposes a\nlearning and graph-optimized (LEGO) modular tracker to improve data association\nperformance in the existing literature. The proposed LEGO tracker integrates\ngraph optimization and self-attention mechanisms, which efficiently formulate\nthe association score map, facilitating the accurate and efficient matching of\nobjects across time frames. 
To further enhance the state update process, the\nKalman filter is added to ensure consistent tracking by incorporating temporal\ncoherence in the object states. Our proposed method utilizing LiDAR alone has\nshown exceptional performance compared to other online tracking approaches,\nincluding LiDAR-based and LiDAR-camera fusion-based methods. LEGO ranked 1st at\nthe time of submitting results to KITTI object tracking evaluation ranking\nboard and remains 2nd at the time of submitting this paper, among all online\ntrackers in the KITTI MOT benchmark for cars1\n","authors":["Zhenrong Zhang","Jianan Liu","Yuxuan Xia","Tao Huang","Qing-Long Han","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09908v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04024v1","updated":"2024-07-04T16:09:52Z","published":"2024-07-04T16:09:52Z","title":"Adaptive Step-size Perception Unfolding Network with Non-local Hybrid\n Attention for Hyperspectral Image Reconstruction","summary":" Deep unfolding methods and transformer architecture have recently shown\npromising results in hyperspectral image (HSI) reconstruction. However, there\nstill exist two issues: (1) in the data subproblem, most methods represents the\nstepsize utilizing a learnable parameter. Nevertheless, for different spectral\nchannel, error between features and ground truth is unequal. (2) Transformer\nstruggles to balance receptive field size with pixel-wise detail information.\nTo overcome the aforementioned drawbacks, We proposed an adaptive step-size\nperception unfolding network (ASPUN), a deep unfolding network based on FISTA\nalgorithm, which uses an adaptive step-size perception module to estimate the\nupdate step-size of each spectral channel. In addition, we design a Non-local\nHybrid Attention Transformer(NHAT) module for fully leveraging the receptive\nfield advantage of transformer. By plugging the NLHA into the Non-local\nInformation Aggregation (NLIA) module, the unfolding network can achieve better\nreconstruction results. Experimental results show that our ASPUN is superior to\nthe existing SOTA algorithms and achieves the best performance.\n","authors":["Yanan Yang","Like Xin"],"pdf_url":"https://arxiv.org/pdf/2407.04024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11061v3","updated":"2024-07-04T16:08:38Z","published":"2024-01-19T23:34:48Z","title":"PhotoBot: Reference-Guided Interactive Photography via Natural Language","summary":" We introduce PhotoBot, a framework for fully automated photo acquisition\nbased on an interplay between high-level human language guidance and a robot\nphotographer. We propose to communicate photography suggestions to the user via\nreference images that are selected from a curated gallery. We leverage a visual\nlanguage model (VLM) and an object detector to characterize the reference\nimages via textual descriptions and then use a large language model (LLM) to\nretrieve relevant reference images based on a user's language query through\ntext-based reasoning. To correspond the reference image and the observed scene,\nwe exploit pre-trained features from a vision transformer capable of capturing\nsemantic similarity across marked appearance variations. Using these features,\nwe compute suggested pose adjustments for an RGB-D camera by solving a\nperspective-n-point (PnP) problem. We demonstrate our approach using a\nmanipulator equipped with a wrist camera. 
Our user studies show that photos\ntaken by PhotoBot are often more aesthetically pleasing than those taken by\nusers themselves, as measured by human feedback. We also show that PhotoBot can\ngeneralize to other reference sources such as paintings.\n","authors":["Oliver Limoyo","Jimmy Li","Dmitriy Rivkin","Jonathan Kelly","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2401.11061v3.pdf","comment":"Accepted to the IEEE/RSJ International Conference on Intelligent\n Robotics and Systems (IROS'24), Abu Dhabi, UAE, Oct 14-18, 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.07114v2","updated":"2024-07-04T20:08:38Z","published":"2024-06-11T09:58:27Z","title":"Unlocking the Potential of Metaverse in Innovative and Immersive Digital\n Health","summary":" The concept of Metaverse has attracted a lot of attention in various fields\nand one of its important applications is health and treatment. The Metaverse\nhas enormous potential to transform healthcare by changing patient care,\nmedical education, and the way teaching/learning and research are done. The\npurpose of this research is to provide an introduction to the basic concepts\nand fundamental technologies of the Metaverse. This paper examines the pros and\ncons of the Metaverse in healthcare context and analyzes its potential from the\ntechnology and AI perspective. In particular, the role of machine learning\nmethods is discussed; We will explain how machine learning algorithms can be\napplied to the Metaverse generated data to gain better insights in healthcare\napplications. Additionally, we examine the future visions of the Metaverse in\nhealth delivery, by examining emerging technologies such as blockchain and also\naddressing privacy concerns. The findings of this study contribute to a deeper\nunderstanding of the applications of Metaverse in healthcare and its potential\nto revolutionize the delivery of medical services.\n","authors":["Fatemeh Ebrahimzadeh","Ramin Safa"],"pdf_url":"https://arxiv.org/pdf/2406.07114v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.17745v3","updated":"2024-07-04T17:52:06Z","published":"2024-06-25T17:31:04Z","title":"Light-weight End-to-End Graph Interest Network for CTR Prediction in\n E-commerce Search","summary":" Click-through-rate (CTR) prediction has an essential impact on improving user\nexperience and revenue in e-commerce search. With the development of deep\nlearning, graph-based methods are well exploited to utilize graph structure\nextracted from user behaviors and other information to help embedding learning.\nHowever, most of the previous graph-based methods mainly focus on\nrecommendation scenarios, and therefore their graph structures highly depend on\nitem's sequential information from user behaviors, ignoring query's sequential\nsignal and query-item correlation. In this paper, we propose a new approach\nnamed Light-weight End-to-End Graph Interest Network (EGIN) to effectively mine\nusers' search interests and tackle previous challenges. (i) EGIN utilizes query\nand item's correlation and sequential information from the search system to\nbuild a heterogeneous graph for better CTR prediction in e-commerce search.\n(ii) EGIN's graph embedding learning shares the same training input and is\njointly trained with CTR prediction, making the end-to-end framework effortless\nto deploy in large-scale search systems. The proposed EGIN is composed of three\nparts: query-item heterogeneous graph, light-weight graph sampling, and\nmulti-interest network. 
The query-item heterogeneous graph captures correlation\nand sequential information of query and item efficiently by the proposed\nlight-weight graph sampling. The multi-interest network is well designed to\nutilize graph embedding to capture various similarity relationships between\nquery and item to enhance the final CTR prediction. We conduct extensive\nexperiments on both public and industrial datasets to demonstrate the\neffectiveness of the proposed EGIN. At the same time, the training cost of\ngraph learning is relatively low compared with the main CTR prediction task,\nensuring efficiency in practical applications.\n","authors":["Pipi Peng","Yunqing Jia","Ziqiang Zhou"," murmurhash","Zichong Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.17745v3.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.04018v1","updated":"2024-07-04T15:50:18Z","published":"2024-07-04T15:50:18Z","title":"Leveraging Topic Specificity and Social Relationships for Expert Finding\n in Community Question Answering Platforms","summary":" Online Community Question Answering (CQA) platforms have become indispensable\ntools for users seeking expert solutions to their technical queries. The\neffectiveness of these platforms relies on their ability to identify and direct\nquestions to the most knowledgeable users within the community, a process known\nas Expert Finding (EF). EF accuracy is crucial for increasing user engagement\nand the reliability of provided answers. Despite recent advancements in EF\nmethodologies, blending the diverse information sources available on CQA\nplatforms for effective expert identification remains challenging. In this\npaper, we present TUEF, a Topic-oriented User-Interaction model for Expert\nFinding, which aims to fully and transparently leverage the heterogeneous\ninformation available within online question-answering communities. TUEF\nintegrates content and social data by constructing a multi-layer graph that\nmaps out user relationships based on their answering patterns on specific\ntopics. By combining these sources of information, TUEF identifies the most\nrelevant and knowledgeable users for any given question and ranks them using\nlearning-to-rank techniques. Our findings indicate that TUEF's topic-oriented\nmodel significantly enhances performance, particularly in large communities\ndiscussing well-defined topics. Additionally, we show that the interpretable\nlearning-to-rank algorithm integrated into TUEF offers transparency and\nexplainability with minimal performance trade-offs. The exhaustive experiments\nconducted on six different CQA communities of Stack Exchange show that TUEF\noutperforms all competitors with a minimum performance boost of 42.42% in P@1,\n32.73% in NDCG@3, 21.76% in R@5, and 29.81% in MRR, excelling in both the\nevaluation approaches present in the previous literature.\n","authors":["Maddalena Amendola","Andrea Passarella","Raffaele Perego"],"pdf_url":"https://arxiv.org/pdf/2407.04018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00579v2","updated":"2024-07-04T15:06:42Z","published":"2024-03-31T06:57:57Z","title":"A Review of Modern Recommender Systems Using Generative Models\n (Gen-RecSys)","summary":" Traditional recommender systems (RS) typically use user-item rating histories\nas their main data source. However, deep generative models now have the\ncapability to model and sample from complex data distributions, including\nuser-item interactions, text, images, and videos, enabling novel recommendation\ntasks. 
This comprehensive, multidisciplinary survey connects key advancements\nin RS using Generative Models (Gen-RecSys), covering: interaction-driven\ngenerative models; the use of large language models (LLM) and textual data for\nnatural language recommendation; and the integration of multimodal models for\ngenerating and processing images/videos in RS. Our work highlights necessary\nparadigms for evaluating the impact and harm of Gen-RecSys and identifies open\nchallenges. This survey accompanies a tutorial presented at ACM KDD'24, with\nsupporting materials provided at: https://encr.pw/vDhLq.\n","authors":["Yashar Deldjoo","Zhankui He","Julian McAuley","Anton Korikov","Scott Sanner","Arnau Ramisa","René Vidal","Maheswaran Sathiamoorthy","Atoosa Kasirzadeh","Silvia Milano"],"pdf_url":"https://arxiv.org/pdf/2404.00579v2.pdf","comment":"This survey accompanies a tutorial presented at ACM KDD'24"},{"id":"http://arxiv.org/abs/2406.17289v2","updated":"2024-07-04T14:54:07Z","published":"2024-06-25T05:35:02Z","title":"Hyperbolic Knowledge Transfer in Cross-Domain Recommendation System","summary":" Cross-Domain Recommendation (CDR) seeks to utilize knowledge from different\ndomains to alleviate the problem of data sparsity in the target recommendation\ndomain, and it has been gaining more attention in recent years. Although there\nhave been notable advancements in this area, most current methods represent\nusers and items in Euclidean space, which is not ideal for handling long-tail\ndistributed data in recommendation systems. Additionally, adding data from\nother domains can worsen the long-tail characteristics of the entire dataset,\nmaking it harder to train CDR models effectively. Recent studies have shown\nthat hyperbolic methods are particularly suitable for modeling long-tail\ndistributions, which has led us to explore hyperbolic representations for users\nand items in CDR scenarios. However, due to the distinct characteristics of the\ndifferent domains, applying hyperbolic representation learning to CDR tasks is\nquite challenging. In this paper, we introduce a new framework called\nHyperbolic Contrastive Learning (HCTS), designed to capture the unique features\nof each domain while enabling efficient knowledge transfer between domains. We\nachieve this by embedding users and items from each domain separately and\nmapping them onto distinct hyperbolic manifolds with adjustable curvatures for\nprediction. To improve the representations of users and items in the target\ndomain, we develop a hyperbolic contrastive learning module for knowledge\ntransfer. Extensive experiments on real-world datasets demonstrate that\nhyperbolic manifolds are a promising alternative to Euclidean space for CDR\ntasks.\n","authors":["Xin Yang","Heng Chang","Zhijian Lai","Jinze Yang","Xingrun Li","Yu Lu","Shuaiqiang Wang","Dawei Yin","Erxue Min"],"pdf_url":"https://arxiv.org/pdf/2406.17289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10545v3","updated":"2024-07-04T12:59:01Z","published":"2024-01-19T08:09:20Z","title":"Understanding Biases in ChatGPT-based Recommender Systems: Provider\n Fairness, Temporal Stability, and Recency","summary":" This paper explores the biases in ChatGPT-based recommender systems, focusing\non provider fairness (item-side fairness). 
Through extensive experiments and\nover a thousand API calls, we investigate the impact of prompt design\nstrategies-including structure, system role, and intent-on evaluation metrics\nsuch as provider fairness, catalog coverage, temporal stability, and recency.\nThe first experiment examines these strategies in classical top-K\nrecommendations, while the second evaluates sequential in-context learning\n(ICL).\n In the first experiment, we assess seven distinct prompt scenarios on top-K\nrecommendation accuracy and fairness. Accuracy-oriented prompts, like Simple\nand Chain-of-Thought (COT), outperform diversification prompts, which, despite\nenhancing temporal freshness, reduce accuracy by up to 50%. Embedding fairness\ninto system roles, such as \"act as a fair recommender,\" proved more effective\nthan fairness directives within prompts. Diversification prompts led to\nrecommending newer movies, offering broader genre distribution compared to\ntraditional collaborative filtering (CF) models.\n The second experiment explores sequential ICL, comparing zero-shot and\nfew-shot ICL. Results indicate that including user demographic information in\nprompts affects model biases and stereotypes. However, ICL did not consistently\nimprove item fairness and catalog coverage over zero-shot learning. Zero-shot\nlearning achieved higher NDCG and coverage, while ICL-2 showed slight\nimprovements in hit rate (HR) when age-group context was included. Our study\nprovides insights into biases of RecLLMs, particularly in provider fairness and\ncatalog coverage. By examining prompt design, learning strategies, and system\nroles, we highlight the potential and challenges of integrating LLMs into\nrecommendation systems. Further details can be found at\nhttps://github.com/yasdel/Benchmark_RecLLM_Fairness.\n","authors":["Yashar Deldjoo"],"pdf_url":"https://arxiv.org/pdf/2401.10545v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04413v2","updated":"2024-07-04T10:35:49Z","published":"2023-01-11T11:41:38Z","title":"CoSPLADE: Contextualizing SPLADE for Conversational Information\n Retrieval","summary":" Conversational search is a difficult task as it aims at retrieving documents\nbased not only on the current user query but also on the full conversation\nhistory. Most of the previous methods have focused on a multi-stage ranking\napproach relying on query reformulation, a critical intermediate step that\nmight lead to a sub-optimal retrieval. Other approaches have tried to use a\nfully neural IR first-stage, but are either zero-shot or rely on full\nlearning-to-rank based on a dataset with pseudo-labels. In this work,\nleveraging the CANARD dataset, we propose an innovative lightweight learning\ntechnique to train a first-stage ranker based on SPLADE. 
By relying on SPLADE\nsparse representations, we show that, when combined with a second-stage ranker\nbased on T5Mono, the results are competitive on the TREC CAsT 2020 and 2021\ntracks.\n","authors":["Nam Le Hai","Thomas Gerald","Thibault Formal","Jian-Yun Nie","Benjamin Piwowarski","Laure Soulier"],"pdf_url":"https://arxiv.org/pdf/2301.04413v2.pdf","comment":"Accepted at ECIR 2023"},{"id":"http://arxiv.org/abs/2405.09334v2","updated":"2024-07-04T09:00:32Z","published":"2024-05-15T13:34:07Z","title":"Content-Based Image Retrieval for Multi-Class Volumetric Radiology\n Images: A Benchmark Study","summary":" While content-based image retrieval (CBIR) has been extensively studied in\nnatural image retrieval, its application to medical images presents ongoing\nchallenges, primarily due to the 3D nature of medical images. Recent studies\nhave shown the potential use of pre-trained vision embeddings for CBIR in the\ncontext of radiology image retrieval. However, a benchmark for the retrieval of\n3D volumetric medical images is still lacking, hindering the ability to\nobjectively evaluate and compare the efficiency of proposed CBIR approaches in\nmedical imaging. In this study, we extend previous work and establish a\nbenchmark for region-based and localized multi-organ retrieval using the\nTotalSegmentator dataset (TS) with detailed multi-organ annotations. We\nbenchmark embeddings derived from pre-trained supervised models on medical\nimages against embeddings derived from pre-trained unsupervised models on\nnon-medical images for 29 coarse and 104 detailed anatomical structures in\nvolume and region levels. For volumetric image retrieval, we adopt a late\ninteraction re-ranking method inspired by text matching. We compare it against\nthe original method proposed for volume and region retrieval and achieve a\nretrieval recall of 1.0 for diverse anatomical regions with a wide size range.\nThe findings and methodologies presented in this paper provide insights and\nbenchmarks for further development and evaluation of CBIR approaches in the\ncontext of medical imaging.\n","authors":["Farnaz Khun Jush","Steffen Vogler","Tuan Truong","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2405.09334v2.pdf","comment":"34 pages, 12 Figures, 22 Tables"},{"id":"http://arxiv.org/abs/2407.03720v1","updated":"2024-07-04T08:08:33Z","published":"2024-07-04T08:08:33Z","title":"Query-oriented Data Augmentation for Session Search","summary":" Modeling contextual information in a search session has drawn more and more\nattention when understanding complex user intents. Recent methods are all\ndata-driven, i.e., they train different models on large-scale search log data\nto identify the relevance between search contexts and candidate documents. The\ncommon training paradigm is to pair the search context with different candidate\ndocuments and train the model to rank the clicked documents higher than the\nunclicked ones. However, this paradigm neglects the symmetric nature of the\nrelevance between the session context and document, i.e., the clicked documents\ncan also be paired with different search contexts when training. In this work,\nwe propose query-oriented data augmentation to enrich search logs and empower\nthe modeling. We generate supplemental training pairs by altering the most\nimportant part of a search context, i.e., the current query, and train our\nmodel to rank the generated sequence along with the original sequence. 
This\napproach enables models to learn that the relevance of a document may vary as\nthe session context changes, leading to a better understanding of users' search\npatterns. We develop several strategies to alter the current query, resulting\nin new training data with varying degrees of difficulty. Through\nexperimentation on two extensive public search logs, we have successfully\ndemonstrated the effectiveness of our model.\n","authors":["Haonan Chen","Zhicheng Dou","Yutao Zhu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2407.03720v1.pdf","comment":"TKDE 2024"},{"id":"http://arxiv.org/abs/2407.03665v1","updated":"2024-07-04T06:09:11Z","published":"2024-07-04T06:09:11Z","title":"Heterogeneous Hypergraph Embedding for Recommendation Systems","summary":" Recent advancements in recommender systems have focused on integrating\nknowledge graphs (KGs) to leverage their auxiliary information. The core idea\nof KG-enhanced recommenders is to incorporate rich semantic information for\nmore accurate recommendations. However, two main challenges persist: i)\nNeglecting complex higher-order interactions in the KG-based user-item network,\npotentially leading to sub-optimal recommendations, and ii) Dealing with the\nheterogeneous modalities of input sources, such as user-item bipartite graphs\nand KGs, which may introduce noise and inaccuracies. To address these issues,\nwe present a novel Knowledge-enhanced Heterogeneous Hypergraph Recommender\nSystem (KHGRec). KHGRec captures group-wise characteristics of both the\ninteraction network and the KG, modeling complex connections in the KG. Using a\ncollaborative knowledge heterogeneous hypergraph (CKHG), it employs two\nhypergraph encoders to model group-wise interdependencies and ensure\nexplainability. Additionally, it fuses signals from the input graphs with\ncross-view self-supervised learning and attention mechanisms. Extensive\nexperiments on four real-world datasets show our model's superiority over\nvarious state-of-the-art baselines, with an average 5.18\\% relative\nimprovement. Additional tests on noise resilience, missing data, and cold-start\nproblems demonstrate the robustness of our KHGRec framework. Our model and\nevaluation datasets are publicly available at\n\\url{https://github.com/viethungvu1998/KHGRec}.\n","authors":["Darnbi Sakong","Viet Hung Vu","Thanh Trung Huynh","Phi Le Nguyen","Hongzhi Yin","Quoc Viet Hung Nguyen","Thanh Tam Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.03665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03650v1","updated":"2024-07-04T05:45:56Z","published":"2024-07-04T05:45:56Z","title":"Reviewers of Educational Immersive and Extended Reality (XR)\n experiences: Who is creating these reviews and why?","summary":" This paper presents a scoping review of literature to examine who is\nreviewing educational immersive or extended reality - eduXR experiences and\nwhy. EduXR experiences in augmented, virtual or mixed reality take many forms,\nfrom supporting manual training, engaging learners in conservation, to provide\nopportunities for social connection. For users of eduXR, reviews of an\nexperience can provide information that helps them determine whether it will\nmeet their learning needs or not. The source of the review, that is, who they\nare and why they have conducted the review, is critical in helping the user\njudge the reviews quality and relevance. 
At present, there is no settled review\nsystem in place for eduXR, though relevant frameworks exist for serious games\nreview with relevance and overlap for some, but not all, eduXR experiences.\nWhile some authors have engaged in preparing a detailed review structure for\neduXR, there remains a need for a clear and simple way for users of eduXR to\nknow details about reviewers, e.g., who and why, to help make it easier for\nusers to identify relevant reviews and gain useful insight about eduXR\nexperiences. To help address this issue, we conducted a scoping review asking\nthe question; Who is creating eduXR reviews, and why? We identified 16 papers\nthat present an academic evaluation on the review process of eduXR reviews. The\n16 papers were analysed, coding for who themes and why themes over two separate\ncycles, using thematic analysis. An analysis looked to examine what we know\nregarding who is providing the reviews, and why, to help us to understand what\nenables, inhibits and what is yet unknown about how the eduXR community goes\nabout making informed choices regarding the eduXR experiences they engage with.\n","authors":["Sophie McKenzie","Shaun Bangay","Maria Nicholas","Adam Cardilini","Majeet Singh"],"pdf_url":"https://arxiv.org/pdf/2407.03650v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2407.00317v2","updated":"2024-07-04T05:11:10Z","published":"2024-06-29T04:48:39Z","title":"Towards Statistically Significant Taxonomy Aware Co-location Pattern\n Detection","summary":" Given a collection of Boolean spatial feature types, their instances, a\nneighborhood relation (e.g., proximity), and a hierarchical taxonomy of the\nfeature types, the goal is to find the subsets of feature types or their\nparents whose spatial interaction is statistically significant. This problem is\nfor taxonomy-reliant applications such as ecology (e.g., finding new symbiotic\nrelationships across the food chain), spatial pathology (e.g., immunotherapy\nfor cancer), retail, etc. The problem is computationally challenging due to the\nexponential number of candidate co-location patterns generated by the taxonomy.\nMost approaches for co-location pattern detection overlook the hierarchical\nrelationships among spatial features, and the statistical significance of the\ndetected patterns is not always considered, leading to potential false\ndiscoveries. This paper introduces two methods for incorporating taxonomies and\nassessing the statistical significance of co-location patterns. The baseline\napproach iteratively checks the significance of co-locations between leaf nodes\nor their ancestors in the taxonomy. Using the Benjamini-Hochberg procedure, an\nadvanced approach is proposed to control the false discovery rate. This\napproach effectively reduces the risk of false discoveries while maintaining\nthe power to detect true co-location patterns. Experimental evaluation and case\nstudy results show the effectiveness of the approach.\n","authors":["Subhankar Ghosh","Arun Sharma","Jayant Gupta","Shashi Shekhar"],"pdf_url":"https://arxiv.org/pdf/2407.00317v2.pdf","comment":"Accepted in The 16th Conference on Spatial Information Theory (COSIT)\n 2024"},{"id":"http://arxiv.org/abs/2407.03618v1","updated":"2024-07-04T04:01:05Z","published":"2024-07-04T04:01:05Z","title":"BM25S: Orders of magnitude faster lexical search via eager sparse\n scoring","summary":" We introduce BM25S, an efficient Python-based implementation of BM25 that\nonly depends on Numpy and Scipy. 
BM25S achieves up to a 500x speedup compared\nto the most popular Python-based framework by eagerly computing BM25 scores\nduring indexing and storing them into sparse matrices. It also achieves\nconsiderable speedups compared to highly optimized Java-based implementations,\nwhich are used by popular commercial products. Finally, BM25S reproduces the\nexact implementation of five BM25 variants based on Kamphuis et al. (2020) by\nextending eager scoring to non-sparse variants using a novel score shifting\nmethod. The code can be found at https://github.com/xhluca/bm25s\n","authors":["Xing Han Lù"],"pdf_url":"https://arxiv.org/pdf/2407.03618v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2407.03580v1","updated":"2024-07-04T02:19:49Z","published":"2024-07-04T02:19:49Z","title":"Deep Pareto Reinforcement Learning for Multi-Objective Recommender\n System","summary":" Optimizing multiple objectives simultaneously is an important task in\nrecommendation platforms to improve their performance on different fronts.\nHowever, this task is particularly challenging since the relationships between\ndifferent objectives are heterogeneous across different consumers and\ndynamically fluctuating according to different contexts. Especially in those\ncases when objectives become conflicting with each other, the result of\nrecommendations will form a pareto-frontier, where the improvements on any\nobjective comes at the cost of a performance decrease in another objective.\nUnfortunately, existing multi-objective recommender systems do not\nsystematically consider such relationships; instead, they balance between these\nobjectives in a static and uniform manner, resulting in performance that is\nsignificantly worse than the pareto-optimality. In this paper, we propose a\nDeep Pareto Reinforcement Learning (DeepPRL) approach, where we (1)\ncomprehensively model the complex relationships between multiple objectives in\nrecommendations; (2) effectively capture the personalized and contextual\nconsumer preference towards each objective and update the recommendations\ncorrespondingly; (3) optimize both the short-term and the long-term performance\nof multi-objective recommendations. As a result, our method achieves\nsignificant pareto-dominance over state-of-the-art baselines in extensive\noffline experiments conducted on three real-world datasets. Furthermore, we\nconduct a large-scale online controlled experiment at the video streaming\nplatform of Alibaba, where our method simultaneously improves the three\nconflicting objectives of Click-Through Rate, Video View, and Dwell Time by 2%,\n5%, and 7% respectively over the latest production system, demonstrating its\ntangible economic impact in industrial applications.\n","authors":["Pan Li","Alexander Tuzhilin"],"pdf_url":"https://arxiv.org/pdf/2407.03580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03568v1","updated":"2024-07-04T01:43:52Z","published":"2024-07-04T01:43:52Z","title":"When LLM Meets Hypergraph: A Sociological Analysis on Personality via\n Online Social Networks","summary":" Individual personalities significantly influence our perceptions, decisions,\nand social interactions, which is particularly crucial for gaining insights\ninto human behavior patterns in online social network analysis. Many\npsychological studies have observed that personalities are strongly reflected\nin their social behaviors and social environments. 
In light of these problems,\nthis paper proposes a sociological analysis framework for one's personality in\nan environment-based view instead of individual-level data mining.\nSpecifically, to comprehensively understand an individual's behavior from\nlow-quality records, we leverage the powerful associative ability of LLMs by\ndesigning an effective prompt. In this way, LLMs can integrate various\nscattered information with their external knowledge to generate higher-quality\nprofiles, which can significantly improve the personality analysis performance.\nTo explore the interactive mechanism behind the users and their online\nenvironments, we design an effective hypergraph neural network where the\nhypergraph nodes are users and the hyperedges in the hypergraph are social\nenvironments. We offer a useful dataset with user profile data, personality\ntraits, and several detected environments from the real-world social platform.\nTo the best of our knowledge, this is the first network-based dataset\ncontaining both hypergraph structure and social information, which could push\nforward future research in this area further. By employing the framework on\nthis dataset, we can effectively capture the nuances of individual\npersonalities and their online behaviors, leading to a deeper understanding of\nhuman interactions in the digital world.\n","authors":["Zhiyao Shu","Xiangguo Sun","Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.03568v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.19859v2","updated":"2024-07-04T15:47:40Z","published":"2024-06-28T11:58:26Z","title":"MetaDesigner: Advancing Artistic Typography through AI-Driven,\n User-Centric, and Multilingual WordArt Synthesis","summary":" MetaDesigner revolutionizes artistic typography synthesis by leveraging the\nstrengths of Large Language Models (LLMs) to drive a design paradigm centered\naround user engagement. At the core of this framework lies a multi-agent system\ncomprising the Pipeline, Glyph, and Texture agents, which collectively enable\nthe creation of customized WordArt, ranging from semantic enhancements to the\nimposition of complex textures. MetaDesigner incorporates a comprehensive\nfeedback mechanism that harnesses insights from multimodal models and user\nevaluations to refine and enhance the design process iteratively. Through this\nfeedback loop, the system adeptly tunes hyperparameters to align with\nuser-defined stylistic and thematic preferences, generating WordArt that not\nonly meets but exceeds user expectations of visual appeal and contextual\nrelevance. Empirical validations highlight MetaDesigner's capability to\neffectively serve diverse WordArt applications, consistently producing\naesthetically appealing and context-sensitive results.\n","authors":["Jun-Yan He","Zhi-Qi Cheng","Chenyang Li","Jingdong Sun","Qi He","Wangmeng Xiang","Hanyuan Chen","Jin-Peng Lan","Xianhui Lin","Kang Zhu","Bin Luo","Yifeng Geng","Xuansong Xie","Alexander G. Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2406.19859v2.pdf","comment":"18 pages, 16 figures, Project:\n https://modelscope.cn/studios/WordArt/WordArt"},{"id":"http://arxiv.org/abs/2407.03860v1","updated":"2024-07-04T11:44:31Z","published":"2024-07-04T11:44:31Z","title":"Computational Thinking through Design Patterns in Video Games","summary":" Prior research has explored potential applications of video games in\nprogramming education to elicit computational thinking skills. 
However,\nexisting approaches are often either too general, not taking into account the\ndiversity of genres and mechanisms between video games, or too narrow,\nselecting tools that were specifically designed for educational purposes. In\nthis paper we propose a more fundamental approach, defining beneficial\nconnections between individual design patterns present in video games and\ncomputational thinking skills. We argue that video games have the capacity to\nelicit these skills and even to potentially train them. This could be an\neffective method to solidify a conceptual base which would make programming\neducation more effective.\n","authors":["Giulio Barbero","Marcello A. Gómez-Maureira","Felienne F. J. Hermans"],"pdf_url":"https://arxiv.org/pdf/2407.03860v1.pdf","comment":"draft"},{"id":"http://arxiv.org/abs/2407.03736v1","updated":"2024-07-04T08:37:47Z","published":"2024-07-04T08:37:47Z","title":"Semantic Grouping Network for Audio Source Separation","summary":" Recently, audio-visual separation approaches have taken advantage of the\nnatural synchronization between the two modalities to boost audio source\nseparation performance. They extracted high-level semantics from visual inputs\nas the guidance to help disentangle sound representation for individual\nsources. Can we directly learn to disentangle the individual semantics from the\nsound itself? The dilemma is that multiple sound sources are mixed together in\nthe original space. To tackle the difficulty, in this paper, we present a novel\nSemantic Grouping Network, termed as SGN, that can directly disentangle sound\nrepresentations and extract high-level semantic information for each source\nfrom input audio mixture. Specifically, SGN aggregates category-wise source\nfeatures through learnable class tokens of sounds. Then, the aggregated\nsemantic features can be used as the guidance to separate the corresponding\naudio sources from the mixture. We conducted extensive experiments on\nmusic-only and universal sound separation benchmarks: MUSIC, FUSS, MUSDB18, and\nVGG-Sound. The results demonstrate that our SGN significantly outperforms\nprevious audio-only methods and audio-visual models without utilizing\nadditional visual cues.\n","authors":["Shentong Mo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2407.03736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06217v2","updated":"2024-07-04T05:16:47Z","published":"2023-11-10T18:13:08Z","title":"MultiIoT: Benchmarking Machine Learning for the Internet of Things","summary":" The next generation of machine learning systems must be adept at perceiving\nand interacting with the physical world through a diverse array of sensory\nchannels. Commonly referred to as the `Internet of Things (IoT)' ecosystem,\nsensory data from motion, thermal, geolocation, depth, wireless signals, video,\nand audio are increasingly used to model the states of physical environments\nand the humans inside them. Despite the potential for understanding human\nwellbeing, controlling physical devices, and interconnecting smart cities, the\ncommunity has seen limited benchmarks for building machine learning systems for\nIoT. Existing efforts are often specialized to a single sensory modality or\nprediction task, which makes it difficult to study and train large-scale models\nacross many IoT sensors and tasks. 
To accelerate the development of new machine\nlearning technologies for IoT, this paper proposes MultiIoT, the most expansive\nand unified IoT benchmark to date, encompassing over 1.15 million samples from\n12 modalities and 8 real-world tasks. MultiIoT introduces unique challenges\ninvolving (1) generalizable learning from many sensory modalities, (2)\nmultimodal interactions across long temporal ranges, (3) extreme heterogeneity\ndue to unique structure and noise topologies in real-world sensors, and (4)\ncomplexity during training and inference. We evaluate a comprehensive set of\nmodels on MultiIoT, including modality and task-specific methods, multisensory\nand multitask supervised models, and large multisensory foundation models. Our\nresults highlight opportunities for ML to make a significant impact in IoT, but\nmany challenges in scalable learning from heterogeneous, long-range, and\nimperfect sensory modalities still persist. We release all code and data to\naccelerate future research in machine learning for IoT.\n","authors":["Shentong Mo","Louis-Philippe Morency","Russ Salakhutdinov","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2311.06217v2.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + 
--color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 
auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..5cad2446 --- /dev/null +++ b/index.html @@ -0,0 +1,66440 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 64 + +
+
+
+ + ☆ Me, Myself, and AI: The Situational Awareness Dataset (SAD) for LLMs + + +
+ AI assistants such as ChatGPT are trained to respond to users by saying, "I +am a large language model". This raises questions. Do such models know that +they are LLMs and reliably act on this knowledge? Are they aware of their +current circumstances, such as being deployed to the public? We refer to a +model's knowledge of itself and its circumstances as situational awareness. To +quantify situational awareness in LLMs, we introduce a range of behavioral +tests, based on question answering and instruction following. These tests form +the $\textbf{Situational Awareness Dataset (SAD)}$, a benchmark comprising 7 +task categories and over 13,000 questions. The benchmark tests numerous +abilities, including the capacity of LLMs to (i) recognize their own generated +text, (ii) predict their own behavior, (iii) determine whether a prompt is from +internal evaluation or real-world deployment, and (iv) follow instructions that +depend on self-knowledge. + We evaluate 16 LLMs on SAD, including both base (pretrained) and chat models. +While all models perform better than chance, even the highest-scoring model +(Claude 3 Opus) is far from a human baseline on certain tasks. We also observe +that performance on SAD is only partially predicted by metrics of general +knowledge (e.g. MMLU). Chat models, which are finetuned to serve as AI +assistants, outperform their corresponding base models on SAD but not on +general knowledge tasks. The purpose of SAD is to facilitate scientific +understanding of situational awareness in LLMs by breaking it down into +quantitative abilities. Situational awareness is important because it enhances +a model's capacity for autonomous planning and action. While this has potential +benefits for automation, it also introduces novel risks related to AI safety +and control. Code and latest results available at +https://situational-awareness-dataset.org . + +
+
+ comment: 11 page main body, 98 page appendix, 58 figures +
+
+
+
+
+ + ☆ ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language + Models + + +
+ Large language models (LLMs) exhibit hallucinations in long-form +question-answering tasks across various domains and wide applications. Current +hallucination detection and mitigation datasets are limited in domains and +sizes, which struggle to scale due to prohibitive labor costs and insufficient +reliability of existing hallucination annotators. To facilitate the scalable +oversight of LLM hallucinations, this paper introduces an iterative +self-training framework that simultaneously and progressively scales up the +hallucination annotation dataset and improves the accuracy of the hallucination +annotator. Based on the Expectation Maximization (EM) algorithm, in each +iteration, the framework first applies a hallucination annotation pipeline to +annotate a scaled dataset and then trains a more accurate hallucination +annotator on the dataset. This new hallucination annotator is adopted in the +hallucination annotation pipeline used for the next iteration. Extensive +experimental results demonstrate that the finally obtained hallucination +annotator with only 7B parameters surpasses the performance of GPT-4 and +obtains new state-of-the-art hallucination detection results on HaluEval and +HalluQA by zero-shot inference. Such an annotator can not only evaluate the +hallucination levels of various LLMs on the large-scale dataset but also help +to mitigate the hallucination of LLMs generations, with the Natural Language +Inference (NLI) metric increasing from 25% to 37% on HaluEval. + +
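The loop described in this abstract (annotate a larger dataset, then retrain the annotator on it) can be pictured with a short sketch. This is an illustration only, not the paper's EM-based pipeline; sample_batch_fn, annotate_fn, and train_fn are assumed stand-ins for the annotation pipeline, annotator training, and data-scaling steps.

# Hedged sketch of iterative self-training for a hallucination annotator.
# All three callables are assumptions standing in for the paper's components.
def self_train_annotator(annotator, sample_batch_fn, annotate_fn, train_fn, iterations=3):
    dataset = []
    for _ in range(iterations):
        batch = sample_batch_fn(len(dataset) + 1000)                 # progressively scale the data
        dataset += [(x, annotate_fn(annotator, x)) for x in batch]   # pseudo-label with current annotator
        annotator = train_fn(dataset)                                # retrain a more accurate annotator
    return annotator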
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Missed Causes and Ambiguous Effects: Counterfactuals Pose Challenges for + Interpreting Neural Networks + + +
+ Interpretability research takes counterfactual theories of causality for +granted. Most causal methods rely on counterfactual interventions to inputs or +the activations of particular model components, followed by observations of the +change in models' output logits or behaviors. While this yields more faithful +evidence than correlational methods, counterfactuals nonetheless have key +problems that bias our findings in specific and predictable ways. Specifically, +(i) counterfactual theories do not effectively capture multiple independently +sufficient causes of the same effect, which leads us to miss certain causes +entirely; and (ii) counterfactual dependencies in neural networks are generally +not transitive, which complicates methods for extracting and interpreting +causal graphs from neural networks. We discuss the implications of these +challenges for interpretability researchers and propose concrete suggestions +for future work. + +
+
+
+
+
+ + ☆ Rethinking Visual Prompting for Multimodal Large Language Models with + External Knowledge + + +
+ In recent years, multimodal large language models (MLLMs) have made +significant strides by training on vast high-quality image-text datasets, +enabling them to generally understand images well. However, the inherent +difficulty in explicitly conveying fine-grained or spatially dense information +in text, such as masks, poses a challenge for MLLMs, limiting their ability to +answer questions requiring an understanding of detailed or localized visual +elements. Drawing inspiration from the Retrieval-Augmented Generation (RAG) +concept, this paper proposes a new visual prompt approach to integrate +fine-grained external knowledge, gleaned from specialized vision models (e.g., +instance segmentation/OCR models), into MLLMs. This is a promising yet +underexplored direction for enhancing MLLMs' performance. Our approach diverges +from concurrent works, which transform external knowledge into additional text +prompts, necessitating the model to indirectly learn the correspondence between +visual content and text coordinates. Instead, we propose embedding fine-grained +knowledge information directly into a spatial embedding map as a visual prompt. +This design can be effortlessly incorporated into various MLLMs, such as LLaVA +and Mipha, considerably improving their visual understanding performance. +Through rigorous experiments, we demonstrate that our method can enhance MLLM +performance across nine benchmarks, amplifying their fine-grained context-aware +capabilities. + +
+
+
+
+
+ + ☆ Lost in Translation: The Algorithmic Gap Between LMs and the Brain + + +
+ Language Models (LMs) have achieved impressive performance on various +linguistic tasks, but their relationship to human language processing in the +brain remains unclear. This paper examines the gaps and overlaps between LMs +and the brain at different levels of analysis, emphasizing the importance of +looking beyond input-output behavior to examine and compare the internal +processes of these systems. We discuss how insights from neuroscience, such as +sparsity, modularity, internal states, and interactive learning, can inform the +development of more biologically plausible language models. Furthermore, we +explore the role of scaling laws in bridging the gap between LMs and human +cognition, highlighting the need for efficiency constraints analogous to those +in biological systems. By developing LMs that more closely mimic brain +function, we aim to advance both artificial intelligence and our understanding +of human cognition. + +
+
+
+
+
+ + ☆ Pretraining End-to-End Keyword Search with Automatically Discovered + Acoustic Units + + +
+ End-to-end (E2E) keyword search (KWS) has emerged as an alternative and +complementary approach to conventional keyword search, which depends on the +output of automatic speech recognition (ASR) systems. While E2E methods greatly +simplify the KWS pipeline, they generally have worse performance than their +ASR-based counterparts, which can benefit from pretraining with untranscribed +data. In this work, we propose a method for pretraining E2E KWS systems with +untranscribed data, which involves using acoustic unit discovery (AUD) to +obtain discrete units for untranscribed data and then learning to locate +sequences of such units in the speech. We conduct experiments across languages +and AUD systems: we show that finetuning such a model significantly outperforms +a model trained from scratch, and the performance improvements are generally +correlated with the quality of the AUD system used for pretraining. + +
+
+ comment: Interspeech 2024. KWS code at: + https://github.com/bolajiy/golden-retriever; AUD code at + https://github.com/beer-asr/beer/tree/master/recipes/hshmm +
+
+
+
+
+ + ☆ Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of + Language Models + + +
+ This paper explores speculative speech recognition (SSR), where we empower +conventional automatic speech recognition (ASR) with speculation capabilities, +allowing the recognizer to run ahead of audio. We introduce a metric for +measuring SSR performance and we propose a model which performs SSR by combining an +RNN-Transducer-based ASR system with an audio-prefixed language model (LM). The +ASR system transcribes ongoing audio and feeds the resulting transcripts, along +with an audio-dependent prefix, to the LM, which speculates likely completions +for the transcriptions. We experiment with a variety of ASR datasets, on which we +show the efficacy of our method and the feasibility of SSR as a method of reducing +ASR latency. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ☆ Entity Decomposition with Filtering: A Zero-Shot Clinical Named Entity + Recognition Framework + + +
+ Clinical named entity recognition (NER) aims to retrieve important entities +within clinical narratives. Recent works have demonstrated that large language +models (LLMs) can achieve strong performance in this task. While previous works +focus on proprietary LLMs, we investigate how open NER LLMs, trained +specifically for entity recognition, perform in clinical NER. In this paper, we +aim to improve them through a novel framework, entity decomposition with +filtering, or EDF. Our key idea is to decompose the entity recognition task +into several retrievals of sub-entity types. We also introduce a filtering +mechanism to remove incorrect entities. Our experimental results demonstrate +the efficacy of our framework across all metrics, models, datasets, and entity +types. Our analysis reveals that entity decomposition can recognize previously +missed entities with substantial improvement. We further provide a +comprehensive evaluation of our framework and an in-depth error analysis to +pave future works. + +
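As a rough illustration of the decompose-then-filter idea sketched in this abstract, the following hypothetical helper splits one coarse entity type into several sub-type retrievals and filters the union. The sub-type inventory (sub_types), recognizer (recognize_fn), and filter (keep_fn) are assumptions for illustration, not the EDF prompts or filtering rules from the paper.

# Illustrative-only sketch of "entity decomposition with filtering".
def edf_recognize(text, coarse_type, sub_types, recognize_fn, keep_fn):
    candidates = set()
    for sub_type in sub_types[coarse_type]:            # decomposition: one retrieval per sub-entity type
        candidates.update(recognize_fn(text, sub_type))
    return {e for e in candidates if keep_fn(e, coarse_type)}  # filtering: drop incorrect entities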
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Learning to (Learn at Test Time): RNNs with Expressive Hidden States + + +
+ Self-attention performs well in long context but has quadratic complexity. +Existing RNN layers have linear complexity, but their performance in long +context is limited by the expressive power of their hidden state. We propose a +new class of sequence modeling layers with linear complexity and an expressive +hidden state. The key idea is to make the hidden state a machine learning model +itself, and the update rule a step of self-supervised learning. Since the +hidden state is updated by training even on test sequences, our layers are +called Test-Time Training (TTT) layers. We consider two instantiations: +TTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer +MLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B +parameters, comparing with a strong Transformer and Mamba, a modern RNN. Both +TTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer, +they can keep reducing perplexity by conditioning on more tokens, while Mamba +cannot after 16k context. With preliminary systems optimization, TTT-Linear is +already faster than Transformer at 8k context and matches Mamba in wall-clock +time. TTT-MLP still faces challenges in memory I/O, but shows larger potential +in long context, pointing to a promising direction for future research. + +
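A minimal numerical sketch of the idea in this abstract (the hidden state is itself a small model, updated by one self-supervised step per token, even at test time) might look as follows; it omits the learned projections, mini-batched updates, and other details of the actual TTT-Linear layer described in the paper.

import numpy as np

# Toy test-time-training linear layer: the "state" W is a linear model trained
# online on a reconstruction loss 0.5 * ||W x - x||^2 as tokens arrive.
def ttt_linear(xs, dim, lr=0.1):
    W = np.zeros((dim, dim))
    outputs = []
    for x in xs:                        # xs: sequence of (dim,) token features
        pred = W @ x
        grad = np.outer(pred - x, x)    # gradient of the reconstruction loss w.r.t. W
        W = W - lr * grad               # one "training" step per token
        outputs.append(W @ x)           # layer output after the update
    return np.stack(outputs)

# e.g. ttt_linear(np.random.randn(16, 8), dim=8) returns a (16, 8) output sequence.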
+
+
+
+
+ + ☆ ARM: Efficient Guided Decoding with Autoregressive Reward Models + + +
+ Language models trained on large amounts of data require careful tuning to be +safely deployed in the real world. We revisit the guided decoding paradigm, where +the goal is to augment the logits of the base language model using the scores +from a task-specific reward model. We propose a simple but efficient +parameterization of the autoregressive reward model, enabling fast and effective +guided decoding. On detoxification and sentiment control tasks, we show that +our efficient parameterization performs on par with RAD, a strong but less +efficient guided decoding approach. + +
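Guided decoding as summarized here reduces to shifting the base model's next-token logits by the reward model's scores before sampling. A hedged sketch follows; the actual ARM parameterization and weighting scheme are not specified here, so beta is an assumed hyperparameter and the two input tensors stand in for whatever models produce them.

import torch

# Minimal sketch of reward-guided next-token sampling.
def guided_next_token(base_logits: torch.Tensor, reward_scores: torch.Tensor, beta: float = 1.0):
    # base_logits, reward_scores: (batch, vocab_size) scores for the next position
    guided_logits = base_logits + beta * reward_scores     # augment base logits with reward scores
    return torch.distributions.Categorical(logits=guided_logits).sample()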
+
+
+
+
+ + ☆ Written Term Detection Improves Spoken Term Detection + + +
+ End-to-end (E2E) approaches to keyword search (KWS) are considerably simpler +in terms of training and indexing complexity when compared to approaches which +use the output of automatic speech recognition (ASR) systems. This +simplification however has drawbacks due to the loss of modularity. In +particular, where ASR-based KWS systems can benefit from external unpaired text +via a language model, current formulations of E2E KWS systems have no such +mechanism. Therefore, in this paper, we propose a multitask training objective +which allows unpaired text to be integrated into E2E KWS without complicating +indexing and search. In addition to training an E2E KWS model to retrieve text +queries from spoken documents, we jointly train it to retrieve text queries +from masked written documents. We show empirically that this approach can +effectively leverage unpaired text for KWS, with significant improvements in +search performance across a wide variety of languages. We conduct analysis +which indicates that these improvements are achieved because the proposed +method improves document representations for words in the unpaired text. +Finally, we show that the proposed method can be used for domain adaptation in +settings where in-domain paired data is scarce or nonexistent. + +
+
+ comment: IEEE/ACM Transactions on Audio, Speech and Language Processing + (TASLP), 2024. Code at https://github.com/bolajiy/golden-retriever +
+
+
+
+
+ + ☆ Testing learning hypotheses using neural networks by manipulating + learning data + + +
+ Although passivization is productive in English, it is not completely general +-- some exceptions exist (e.g. *One hour was lasted by the meeting). How do +English speakers learn these exceptions to an otherwise general pattern? Using +neural network language models as theories of acquisition, we explore the +sources of indirect evidence that a learner can leverage to learn whether a +verb can passivize. We first characterize English speakers' judgments of +exceptions to the passive, confirming that speakers find some verbs more +passivizable than others. We then show that a neural network language model can +learn restrictions to the passive that are similar to those displayed by +humans, suggesting that evidence for these exceptions is available in the +linguistic input. We test the causal role of two hypotheses for how the +language model learns these restrictions by training models on modified +training corpora, which we create by altering the existing training corpora to +remove features of the input implicated by each hypothesis. We find that while +the frequency with which a verb appears in the passive significantly affects +its passivizability, the semantics of the verb does not. This study highlights +the utility of altering a language model's training data for answering +questions where complete control over a learner's input is vital. + +</p>
+
+ comment: Submitted to Journal of Memory and Language +
+
+
+
+
+ + ☆ VRSD: Rethinking Similarity and Diversity for Retrieval in Large + Language Models + + +
+ Vector retrieval algorithms are vital for semantic queries in the evolving +landscape of Large Language Models (LLMs). Retrieving vectors that +simultaneously meet criteria for both similarity and diversity significantly +enhances the capabilities of LLM-based agents. Despite the widespread use of +the Maximal Marginal Relevance (MMR) in retrieval scenarios with relevance and +diversity requirements, fluctuations caused by variations in the parameter $ +\lambda $ within the MMR complicate the determination of the optimization +trajectory in vector spaces, thus obscuring the direction of enhancement. +Moreover, there is a lack of a robust theoretical analysis for the constraints +of similarity and diversity in retrieval processes. This paper introduces a +novel approach to characterizing both constraints through the relationship +between the sum vector and the query vector. The proximity of these vectors +addresses the similarity constraint, while necessitating that individual +vectors within the sum vector divergently align with the query vector to +satisfy the diversity constraint. We also formulate a new combinatorial +optimization challenge: selecting $k$ vectors from a set of +candidates such that their sum vector maximally aligns with the query vector, a +problem we demonstrate to be NP-complete. This establishes the profound +difficulty of pursuing similarity and diversity simultaneously in vector +retrieval and lays a theoretical groundwork for further research. Additionally, +we present the heuristic algorithm Vectors Retrieval with Similarity and +Diversity (VRSD), which not only has a definitive optimization goal and eschews +the need for preset parameters but also offers a modest reduction in time +complexity compared to MMR. Empirical validation further confirms that VRSD +significantly surpasses MMR across various datasets. + +</p>
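+ The stated objective (choose $k$ vectors whose sum vector aligns maximally with the query) can be approximated greedily, as in the hedged sketch below; this is an illustrative heuristic in the spirit of the problem statement, not the paper's VRSD algorithm.
+```python
+# Greedy sketch: repeatedly pick the candidate that most increases the cosine
+# similarity between the running sum vector and the query. Illustrative only.
+import numpy as np
+
+
+def greedy_sum_alignment(query: np.ndarray, candidates: np.ndarray, k: int) -> list:
+    """candidates: (n, dim). Returns indices of the k greedily chosen vectors."""
+    chosen = []
+    current_sum = np.zeros_like(query, dtype=float)
+    for _ in range(k):
+        best_idx, best_score = -1, -np.inf
+        for i in range(len(candidates)):
+            if i in chosen:
+                continue
+            s = current_sum + candidates[i]
+            score = float(s @ query) / (np.linalg.norm(s) * np.linalg.norm(query) + 1e-12)
+            if score > best_score:
+                best_idx, best_score = i, score
+        chosen.append(best_idx)
+        current_sum += candidates[best_idx]
+    return chosen
+```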
+
+
+
+
+ + ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ☆ Spontaneous Reward Hacking in Iterative Self-Refinement + + +
+ Language models are capable of iteratively improving their outputs based on +natural language feedback, thus enabling in-context optimization of user +preference. In place of human users, a second language model can be used as an +evaluator, providing feedback along with numerical ratings which the generator +attempts to optimize. However, because the evaluator is an imperfect proxy of +user preference, this optimization can lead to reward hacking, where the +evaluator's ratings improve while the generation quality remains stagnant or +even decreases as judged by actual user preference. The concern of reward +hacking is heightened in iterative self-refinement where the generator and the +evaluator use the same underlying language model, in which case the +optimization pressure can drive them to exploit shared vulnerabilities. Using +an essay editing task, we show that iterative self-refinement leads to +deviation between the language model evaluator and human judgment, +demonstrating that reward hacking can occur spontaneously in-context with the +use of iterative self-refinement. In addition, we study conditions under which +reward hacking occurs and observe two factors that affect reward hacking +severity: model size and context sharing between the generator and the +evaluator. + +
+
+
+
+
+ + ☆ Strengthening Structural Inductive Biases by Pre-training to Perform + Syntactic Transformations + + +
+ Models need appropriate inductive biases to effectively learn from small +amounts of data and generalize systematically outside of the training +distribution. While Transformers are highly versatile and powerful, they can +still benefit from enhanced structural inductive biases for seq2seq tasks, +especially those involving syntactic transformations, such as converting active +to passive voice or semantic parsing. In this paper, we propose to strengthen +the structural inductive bias of a Transformer by intermediate pre-training to +perform synthetically generated syntactic transformations of dependency trees +given a description of the transformation. Our experiments confirm that this +helps with few-shot learning of syntactic tasks such as chunking, and also +improves structural generalization for semantic parsing. Our analysis shows +that the intermediate pre-training leads to attention heads that keep track of +which syntactic transformation needs to be applied to which token, and that the +model can leverage these attention heads on downstream tasks. + +
+
+
+
+
+ + ☆ PoPreRo: A New Dataset for Popularity Prediction of Romanian Reddit + Posts ICPR 2024 + + +
+ We introduce PoPreRo, the first dataset for Popularity Prediction of Romanian +posts collected from Reddit. The PoPreRo dataset includes a varied compilation +of post samples from five distinct subreddits of Romania, totaling 28,107 data +samples. Along with our novel dataset, we introduce a set of competitive models +to be used as baselines for future research. Interestingly, the top-scoring +model achieves an accuracy of 61.35% and a macro F1 score of 60.60% on the test +set, indicating that the popularity prediction task on PoPreRo is very +challenging. Further investigations based on few-shot prompting the Falcon-7B +Large Language Model also point in the same direction. We thus believe that +PoPreRo is a valuable resource that can be used to evaluate models on +predicting the popularity of social media posts in Romanian. We release our +dataset at https://github.com/ana-rogoz/PoPreRo. + +
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in + Tunisian Dialect + + +
+ Speech encoders pretrained through self-supervised learning (SSL) have +demonstrated remarkable performance in various downstream tasks, including +Spoken Language Understanding (SLU) and Automatic Speech Recognition (ASR). For +instance, fine-tuning SSL models for such tasks has shown significant +potential, leading to improvements in the SOTA performance across challenging +datasets. In contrast to existing research, this paper contributes by comparing +the effectiveness of SSL approaches in the context of (i) the low-resource +spoken Tunisian Arabic dialect and (ii) its combination with a low-resource SLU +and ASR scenario, where only a few semantic annotations are available for +fine-tuning. We conduct experiments using many SSL speech encoders on the +TARIC-SLU dataset. We use speech encoders that were pre-trained on either +monolingual or multilingual speech data. Some of them have also been refined, +without any in-domain or Tunisian data, through a multimodal supervised +teacher-student paradigm. This study yields numerous significant findings that +we discuss in this paper. + +</p>
+
+ comment: Accepted in ArabicNLP 2024 +
+
+
+
+
+ + ☆ GPT vs RETRO: Exploring the Intersection of Retrieval and + Parameter-Efficient Fine-Tuning + + +
+ Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation +(RAG) have become popular methods for adapting large language models while +minimizing compute requirements. In this paper, we apply PEFT methods +(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer +(RETRO) and a baseline GPT model across several sizes, ranging from 823 million +to 48 billion parameters. We show that RETRO models outperform GPT models in +zero-shot settings due to their unique pre-training process, but GPT models have +higher performance potential with PEFT. Additionally, our study indicates that +8B parameter models strike an optimal balance between cost and performance, and +that P-tuning lags behind other PEFT techniques. We further provide a comparative +analysis of applying PEFT to an instruction-tuned RETRO model versus a base +RETRO model. This work presents the first comprehensive comparison of various +PEFT methods integrated with RAG, applied to both GPT and RETRO models, +highlighting their relative performance. + +</p>
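+ For context, a hedged sketch of applying LoRA with the Hugging Face peft library to a GPT-style model is shown below; the base model, rank, and target modules are illustrative assumptions and do not reproduce the paper's RETRO/GPT setup.
+```python
+# Minimal LoRA setup with the peft library; model name and hyperparameters are
+# placeholders, not the configuration used in the paper.
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+model = AutoModelForCausalLM.from_pretrained("gpt2")          # stand-in base model
+lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["c_attn"],
+                         lora_dropout=0.05, task_type="CAUSAL_LM")
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()   # only the low-rank adapters are trainable
+```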
+
+
+
+
+ + ☆ Leveraging Graph Structures to Detect Hallucinations in Large Language + Models + + +
+ Large language models are extensively applied across a wide range of tasks, +such as customer support, content creation, educational tutoring, and providing +financial guidance. However, a well-known drawback is their predisposition to +generate hallucinations. This damages the trustworthiness of the information +these models provide, impacting decision-making and user confidence. We propose +a method to detect hallucinations by looking at the structure of the latent +space and finding associations within hallucinated and non-hallucinated +generations. We create a graph structure that connects generations that lie +closely in the embedding space. Moreover, we employ a Graph Attention Network +which utilizes message passing to aggregate information from neighboring nodes +and assigns varying degrees of importance to each neighbor based on their +relevance. Our findings show that 1) there exists a structure in the latent +space that differentiates between hallucinated and non-hallucinated +generations, 2) Graph Attention Networks can learn this structure and +generalize it to unseen generations, and 3) the robustness of our method is +enhanced when incorporating contrastive learning. When evaluated against +evidence-based benchmarks, our model performs similarly without access to +search-based methods. + +
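+ A hedged sketch of the described pipeline, assuming torch_geometric (with torch-cluster) is available: generations that lie close in embedding space are connected by a k-NN graph and classified with a small Graph Attention Network. The embedding source, graph construction, and two-layer GAT are illustrative assumptions, not the paper's exact architecture.
+```python
+# Illustrative hallucination-detection graph: k-NN edges over generation
+# embeddings, node classification with a two-layer GAT. Assumptions throughout.
+import torch
+from torch_geometric.nn import GATConv, knn_graph
+
+
+class HallucinationGAT(torch.nn.Module):
+    def __init__(self, dim: int, hidden: int = 64):
+        super().__init__()
+        self.conv1 = GATConv(dim, hidden, heads=4, concat=True)
+        self.conv2 = GATConv(hidden * 4, 2, heads=1, concat=False)   # 2 classes
+
+    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
+        h = torch.relu(self.conv1(x, edge_index))
+        return self.conv2(h, edge_index)
+
+
+embeddings = torch.randn(100, 384)          # one embedding per generation
+edge_index = knn_graph(embeddings, k=8)     # link generations that lie close together
+logits = HallucinationGAT(dim=384)(embeddings, edge_index)
+```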
+
+
+
+
+ + ☆ Controlling Whisper: Universal Acoustic Adversarial Attacks to Control + Speech Foundation Models + + +
+ Speech enabled foundation models, either in the form of flexible speech +recognition based systems or audio-prompted large language models (LLMs), are +becoming increasingly popular. One of the interesting aspects of these models +is their ability to perform tasks other than automatic speech recognition (ASR) +using an appropriate prompt. For example, the OpenAI Whisper model can perform +both speech transcription and speech translation. With the development of +audio-prompted LLMs there is the potential for even greater control options. In +this work we demonstrate that with this greater flexibility the systems can be +susceptible to model-control adversarial attacks. Without any access to the +model prompt it is possible to modify the behaviour of the system by +appropriately changing the audio input. To illustrate this risk, we demonstrate +that it is possible to prepend a short universal adversarial acoustic segment +to any input speech signal to override the prompt setting of an ASR foundation +model. Specifically, we successfully use a universal adversarial acoustic +segment to control Whisper to always perform speech translation, despite being +set to perform speech transcription. Overall, this work demonstrates a new form +of adversarial attack on multi-tasking speech enabled foundation models that +needs to be considered prior to the deployment of this form of model. + +
+
+
+
+
+ + ☆ EventChat: Implementation and user-centric evaluation of a large + language model-driven conversational recommender system for exploring leisure + events in an SME context + + +
+ Large language models (LLMs) present an enormous evolution in the strategic +potential of conversational recommender systems (CRS). Yet to date, research +has predominantly focused upon technical frameworks to implement LLM-driven +CRS, rather than end-user evaluations or strategic implications for firms, +particularly from the perspective of small and medium-sized enterprises (SMEs) that +make up the bedrock of the global economy. In the current paper, we detail the +design of an LLM-driven CRS in an SME setting, and its subsequent performance +in the field using both objective system metrics and subjective user +evaluations. While doing so, we additionally outline a short-form revised +ResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly +evolving field. Our results reveal good system performance from a user +experience perspective (85.5% recommendation accuracy) but underscore latency, +cost, and quality issues challenging business viability. Notably, with a median +cost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and +response time emerge as crucial areas for achieving a more user-friendly and +economically viable LLM-driven CRS for SME settings. One major driver of these +costs is the use of an advanced LLM as a ranker within the retrieval-augmented +generation (RAG) technique. Our results additionally indicate that relying +solely on approaches such as prompt-based learning with ChatGPT as the +underlying LLM makes it challenging to achieve satisfactory quality in a +production environment. Strategic considerations for SMEs deploying an +LLM-driven CRS are outlined, particularly considering trade-offs in the current +technical landscape. + +</p>
+
+ comment: 27 pages, 3 tables, 5 figures, pre-print manuscript +
+
+
+
+
+ + ☆ Are Large Language Models Strategic Decision Makers? A Study of + Performance and Bias in Two-Player Non-Zero-Sum Games + + +
+ Large Language Models (LLMs) have been increasingly used in real-world +settings, yet their strategic abilities remain largely unexplored. Game theory +provides a good framework for assessing the decision-making abilities of LLMs +in interactions with other agents. Although prior studies have shown that LLMs +can solve these tasks with carefully curated prompts, they fail when the +problem setting or prompt changes. In this work we investigate LLMs' behaviour +in two strategic games, Stag Hunt and Prisoner's Dilemma, analyzing performance +variations under different settings and prompts. Our results show that the +tested state-of-the-art LLMs exhibit at least one of the following systematic +biases: (1) positional bias, (2) payoff bias, or (3) behavioural bias. +We further observe that the LLMs' performance drops when the game +configuration is misaligned with the affecting biases. Performance is assessed +based on the selection of the correct action, one which agrees with the +prompted preferred behaviours of both players. Alignment refers to whether the +LLM's bias aligns with the correct action. For example, GPT-4o's average +performance drops by 34% when misaligned. Additionally, the current trend of +"bigger and newer is better" does not hold for the above, where GPT-4o (the +current best-performing LLM) suffers the most substantial performance drop. +Lastly, we note that while chain-of-thought prompting does reduce the effect of +the biases on most models, it is far from solving the problem at the +fundamental level. + +</p>
+
+ comment: 8 pages (19 with appendix), 6 figures in the main body (4 in the + appendix), 4 tables in the main body +
+
+
+
+
+ + ☆ Using LLMs to label medical papers according to the CIViC evidence model + + +
+ We introduce the sequence classification problem CIViC Evidence to the field +of medical NLP. CIViC Evidence denotes the multi-label classification problem +of assigning labels of clinical evidence to abstracts of scientific papers +which have examined various combinations of genomic variants, cancer types, and +treatment approaches. We approach CIViC Evidence using different language +models: We fine-tune pretrained checkpoints of BERT and RoBERTa on the CIViC +Evidence dataset and challenge their performance with models of the same +architecture which have been pretrained on domain-specific text. In this +context, we find that BiomedBERT and BioLinkBERT can outperform BERT on CIViC +Evidence (+0.8% and +0.9% absolute improvement in class-support weighted F1 +score). All transformer-based models show a clear performance edge when +compared to a logistic regression trained on bigram tf-idf scores (+1.5 - 2.7% +improved F1 score). We compare the aforementioned BERT-like models to OpenAI's +GPT-4 in a few-shot setting (on a small subset of our original test dataset), +demonstrating that, without additional prompt-engineering or fine-tuning, GPT-4 +performs worse on CIViC Evidence than our six fine-tuned models (66.1% weighted +F1 score compared to 71.8% for the best fine-tuned model). However, performance +gets reasonably close to the benchmark of a logistic regression model trained +on bigram tf-idf scores (67.7% weighted F1 score). + +
+
+
+
+
+ + ☆ Generalists vs. Specialists: Evaluating Large Language Models for Urdu + + +
+ In this paper, we compare general-purpose pretrained models, GPT-4-Turbo and +Llama-3-8b-Instruct, with special-purpose models fine-tuned on specific tasks, +XLM-Roberta-large, mT5-large, and Llama-3-8b-Instruct. We focus on seven +classification and six generation tasks to evaluate the performance of these +models on the Urdu language. Urdu has 70 million native speakers, yet it remains +underrepresented in Natural Language Processing (NLP). Despite the frequent +advancements in Large Language Models (LLMs), their performance in low-resource +languages, including Urdu, remains underexplored. We also conduct a human +evaluation for the generation tasks and compare the results with the +evaluations performed by GPT-4-Turbo and Llama-3-8b-Instruct. We find that +special-purpose models consistently outperform general-purpose models across +various tasks. We also find that the evaluation done by GPT-4-Turbo for +generation tasks aligns more closely with human evaluation compared to the +evaluation by Llama-3-8b-Instruct. This paper contributes to the NLP community +by providing insights into the effectiveness of general-purpose and special-purpose +LLMs for low-resource languages. + +</p>
+
+
+
+
+ + ☆ TokenVerse: Unifying Speech and NLP Tasks via Transducer-based ASR + + +
+ In traditional conversational intelligence from speech, a cascaded pipeline +is used, involving tasks such as voice activity detection, diarization, +transcription, and subsequent processing with different NLP models for tasks +like semantic endpointing and named entity recognition (NER). Our paper +introduces TokenVerse, a single Transducer-based model designed to handle +multiple tasks. This is achieved by integrating task-specific tokens into the +reference text during ASR model training, streamlining the inference and +eliminating the need for separate NLP models. In addition to ASR, we conduct +experiments on 3 different tasks: speaker change detection, endpointing, and +NER. Our experiments on a public and a private dataset show that the proposed +method improves ASR by up to 7.7% in relative WER while outperforming the +cascaded pipeline approach in individual task performance. Additionally, we +present task transfer learning to a new task within an existing TokenVerse. + +
+
+ comment: 5 pages, double column +
+
+
+
+
+ + ☆ From 'Showgirls' to 'Performers': Fine-tuning with Gender-inclusive + Language for Bias Reduction in LLMs ACL 2024 + + +
+ Gender bias is not only prevalent in Large Language Models (LLMs) and their +training data, but also firmly ingrained into the structural aspects of +language itself. Therefore, adapting linguistic structures within LLM training +data to promote gender-inclusivity can make gender representations within the +model more inclusive. The focus of our work is gender-exclusive affixes in +English, such as in 'show-girl' or 'man-cave', which can perpetuate gender +stereotypes and binary conceptions of gender. We use an LLM training dataset to +compile a catalogue of 692 gender-exclusive terms along with gender-neutral +variants and, from this, develop a gender-inclusive fine-tuning dataset, the +'Tiny Heap'. Fine-tuning three different LLMs with this dataset, we observe an +overall reduction in gender-stereotyping tendencies across the models. Our +approach provides a practical method for enhancing gender inclusivity in LLM +training data and contributes to incorporating queer-feminist linguistic +activism in bias mitigation research in NLP. + +</p>
+
+ comment: 10 pages, 5 tables; to appear in Proceedings of the 5th Workshop on + Gender Bias in Natural Language Processing at ACL 2024 +
+
+
+
+
+ + ☆ Waterfall: Framework for Robust and Scalable Text Watermarking + + +
+ Protecting intellectual property (IP) of text such as articles and code is +increasingly important, especially as sophisticated attacks become possible, +such as paraphrasing by large language models (LLMs) or even unauthorized +training of LLMs on copyrighted text to infringe such IP. However, existing +text watermarking methods are not robust enough against such attacks nor +scalable to millions of users for practical implementation. In this paper, we +propose Waterfall, the first training-free framework for robust and scalable +text watermarking applicable across multiple text types (e.g., articles, code) +and languages supportable by LLMs, for general text and LLM data provenance. +Waterfall comprises several key innovations, such as being the first to use LLMs +as paraphrasers for watermarking along with a novel combination of techniques +that are surprisingly effective in achieving robust verifiability and +scalability. We empirically demonstrate that Waterfall achieves significantly +better scalability, robust verifiability, and computational efficiency compared +to SOTA article-text watermarking methods, and also show how it can be +directly applied to the watermarking of code. + +</p>
+
+
+
+
+ + ☆ Romanization Encoding For Multilingual ASR + + +
+ We introduce romanization encoding for script-heavy languages to optimize +multilingual and code-switching Automatic Speech Recognition (ASR) systems. By +adopting romanization encoding alongside a balanced concatenated tokenizer +within a FastConformer-RNNT framework equipped with a Roman2Char module, we +significantly reduce vocabulary and output dimensions, enabling larger training +batches and reduced memory consumption. Our method decouples acoustic modeling +and language modeling, enhancing the flexibility and adaptability of the +system. In our study, applying this method to Mandarin-English ASR resulted in +a remarkable 63.51% vocabulary reduction and notable performance gains of +13.72% and 15.03% on SEAME code-switching benchmarks. Ablation studies on +Mandarin-Korean and Mandarin-Japanese highlight our method's strong capability +to address the complexities of other script-heavy languages, paving the way for +more versatile and effective multilingual ASR systems. + +
+
+
+
+
+ + ☆ Crafting Large Language Models for Enhanced Interpretability ICML 2024 + + +
+ We introduce the Concept Bottleneck Large Language Model (CB-LLM), a +pioneering approach to creating inherently interpretable Large Language Models +(LLMs). Unlike traditional black-box LLMs that rely on post-hoc interpretation +methods with limited neuron function insights, CB-LLM sets a new standard with +its built-in interpretability, scalability, and ability to provide clear, +accurate explanations. This innovation not only advances transparency in +language models but also enhances their effectiveness. Our unique Automatic +Concept Correction (ACC) strategy successfully narrows the performance gap with +conventional black-box LLMs, positioning CB-LLM as a model that combines the +high accuracy of traditional LLMs with the added benefit of clear +interpretability -- a feature markedly absent in existing LLMs. + +
+
+ comment: Present at ICML 2024 Mechanistic Interpretability (MI) Workshop +
+
+
+
+
+ + ☆ Jailbreak Attacks and Defenses Against Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have performed exceptionally in various +text-generative tasks, including question answering, translation, code +completion, etc. However, the over-assistance of LLMs has raised the challenge +of "jailbreaking", which induces the model to generate malicious responses +against the usage policy and society by designing adversarial prompts. With the +emergence of jailbreak attack methods exploiting different vulnerabilities in +LLMs, the corresponding safety alignment measures are also evolving. In this +paper, we propose a comprehensive and detailed taxonomy of jailbreak attack and +defense methods. For instance, the attack methods are divided into black-box +and white-box attacks based on the transparency of the target model. Meanwhile, +we classify defense methods into prompt-level and model-level defenses. +Additionally, we further subdivide these attack and defense methods into +distinct sub-classes and present a coherent diagram illustrating their +relationships. We also conduct an investigation into the current evaluation +methods and compare them from different perspectives. Our findings aim to +inspire future research and practical implementations in safeguarding LLMs +against adversarial attacks. Above all, although jailbreak remains a +significant concern within the community, we believe that our work enhances the +understanding of this domain and provides a foundation for developing more +secure LLMs. + +
+
+
+
+
+ + ☆ Systematic Evaluation of Online Speaker Diarization Systems Regarding + their Latency + + +
+ In this paper, different online speaker diarization systems are evaluated on +the same hardware with the same test data with regard to their latency. The +latency is the time span from audio input to the output of the corresponding +speaker label. As part of the evaluation, various model combinations within the +DIART framework, a diarization system based on the online clustering algorithm +UIS-RNN-SML, and the end-to-end online diarization system FS-EEND are compared. +The lowest latency is achieved for the DIART-pipeline with the embedding model +pyannote/embedding and the segmentation model pyannote/segmentation. The +FS-EEND system shows a similarly good latency. To the best of our knowledge, there is currently no +published research that compares several online diarization systems in terms of +their latency, which makes this work particularly relevant. + +</p>
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous + Speech INTERSPEECH 2024 + + +
+ Prevalent ungrammatical expressions and disfluencies in spontaneous speech +from second language (L2) learners pose unique challenges to Automatic Speech +Recognition (ASR) systems. However, few datasets are tailored to L2 learner +speech. We publicly release LearnerVoice, a dataset consisting of 50.04 hours +of audio and transcriptions of L2 learners' spontaneous speech. Our linguistic +analysis reveals that transcriptions in our dataset contain L2S (L2 learner's +Spontaneous speech) features, consisting of ungrammatical expressions and +disfluencies (e.g., filler words, word repetitions, self-repairs, false +starts), significantly more than native speech datasets. Fine-tuning +whisper-small.en with LearnerVoice achieves a WER of 10.26%, 44.2% lower than +vanilla whisper-small.en. Furthermore, our qualitative analysis indicates that +54.2% of errors from the vanilla model on LearnerVoice are attributable to L2S +features, with 48.1% of them being reduced in the fine-tuned model. + +
+
+ comment: Accepted for INTERSPEECH 2024 +
+
+
+
+
+ + ☆ BiosERC: Integrating Biography Speakers Supported by LLMs for ERC Tasks ICANN 2024 + + +
+ In the Emotion Recognition in Conversation task, recent investigations have +utilized attention mechanisms exploring relationships among utterances from +intra- and inter-speakers for modeling emotional interaction between them. +However, attributes such as speaker personality traits remain unexplored and +present challenges in terms of their applicability to other tasks or +compatibility with diverse model architectures. Therefore, this work introduces +a novel framework named BiosERC, which investigates speaker characteristics in +a conversation. By employing Large Language Models (LLMs), we extract the +"biographical information" of the speaker within a conversation as +supplementary knowledge injected into the model to classify emotional labels +for each utterance. Our proposed method achieved state-of-the-art (SOTA) +results on three famous benchmark datasets: IEMOCAP, MELD, and EmoryNLP, +demonstrating the effectiveness and generalization of our model and showcasing +its potential for adaptation to various conversation analysis tasks. Our source +code is available at https://github.com/yingjie7/BiosERC. + +
+
+ comment: Accepted in the 33rd International Conference on Artificial Neural + Networks (ICANN 2024) +
+
+
+
+
+ + ☆ Unified Interpretation of Smoothing Methods for Negative Sampling Loss + Functions in Knowledge Graph Embedding RepL4NLP + + +
+ Knowledge Graphs (KGs) are fundamental resources in knowledge-intensive tasks +in NLP. Due to the limitation of manually creating KGs, KG Completion (KGC) has +an important role in automatically completing KGs by scoring their links with +KG Embedding (KGE). To handle many entities in training, KGE relies on Negative +Sampling (NS) loss that can reduce the computational cost by sampling. Since each link appears at most once in a KG, sparsity is an +essential and inevitable problem, and the NS loss is no exception. As a solution, +the NS loss in KGE relies on smoothing methods like Self-Adversarial Negative +Sampling (SANS) and subsampling. However, it is uncertain what kind of +smoothing method is suitable for this purpose due to the lack of theoretical +understanding. This paper provides theoretical interpretations of the smoothing +methods for the NS loss in KGE and induces a new NS loss, Triplet Adaptive +Negative Sampling (TANS), that can cover the characteristics of the +conventional smoothing methods. Experimental results of TransE, DistMult, +ComplEx, RotatE, HAKE, and HousE on FB15k-237, WN18RR, and YAGO3-10 datasets +and their sparser subsets show the soundness of our interpretation and the +performance improvement achieved by TANS. + +</p>
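+ For reference, the self-adversarial negative sampling (SANS) loss that this line of work builds on is commonly written as below, where $d_r$ is the distance, $f_r$ the triple score, $\gamma$ the margin, and $\alpha$ the adversarial temperature; the paper's TANS formulation is not reproduced here.
+```latex
+% Standard SANS loss as popularized by RotatE; shown for reference only.
+L = -\log \sigma\!\left(\gamma - d_r(\mathbf{h}, \mathbf{t})\right)
+    - \sum_{i=1}^{n} p(h'_i, r, t'_i)\,
+      \log \sigma\!\left(d_r(\mathbf{h}'_i, \mathbf{t}'_i) - \gamma\right),
+\qquad
+p(h'_i, r, t'_i)
+  = \frac{\exp\!\left(\alpha\, f_r(\mathbf{h}'_i, \mathbf{t}'_i)\right)}
+         {\sum_{j} \exp\!\left(\alpha\, f_r(\mathbf{h}'_j, \mathbf{t}'_j)\right)}
+```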
+
+ comment: 9 pages, 4 figures, 2 tables; accepted to workshop RepL4NLP held in + conjunction with ACL 2024 +
+
+
+
+
+ + ☆ ArAIEval Shared Task: Propagandistic Techniques Detection in Unimodal + and Multimodal Arabic Content + + +
+ We present an overview of the second edition of the ArAIEval shared task, +organized as part of the ArabicNLP 2024 conference co-located with ACL 2024. In +this edition, ArAIEval offers two tasks: (i) detection of propagandistic +textual spans with persuasion techniques identification in tweets and news +articles, and (ii) distinguishing between propagandistic and non-propagandistic +memes. A total of 14 teams participated in the final evaluation phase, with 6 +and 9 teams participating in Tasks 1 and 2, respectively. Finally, 11 teams +submitted system description papers. Across both tasks, we observed that +fine-tuning transformer models such as AraBERT was at the core of the majority +of the participating systems. We provide a description of the task setup, +including a description of the dataset construction and the evaluation setup. +We further provide a brief overview of the participating systems. All datasets +and evaluation scripts are released to the research community +(https://araieval.gitlab.io/). We hope this will enable further research on +these important tasks in Arabic. + +
+
+ comment: propaganda, span detection, disinformation, misinformation, fake + news, LLMs, GPT-4, multimodality, multimodal LLMs +
+
+
+
+
+ + ♻ ☆ mPLM-Sim: Better Cross-Lingual Similarity and Transfer in Multilingual + Pretrained Language Models EACL 2024 + + +
+ Recent multilingual pretrained language models (mPLMs) have been shown to +encode strong language-specific signals, which are not explicitly provided +during pretraining. It remains an open question whether it is feasible to +employ mPLMs to measure language similarity, and subsequently use the +similarity results to select source languages for boosting cross-lingual +transfer. To investigate this, we propose mPLM-Sim, a language similarity +measure that induces the similarities across languages from mPLMs using +multi-parallel corpora. Our study shows that mPLM-Sim exhibits moderately high +correlations with linguistic similarity measures, such as lexicostatistics, +genealogical language family, and geographical sprachbund. We also conduct a +case study on languages with low correlation and observe that mPLM-Sim yields +more accurate similarity results. Additionally, we find that similarity results +vary across different mPLMs and different layers within an mPLM. We further +investigate whether mPLM-Sim is effective for zero-shot cross-lingual transfer +by conducting experiments on both low-level syntactic tasks and high-level +semantic tasks. The experimental results demonstrate that mPLM-Sim is capable +of selecting better source languages than linguistic measures, resulting in a +1%-2% improvement in zero-shot cross-lingual transfer performance. + +</p>
+
+ comment: EACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ OpenDebateEvidence: A Massive-Scale Argument Mining and Summarization + Dataset ACL2024 + + +
+ We introduce OpenDebateEvidence, a comprehensive dataset for argument mining +and summarization sourced from the American Competitive Debate community. This +dataset includes over 3.5 million documents with rich metadata, making it one +of the most extensive collections of debate evidence. OpenDebateEvidence +captures the complexity of arguments in high school and college debates, +providing valuable resources for training and evaluation. Our extensive +experiments demonstrate the efficacy of fine-tuning state-of-the-art large +language models for argumentative abstractive summarization across various +methods, models, and datasets. By providing this comprehensive resource, we aim +to advance computational argumentation and support practical applications for +debaters, educators, and researchers. OpenDebateEvidence is publicly available +to support further research and innovation in computational argumentation. +Access it here: https://huggingface.co/datasets/Yusuf5/OpenCaselist + +
+
+ comment: Accepted for Publication to ARGMIN 2024 at ACL2024 +
+
+
+
+
+ + ♻ ☆ Improving Low-Resource Knowledge Tracing Tasks by Supervised + Pre-training and Importance Mechanism Fine-tuning + + +
+ Knowledge tracing (KT) aims to estimate students' knowledge mastery based on +their historical interactions. Recently, deep learning based KT (DLKT) +approaches have achieved impressive performance in the KT task. These DLKT +models heavily rely on a large number of available student interactions. +However, due to various reasons such as budget constraints and privacy +concerns, observed interactions are very limited in many real-world scenarios, +i.e., low-resource KT datasets. Directly training a DLKT model on a +low-resource KT dataset may lead to overfitting, and it is difficult to choose +the appropriate deep neural architecture. Therefore, in this paper, we propose +a low-resource KT framework called LoReKT to address the above challenges. Inspired +by the prevalent "pre-training and fine-tuning" paradigm, we aim to learn +transferable parameters and representations from rich-resource KT datasets +during the pre-training stage and subsequently facilitate effective adaptation +to low-resource KT datasets. Specifically, we simplify existing sophisticated +DLKT model architectures to purely a stack of transformer decoders. We design +an encoding mechanism to incorporate student interactions from multiple KT data +sources and develop an importance mechanism to prioritize updating parameters +with high importance while constraining less important ones during the +fine-tuning stage. We evaluate LoReKT on six public KT datasets and +experimental results demonstrate the superiority of our approach in terms of +AUC and Accuracy. To encourage reproducible research, we make our data and code +publicly available at https://anonymous.4open.science/r/LoReKT-C619. + +</p>
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ How Similar Are Elected Politicians and Their Constituents? Quantitative + Evidence From Online Social Networks + + +
+ How similar are politicians to those who vote for them? This is a critical +question at the heart of democratic representation and particularly relevant at +times when political dissatisfaction and populism are on the rise. To answer +this question we compare the online discourse of elected politicians and their +constituents. We collect a two and a half years (September 2020 - February +2023) constituency-level dataset for USA and UK that includes: (i) the Twitter +timelines (5.6 Million tweets) of elected political representatives (595 UK +Members of Parliament and 433 USA Representatives), (ii) the Nextdoor posts +(21.8 Million posts) of the constituency (98.4% USA and 91.5% UK +constituencies). We find that elected politicians tend to be equally similar to +their constituents in terms of content and style regardless of whether a +constituency elects a right or left-wing politician. The size of the electoral +victory and the level of income of a constituency shows a nuanced picture. The +narrower the electoral victory, the more similar the style and the more +dissimilar the content is. The lower the income of a constituency, the more +similar the content is. In terms of style, poorer constituencies tend to have a +more similar sentiment and more dissimilar psychological text traits (i.e. +measured with LIWC categories). + +
+
+
+
+
+ + ♻ ☆ From Representational Harms to Quality-of-Service Harms: A Case Study on + Llama 2 Safety Safeguards ACL 2024 + + +
+ Recent progress in large language models (LLMs) has led to their widespread +adoption in various domains. However, these advancements have also introduced +additional safety risks and raised concerns regarding their detrimental impact +on already marginalized populations. Despite growing mitigation efforts to +develop safety safeguards, such as supervised safety-oriented fine-tuning and +leveraging safe reinforcement learning from human feedback, multiple concerns +regarding the safety and ingrained biases in these models remain. Furthermore, +previous work has demonstrated that models optimized for safety often display +exaggerated safety behaviors, such as a tendency to refrain from responding to +certain requests as a precautionary measure. As such, a clear trade-off between +the helpfulness and safety of these models has been documented in the +literature. In this paper, we further investigate the effectiveness of safety +measures by evaluating models on already mitigated biases. Using the case of +Llama 2 as an example, we illustrate how LLMs' safety responses can still +encode harmful assumptions. To do so, we create a set of non-toxic prompts, +which we then use to evaluate Llama models. Through our new taxonomy of LLMs +responses to users, we observe that the safety/helpfulness trade-offs are more +pronounced for certain demographic groups which can lead to quality-of-service +harms for marginalized populations. + +
+
+ comment: 9 pages, 4 figures. Accepted to Findings of the Association for + Computational Linguistics: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Steering Llama 2 via Contrastive Activation Addition + + +
+ We introduce Contrastive Activation Addition (CAA), an innovative method for +steering language models by modifying their activations during forward passes. +CAA computes "steering vectors" by averaging the difference in residual stream +activations between pairs of positive and negative examples of a particular +behavior, such as factual versus hallucinatory responses. During inference, +these steering vectors are added at all token positions after the user's prompt +with either a positive or negative coefficient, allowing precise control over +the degree of the targeted behavior. We evaluate CAA's effectiveness on Llama 2 +Chat using multiple-choice behavioral question datasets and open-ended +generation tasks. We demonstrate that CAA significantly alters model behavior, +is effective over and on top of traditional methods like finetuning and system +prompt design, and minimally reduces capabilities. Moreover, we gain deeper +insights into CAA's mechanisms by employing various activation space +interpretation methods. CAA accurately steers model outputs and sheds light on +how high-level concepts are represented in Large Language Models (LLMs). + +
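+ A minimal sketch of the mechanism described above: the steering vector is the mean difference of activations between positive and negative examples at one layer, and a forward hook adds a scaled copy to the hidden states during generation. For simplicity the sketch adds the vector at every token position; the hook placement and coefficient are illustrative assumptions, not the exact CAA implementation.
+```python
+# Sketch of contrastive activation addition via a forward hook; the layer choice,
+# coefficient, and position handling are assumptions, not the paper's code.
+import torch
+
+
+def compute_steering_vector(pos_acts: torch.Tensor, neg_acts: torch.Tensor) -> torch.Tensor:
+    """pos_acts, neg_acts: (num_pairs, hidden_dim) activations at one layer."""
+    return (pos_acts - neg_acts).mean(dim=0)
+
+
+def add_steering_hook(layer: torch.nn.Module, vector: torch.Tensor, coeff: float):
+    """Register a forward hook that adds coeff * vector to the layer's hidden states."""
+    def hook(_module, _inputs, output):
+        hidden = output[0] if isinstance(output, tuple) else output
+        hidden = hidden + coeff * vector        # steer the residual stream
+        return (hidden, *output[1:]) if isinstance(output, tuple) else hidden
+    return layer.register_forward_hook(hook)
+```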
+
+
+
+
+ + ♻ ☆ Planning with Logical Graph-based Language Model for Instruction + Generation + + +
+ Despite the superior performance of large language models at generating natural +language texts, it is hard for them to generate texts with correct logic for a +given task, due to the difficulty neural models have in capturing implied rules +from free-form texts. In this paper, we propose a novel graph-based language +model, Logical-GLM, to infuse logic into language models for more valid text +generation and interpretability. Specifically, we first capture information +from natural language instructions and construct logical Bayes graphs that +generally describe domains. Next, we generate logical skeletons to guide +language model training, infusing domain knowledge into language models. +Finally, we alternately optimize the searching policy of graphs and language +models until convergence. The experimental results show that Logical-GLM is +both effective and efficient compared with traditional language models, despite +using smaller-scale training data and fewer parameters. Our approach can +generate instructional texts with more correct logic owing to the internalized +domain knowledge. Moreover, the usage of logical graphs reflects the inner +mechanism of the language models, which improves the interpretability of +black-box models. + +</p>
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Memory Sharing for Large Language Model based Agents + + +
+ The adaptation of Large Language Model (LLM)-based agents to execute tasks +via natural language prompts represents a significant advancement, notably +eliminating the need for explicit retraining or fine-tuning. However, such agents are +constrained by the comprehensiveness and diversity of the provided examples, +leading to outputs that often diverge significantly from expected results, +especially for open-ended questions. This paper introduces +Memory Sharing (MS), a framework that integrates real-time memory filtering, +storage, and retrieval to enhance the In-Context Learning process. This +framework allows for the sharing of memories among multiple agents, whereby the +interactions and shared memories between different agents effectively enhance +the diversity of the memories. The collective self-enhancement through +interactive learning among multiple agents facilitates the evolution from +individual intelligence to collective intelligence. In addition, the dynamically +growing memory pool is utilized not only to improve the quality of responses +but also to train and enhance the retriever. We evaluate our framework across +three distinct domains involving specialized tasks of agents. The experimental +results demonstrate that the MS framework significantly improves the agents' +performance in addressing open-ended questions. + +</p>
+
+
+
+
+ + ♻ ☆ Chunk, Align, Select: A Simple Long-sequence Processing Method for + Transformers ACL 2024 + + +
+ Although dominant in natural language processing, transformer-based models +remain challenged by the task of long-sequence processing, because the +computational cost of self-attention operations in transformers swells +quadratically with the input sequence length. To alleviate the complexity of +long-sequence processing, we propose a simple framework to enable +off-the-shelf pre-trained transformers to process much longer sequences, while +the computation and memory costs grow only linearly with the input +sequence lengths. More specifically, our method divides each long-sequence +input into a batch of chunks, then aligns the inter-chunk information during the +encoding steps, and finally selects the most representative hidden states from +the encoder for the decoding process. To extract inter-chunk semantic +information, we align the start and end token embeddings among chunks in each +encoding transformer block. To learn an effective hidden selection policy, we +design a dual updating scheme inspired by reinforcement learning, which regards +the decoders of transformers as environments, and the downstream performance +metrics as the rewards to evaluate the hidden selection actions. Our empirical +results on real-world long-text summarization and reading comprehension tasks +demonstrate effective improvements compared to prior long-sequence processing +baselines. + +</p>
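+ The chunking step described above can be sketched as below, assuming a fixed chunk size with optional overlap; the inter-chunk alignment and the reinforcement-learning-based hidden-state selection from the paper are not reproduced.
+```python
+# Illustrative chunking of a long token sequence; chunk size and overlap are
+# assumptions, not the paper's settings.
+from typing import List
+
+
+def chunk_tokens(token_ids: List[int], chunk_size: int = 512, overlap: int = 0) -> List[List[int]]:
+    """Split token_ids into chunks of at most chunk_size, optionally overlapping."""
+    step = chunk_size - overlap
+    return [token_ids[i:i + chunk_size] for i in range(0, len(token_ids), step)]
+
+
+chunks = chunk_tokens(list(range(2000)), chunk_size=512, overlap=64)
+```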
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs' Inherent Multi-hop Reasoning Ability + + +
+ While Large Language Models (LLMs) excel in question-answering (QA) tasks, +their multi-step reasoning abilities for integrating multiple pieces of evidence in +Multi-hop QA tasks remain underexplored. LLMs sometimes generate answers that +rely on internal memory rather than reasoning over the given context, which raises +concerns about the quality of evaluations of real reasoning abilities. The +counterfactual QA task can separate internal memory from reasoning abilities, +but focusing solely on final-QA performance without evaluating the multi-step +reasoning process is insufficient for reporting LLMs' real reasoning abilities. +Current Multi-hop QA (MHQA) benchmarks are factual and annotated on open-source +corpora such as Wikipedia; although useful for multi-step reasoning evaluation, +they show limitations due to potential data contamination in the LLMs' pre-training +stage. To address this issue, we introduce the Inherent Reasoning Evaluation +(IRE) method, a novel evaluation approach that jointly evaluates the LLMs' +chain-of-reasoning performance based on the first knowledge-edited +counterfactual multi-hop QA dataset, which involves editing the original Wikipedia +passages, reducing data contamination risks. The IRE comprehensively assesses +reasoning chains through sub-QA and final-QA evaluations. Our comparisons +reveal significant performance gaps for several LLMs between Wikipedia-based +benchmarks and IRE, suggesting data contamination issues in existing benchmarks. +We believe that the IRE benchmark will enhance and facilitate trustworthy LLM +evaluations. + +</p>
+
+
+
+
+ + ♻ ☆ EasyAnimate: A High-Performance Long Video Generation Method based on + Transformer Architecture + + +
+ This paper presents EasyAnimate, an advanced method for video generation that +leverages the power of transformer architecture for high-performance outcomes. +We have expanded the DiT framework originally designed for 2D image synthesis +to accommodate the complexities of 3D video generation by incorporating a +motion module block. It is used to capture temporal dynamics, thereby ensuring +the production of consistent frames and seamless motion transitions. The motion +module can be adapted to various DiT baseline methods to generate video with +different styles. It can also generate videos with different frame rates and +resolutions during both training and inference phases, suitable for both images +and videos. Moreover, we introduce slice VAE, a novel approach to condense the +temporal axis, facilitating the generation of long duration videos. Currently, +EasyAnimate exhibits the proficiency to generate videos with 144 frames. We +provide a holistic ecosystem for video production based on DiT, encompassing +aspects such as data pre-processing, VAE training, DiT models training (both +the baseline model and LoRA model), and end-to-end video inference. Code is +available at: https://github.com/aigc-apps/EasyAnimate. We are continuously +working to enhance the performance of our method. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks for Text Classification: A Survey + + +
+ Text Classification is the most essential and fundamental problem in Natural +Language Processing. While numerous recent text classification models applied +the sequential deep learning technique, graph neural network-based models can +directly deal with complex structured text data and exploit global information. +Many real text classification applications can be naturally cast into a graph, +which captures words, documents, and corpus global features. In this survey, we +bring the coverage of methods up to 2023, including corpus-level and +document-level graph neural networks. We discuss each of these methods in +detail, dealing with the graph construction mechanisms and the graph-based +learning process. As well as the technological survey, we look at issues behind +and future directions addressed in text classification using graph neural +networks. We also cover datasets, evaluation metrics, and experiment design and +present a summary of published performance on the publicly available +benchmarks. Note that we present a comprehensive comparison between different +techniques and identify the pros and cons of various evaluation metrics in this +survey. + +
+
+ comment: 28 pages, published in Artificial Intelligence Review +
+
+
+
+
+ + ♻ ☆ Should We Fine-Tune or RAG? Evaluating Different Techniques to Adapt + LLMs for Dialogue + + +
+ We study the limitations of Large Language Models (LLMs) for the task of +response generation in human-machine dialogue. Several techniques have been +proposed in the literature for different dialogue types (e.g., Open-Domain). +However, the evaluations of these techniques have been limited in terms of base +LLMs, dialogue types and evaluation metrics. In this work, we extensively +analyze different LLM adaptation techniques when applied to different dialogue +types. We have selected two base LLMs, Llama-2 and Mistral, and four dialogue +types Open-Domain, Knowledge-Grounded, Task-Oriented, and Question Answering. +We evaluate the performance of in-context learning and fine-tuning techniques +across datasets selected for each dialogue type. We assess the impact of +incorporating external knowledge to ground the generation in both scenarios of +Retrieval-Augmented Generation (RAG) and gold knowledge. We adopt consistent +evaluation and explainability criteria for automatic metrics and human +evaluation protocols. Our analysis shows that there is no universal +best-technique for adapting large language models as the efficacy of each +technique depends on both the base LLM and the specific type of dialogue. Last +but not least, the assessment of the best adaptation technique should include +human evaluation to avoid false expectations and outcomes derived from +automatic metrics. + +
+
+
+
+
+ + ♻ ☆ Remember This Event That Year? Assessing Temporal Information and + Reasoning in Large Language Models + + +
+ Large Language Models (LLMs) are increasingly ubiquitous, yet their ability +to retain and reason about temporal information remains limited, hindering +their application in real-world scenarios where understanding the sequential +nature of events is crucial. Our study experiments with 12 state-of-the-art +models (ranging from 2B to 70B+ parameters) on a novel numerical-temporal +dataset, \textbf{TempUN}, spanning from 10,000 BCE to 2100 CE, to uncover +significant temporal retention and comprehension limitations. We propose six +metrics to assess three learning paradigms to enhance temporal knowledge +acquisition. Our findings reveal that open-source models exhibit knowledge gaps +more frequently, suggesting a trade-off between limited knowledge and incorrect +responses. Additionally, various fine-tuning approaches significantly improved +performance, reducing incorrect outputs and impacting the identification of +'information not available' in the generations. The associated dataset and code +are available at (https://github.com/lingoiitgn/TempUN). + +
+
+
+
+
+ + ♻ ☆ Evaluating Quality of Answers for Retrieval-Augmented Generation: A + Strong LLM Is All You Need + + +
+ We present a comprehensive study of answer quality evaluation in +Retrieval-Augmented Generation (RAG) applications using vRAG-Eval, a novel +grading system that is designed to assess correctness, completeness, and +honesty. We further map the grading of quality aspects aforementioned into a +binary score, indicating an accept or reject decision, mirroring the intuitive +"thumbs-up" or "thumbs-down" gesture commonly used in chat applications. This +approach suits factual business settings where a clear decision opinion is +essential. Our assessment applies vRAG-Eval to two Large Language Models +(LLMs), evaluating the quality of answers generated by a vanilla RAG +application. We compare these evaluations with human expert judgments and find +a substantial alignment between GPT-4's assessments and those of human experts, +reaching 83% agreement on accept or reject decisions. This study highlights the +potential of LLMs as reliable evaluators in closed-domain, closed-ended +settings, particularly when human evaluations require significant resources. + +
+
+ comment: 13 pages, 8 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Do Physicians Know How to Prompt? The Need for Automatic Prompt + Optimization Help in Clinical Note Generation + + +
+ This study examines the effect of prompt engineering on the performance of +Large Language Models (LLMs) in clinical note generation. We introduce an +Automatic Prompt Optimization (APO) framework to refine initial prompts and +compare the outputs of medical experts, non-medical experts, and APO-enhanced +GPT3.5 and GPT4. Results highlight GPT4 APO's superior performance in +standardizing prompt quality across clinical note sections. A human-in-the-loop +approach shows that experts maintain content quality post-APO, with a +preference for their own modifications, suggesting the value of expert +customization. We recommend a two-phase optimization process, leveraging +APO-GPT4 for consistency and expert input for personalization. + +
+
+ comment: To appear in BioNLP 2024 +
+
+
+
+
+ + ♻ ☆ Learning Rate Curriculum + + +
+ Most curriculum learning methods require an approach to sort the data samples +by difficulty, which is often cumbersome to perform. In this work, we propose a +novel curriculum learning approach termed Learning Rate Curriculum (LeRaC), +which leverages the use of a different learning rate for each layer of a neural +network to create a data-agnostic curriculum during the initial training +epochs. More specifically, LeRaC assigns higher learning rates to neural layers +closer to the input, gradually decreasing the learning rates as the layers are +placed farther away from the input. The learning rates increase at various +paces during the first training iterations, until they all reach the same +value. From this point on, the neural model is trained as usual. This creates a +model-level curriculum learning strategy that does not require sorting the +examples by difficulty and is compatible with any neural network, generating +higher performance levels regardless of the architecture. We conduct +comprehensive experiments on 12 data sets from the computer vision (CIFAR-10, +CIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC), +language (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering +various convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5), +recurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare +our approach with the conventional training regime, as well as with Curriculum +by Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning +approach. Unlike CBS, our performance improvements over the standard training +regime are consistent across all data sets and models. Furthermore, we +significantly surpass CBS in terms of training time (there is no additional +cost over the standard training regime for LeRaC). Our code is freely available +at: https://github.com/CroitoruAlin/LeRaC. + +
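+ The schedule described above is simple enough to sketch. Below is a rough PyTorch illustration, not the authors' implementation: each layer starts from its own depth-dependent learning rate (higher near the input) and ramps up to a shared target rate over the first iterations; the toy model, starting rates, and warm-up length are assumptions.
```python
# Minimal sketch of a LeRaC-style per-layer learning-rate warm-up (assumed values).
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(),
                      nn.Linear(64, 64), nn.ReLU(),
                      nn.Linear(64, 10))

target_lr = 1e-3
layers = [m for m in model if isinstance(m, nn.Linear)]
# layers farther from the input start from smaller learning rates
init_lrs = [target_lr * (0.1 ** depth) for depth in range(len(layers))]

optimizer = torch.optim.SGD(
    [{"params": l.parameters(), "lr": lr} for l, lr in zip(layers, init_lrs)],
    lr=target_lr,
)

warmup_iters = 100
for it in range(warmup_iters):
    t = (it + 1) / warmup_iters
    for group, lr0 in zip(optimizer.param_groups, init_lrs):
        # interpolate from the per-layer start rate up to the shared target rate
        group["lr"] = lr0 * (target_lr / lr0) ** t
    # ... run one ordinary training step here; after warm-up, train as usual ...
```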
+
+ comment: Accepted at the International Journal of Computer Vision +
+
+
+
+
+ + ♻ ☆ Representation Surgery: Theory and Practice of Affine Steering ICML 2024 + + +
+ Language models often exhibit undesirable behavior, e.g., generating toxic or +gender-biased text. In the case of neural language models, an encoding of the +undesirable behavior is often present in the model's representations. Thus, one +natural (and common) approach to prevent the model from exhibiting undesirable +behavior is to steer the model's representations in a manner that reduces the +probability of it generating undesirable text. This paper investigates the +formal and empirical properties of steering functions, i.e., transformation of +the neural language model's representations that alter its behavior. First, we +derive two optimal, in the least-squares sense, affine steering functions under +different constraints. Our theory provides justification for existing +approaches and offers a novel, improved steering approach. Second, we offer a +series of experiments that demonstrate the empirical effectiveness of the +methods in mitigating bias and reducing toxic generation. + +
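+ To make the notion of a steering function concrete, here is a minimal sketch of applying an affine map h' = Wh + b to a hidden representation; W and b below are placeholders rather than the least-squares-optimal solutions derived in the paper.
```python
# Sketch of affine steering applied to one hidden representation (placeholder W, b).
import torch

d = 16
h = torch.randn(d)       # a hidden representation from a language model
W = torch.eye(d)         # placeholder steering matrix (identity = no change)
b = torch.zeros(d)       # placeholder steering offset

def steer(h: torch.Tensor, W: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Affine steering: transform the representation before decoding continues."""
    return W @ h + b

h_steered = steer(h, W, b)
print(torch.allclose(h, h_steered))  # True for the identity placeholder
```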
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ Temporal Knowledge Graph Question Answering: A Survey + + +
+ Knowledge Base Question Answering (KBQA) has been a long-standing field to +answer questions based on knowledge bases. Recently, the evolving dynamics of +knowledge have attracted a growing interest in Temporal Knowledge Graph +Question Answering (TKGQA), an emerging task to answer temporal questions. +However, this field grapples with ambiguities in defining temporal questions +and lacks a systematic categorization of existing methods for TKGQA. In +response, this paper provides a thorough survey from two perspectives: the +taxonomy of temporal questions and the methodological categorization for TKGQA. +Specifically, we first establish a detailed taxonomy of temporal questions +engaged in prior studies. Subsequently, we provide a comprehensive review of +TKGQA techniques of two categories: semantic parsing-based and TKG +embedding-based. Building on this review, the paper outlines potential research +directions aimed at advancing the field of TKGQA. This work aims to serve as a +comprehensive reference for TKGQA and to stimulate further research. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Transformer-Lite: High-efficiency Deployment of Large Language Models on + Mobile Phone GPUs + + +
+ The Large Language Model (LLM) is widely employed for tasks such as +intelligent assistants, text summarization, translation, and multi-modality on +mobile phones. However, the current methods for on-device LLM deployment +maintain slow inference speed, which causes poor user experience. To facilitate +high-efficiency LLM deployment on device GPUs, we propose four optimization +techniques: (a) a symbolic expression-based approach to support dynamic shape +model inference; (b) operator optimizations and execution priority setting to +enhance inference speed and reduce phone lagging; (c) an FP4 quantization +method termed M0E4 to reduce dequantization overhead; (d) a sub-tensor-based +technique to eliminate the need for copying KV cache after LLM inference. +Furthermore, we implement these methods in our mobile inference engine, +Transformer-Lite, which is compatible with both Qualcomm and MTK processors. We +evaluated Transformer-Lite's performance using LLMs with varied architectures +and parameters ranging from 2B to 14B. Specifically, we achieved prefill and +decoding speeds of 121 token/s and 14 token/s for ChatGLM2 6B, and 330 token/s +and 30 token/s for smaller Gemma 2B, respectively. Compared with CPU-based +FastLLM and GPU-based MLC-LLM, our engine attains over 10x speedup for the +prefill speed and 2~3x speedup for the decoding speed. + +
+
+ comment: 21 pages, 6 figures, fix "E0M4" spell mistake, fix FLOPS to TFLOPS +
+
+
+
+
+ + ♻ ☆ SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for + Southeast Asian Languages + + +
+ Southeast Asia (SEA) is a region rich in linguistic diversity and cultural +variety, with over 1,300 indigenous languages and a population of 671 million +people. However, prevailing AI models suffer from a significant lack of +representation of texts, images, and audio datasets from SEA, compromising the +quality of AI models for SEA languages. Evaluating models for SEA languages is +challenging due to the scarcity of high-quality datasets, compounded by the +dominance of English training data, raising concerns about potential cultural +misrepresentation. To address these challenges, we introduce SEACrowd, a +collaborative initiative that consolidates a comprehensive resource hub that +fills the resource gap by providing standardized corpora in nearly 1,000 SEA +languages across three modalities. Through our SEACrowd benchmarks, we assess +the quality of AI models on 36 indigenous languages across 13 tasks, offering +valuable insights into the current AI landscape in SEA. Furthermore, we propose +strategies to facilitate greater AI advancements, maximizing potential utility +and resource equity for the future of AI in SEA. + +
+
+ comment: https://github.com/SEACrowd +
+
+
+
+
+ + ♻ ☆ Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs + Using the New York Times Connections Word Game + + +
+ The New York Times Connections game has emerged as a popular and challenging +pursuit for word puzzle enthusiasts. We collect 200 Connections games to +evaluate the performance of state-of-the-art large language models (LLMs) +against expert and novice human players. Our results show that even the +best-performing LLM, GPT-4o, which has otherwise shown impressive reasoning +abilities on a wide variety of benchmarks, can only fully solve 8% of the +games. Compared to GPT-4o, novice and expert players perform better, with +expert human players significantly outperforming GPT-4o. To deepen our +understanding we create a taxonomy of the knowledge types required to +successfully categorize words in the Connections game, revealing that LLMs +struggle with associative, encyclopedic, and linguistic knowledge. Our findings +establish the New York Times Connections game as a challenging benchmark for +evaluating abstract reasoning capabilities in humans and AI systems. + +
+
+
+
+
+ + ♻ ☆ RAM: Towards an Ever-Improving Memory System by Learning from + Communications + + +
+ We introduce an innovative RAG-based framework with an ever-improving memory. +Inspired by humans' pedagogical process, RAM utilizes recursively +reasoning-based retrieval and experience reflections to continually update the +memory and learn from users' communicative feedback, namely communicative +learning. Extensive experiments with both simulated and real users demonstrate +significant improvements over traditional RAG and self-knowledge methods, +particularly excelling in handling false premise and multi-hop questions. +Furthermore, RAM exhibits promising adaptability to various feedback and +retrieval methods, showcasing its potential for advancing AI capabilities in +dynamic knowledge acquisition and lifelong learning.
+
+
+
+
+ + ♻ ☆ Towards Multimodal Sentiment Analysis Debiasing via Bias Purification ECCV 2024 + + +
+ Multimodal Sentiment Analysis (MSA) aims to understand human intentions by +integrating emotion-related clues from diverse modalities, such as visual, +language, and audio. Unfortunately, the current MSA task invariably suffers +from unplanned dataset biases, particularly multimodal utterance-level label +bias and word-level context bias. These harmful biases potentially mislead +models to focus on statistical shortcuts and spurious correlations, causing +severe performance bottlenecks. To alleviate these issues, we present a +Multimodal Counterfactual Inference Sentiment (MCIS) analysis framework based +on causality rather than conventional likelihood. Concretely, we first +formulate a causal graph to discover harmful biases from already-trained +vanilla models. In the inference phase, given a factual multimodal input, MCIS +imagines two counterfactual scenarios to purify and mitigate these biases. +Then, MCIS can make unbiased decisions from biased observations by comparing +factual and counterfactual outcomes. We conduct extensive experiments on +several standard MSA benchmarks. Qualitative and quantitative results show the +effectiveness of the proposed framework. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for + Sparse Architectural Large Language Models + + +
+ Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large +Language Models (LLMs) with constrained resources. Although there have been +various PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture +LLMs is still underexplored. In this work, we study the PEFT method for LLMs +with the Mixture-of-Experts (MoE) architecture and the contents of this work +are mainly threefold: (1) We investigate the dispersion degree of the activated +experts in customized tasks, and found that the routing distribution for a +specific task tends to be highly concentrated, while the distribution of +activated experts varies significantly across different tasks. (2) We propose +Expert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant +to downstream tasks while freezing the other experts and modules; experimental +results demonstrate that our method not only improves the tuning efficiency, +but also matches or even surpasses the performance of full-parameter +fine-tuning. (3) We further analyze the impact of the MoE architecture on +expert-specialized fine-tuning. We find that MoE models with finer-grained +experts are more advantageous in selecting the combination of experts that are +most relevant to downstream tasks, thereby enhancing both the training +efficiency and effectiveness. Our code is available at +https://github.com/deepseek-ai/ESFT. + +
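+ A rough sketch of the expert-selection idea, under the assumption that relevance is measured by how often the router activates each expert on task data; the toy MoE layer and top-k rule below are illustrative and are not the DeepSeek ESFT code.
```python
# Freeze everything except the experts most frequently routed to on the task data.
import torch
import torch.nn as nn

class ToyMoELayer(nn.Module):
    def __init__(self, d=32, n_experts=8):
        super().__init__()
        self.experts = nn.ModuleList([nn.Linear(d, d) for _ in range(n_experts)])
        self.router = nn.Linear(d, n_experts)

moe = ToyMoELayer()
# Assumed: per-expert activation counts collected on the downstream task.
routing_counts = torch.tensor([3, 120, 5, 2, 98, 1, 0, 4])
top_k = 2
keep = set(torch.topk(routing_counts, top_k).indices.tolist())

for p in moe.parameters():
    p.requires_grad = False              # freeze everything by default
for idx in keep:
    for p in moe.experts[idx].parameters():
        p.requires_grad = True           # tune only the most-used experts

trainable = sum(p.numel() for p in moe.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable}")
```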
+
+
+
+
+ + ♻ ☆ Language-Guided World Models: A Model-Based Approach to AI Control ACL 2024 + + +
+ This paper introduces the concept of Language-Guided World Models (LWMs) -- +probabilistic models that can simulate environments by reading texts. Agents +equipped with these models provide humans with more extensive and efficient +control, allowing them to simultaneously alter agent behaviors in multiple +tasks via natural verbal communication. In this work, we take initial steps in +developing robust LWMs that can generalize to compositionally novel language +descriptions. We design a challenging world modeling benchmark based on the +game of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that +require varying degrees of compositional generalization. Our experiments reveal +the lack of generalizability of the state-of-the-art Transformer model, as it +offers marginal improvements in simulation quality over a no-text baseline. We +devise a more robust model by fusing the Transformer with the EMMA attention +mechanism (Hanjie et al., 2021). Our model substantially outperforms the +Transformer and approaches the performance of a model with an oracle semantic +parsing and grounding capability. To demonstrate the practicality of this model +in improving AI safety and transparency, we simulate a scenario in which the +model enables an agent to present plans to a human before execution, and to +revise plans based on their language feedback. + +
+
+ comment: SpLU-RoboNLP workshop at ACL 2024 +
+
+
+
+
+ + ♻ ☆ Large Language Models for Cuffless Blood Pressure Measurement From + Wearable Biosignals + + +
+ Large language models (LLMs) have captured significant interest from both +academia and industry due to their impressive performance across various +textual tasks. However, the potential of LLMs to analyze physiological +time-series data remains an emerging research field. Particularly, there is a +notable gap in the utilization of LLMs for analyzing wearable biosignals to +achieve cuffless blood pressure (BP) measurement, which is critical for the +management of cardiovascular diseases. This paper presents the first work to +explore the capacity of LLMs to perform cuffless BP estimation based on +wearable biosignals. We extracted physiological features from electrocardiogram +(ECG) and photoplethysmogram (PPG) signals and designed context-enhanced +prompts by combining these features with BP domain knowledge and user +information. Subsequently, we adapted LLMs to BP estimation tasks through +fine-tuning. To evaluate the proposed approach, we conducted assessments of ten +advanced LLMs using a comprehensive public dataset of wearable biosignals from +1,272 participants. The experimental results demonstrate that the optimally +fine-tuned LLM significantly surpasses conventional task-specific baselines, +achieving an estimation error of 0.00 $\pm$ 9.25 mmHg for systolic BP and 1.29 +$\pm$ 6.37 mmHg for diastolic BP. Notably, the ablation studies highlight the +benefits of our context enhancement strategy, leading to an 8.9% reduction in +mean absolute error for systolic BP estimation. This paper pioneers the +exploration of LLMs for cuffless BP measurement, providing a potential solution +to enhance the accuracy of cuffless BP measurement. + +
+
+
+
+
+ + ♻ ☆ DELL: Generating Reactions and Explanations for LLM-Based Misinformation + Detection + + +
+ Large language models are limited by challenges in factuality and +hallucinations to be directly employed off-the-shelf for judging the veracity +of news articles, where factual accuracy is paramount. In this work, we propose +DELL that identifies three key stages in misinformation detection where LLMs +could be incorporated as part of the pipeline: 1) LLMs could \emph{generate +news reactions} to represent diverse perspectives and simulate user-news +interaction networks; 2) LLMs could \emph{generate explanations} for proxy +tasks (e.g., sentiment, stance) to enrich the contexts of news articles and +produce experts specializing in various aspects of news understanding; 3) LLMs +could \emph{merge task-specific experts} and provide an overall prediction by +incorporating the predictions and confidence scores of varying experts. +Extensive experiments on seven datasets with three LLMs demonstrate that DELL +outperforms state-of-the-art baselines by up to 16.8\% in macro f1-score. +Further analysis reveals that the generated reactions and explanations are +greatly helpful in misinformation detection, while our proposed LLM-guided +expert merging helps produce better-calibrated predictions. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 123 + +
+
+
+ + ☆ LaRa: Efficient Large-Baseline Radiance Fields + + +
+ Radiance field methods have achieved photorealistic novel view synthesis and +geometry reconstruction. However, they are mostly applied in per-scene optimization +or small-baseline settings. While several recent works investigate feed-forward +reconstruction with large baselines by utilizing transformers, they all operate +with a standard global attention mechanism and hence ignore the local nature of +3D reconstruction. We propose a method that unifies local and global reasoning +in transformer layers, resulting in improved quality and faster convergence. +Our model represents scenes as Gaussian Volumes and combines this with an image +encoder and Group Attention Layers for efficient feed-forward reconstruction. +Experimental results demonstrate that our model, trained for two days on four +GPUs, achieves high fidelity in reconstructing 360° radiance fields, and +robustness to zero-shot and out-of-domain testing.
+
+
+
+
+ + ☆ VCoME: Verbal Video Composition with Multimodal Editing Effects + + +
+ Verbal videos, featuring voice-overs or text overlays, provide valuable +content but present significant challenges in composition, especially when +incorporating editing effects to enhance clarity and visual appeal. In this +paper, we introduce the novel task of verbal video composition with editing +effects. This task aims to generate coherent and visually appealing verbal +videos by integrating multimodal editing effects across textual, visual, and +audio categories. To achieve this, we curate a large-scale dataset of video +effects compositions from publicly available sources. We then formulate this +task as a generative problem, involving the identification of appropriate +positions in the verbal content and the recommendation of editing effects for +these positions. To address this task, we propose VCoME, a general framework +that employs a large multimodal model to generate editing effects for video +composition. Specifically, VCoME takes in the multimodal video context and +autoregressively outputs where to apply effects within the verbal content and +which effects are most appropriate for each position. VCoME also supports +prompt-based control of composition density and style, providing substantial +flexibility for diverse applications. Through extensive quantitative and +qualitative evaluations, we clearly demonstrate the effectiveness of VCoME. A +comprehensive user study shows that our method produces videos of professional +quality while being 85$\times$ more efficient than professional editors. + +
+
+
+
+
+ + ☆ RAM: Retrieval-Based Affordance Transfer for Generalizable Zero-Shot + Robotic Manipulation + + +
+ This work proposes a retrieve-and-transfer framework for zero-shot robotic +manipulation, dubbed RAM, featuring generalizability across various objects, +environments, and embodiments. Unlike existing approaches that learn +manipulation from expensive in-domain demonstrations, RAM capitalizes on a +retrieval-based affordance transfer paradigm to acquire versatile manipulation +capabilities from abundant out-of-domain data. First, RAM extracts unified +affordance at scale from diverse sources of demonstrations including robotic +data, human-object interaction (HOI) data, and custom data to construct a +comprehensive affordance memory. Then given a language instruction, RAM +hierarchically retrieves the most similar demonstration from the affordance +memory and transfers such out-of-domain 2D affordance to in-domain 3D +executable affordance in a zero-shot and embodiment-agnostic manner. Extensive +simulation and real-world evaluations demonstrate that our RAM consistently +outperforms existing works in diverse daily tasks. Additionally, RAM shows +significant potential for downstream applications such as automatic and +efficient data collection, one-shot visual imitation, and LLM/VLM-integrated +long-horizon manipulation. For more details, please check our website at +https://yxkryptonite.github.io/RAM/. + +
+
+
+
+
+ + ☆ Enhancing Vehicle Re-identification and Matching for Weaving Analysis + + +
+ Vehicle weaving on highways contributes to traffic congestion, raises safety +issues, and underscores the need for sophisticated traffic management systems. +Current tools are inadequate in offering precise and comprehensive data on +lane-specific weaving patterns. This paper introduces an innovative method for +collecting non-overlapping video data in weaving zones, enabling the generation +of quantitative insights into lane-specific weaving behaviors. Our experimental +results confirm the efficacy of this approach, delivering critical data that +can assist transportation authorities in enhancing traffic control and roadway +infrastructure. + +
+
+
+
+
+ + ☆ Embracing Massive Medical Data MICCAI 2024 + + +
+ As massive medical data become available with an increasing number of scans, +expanding classes, and varying sources, prevalent training paradigms -- where +AI is trained with multiple passes over fixed, finite datasets -- face +significant challenges. First, training AI all at once on such massive data is +impractical as new scans/sources/classes continuously arrive. Second, training +AI continuously on new scans/sources/classes can lead to catastrophic +forgetting, where AI forgets old data as it learns new data, and vice versa. To +address these two challenges, we propose an online learning method that enables +training AI from massive medical data. Instead of repeatedly training AI on +randomly selected data samples, our method identifies the most significant +samples for the current AI model based on their data uniqueness and prediction +uncertainty, then trains the AI on these selective data samples. Compared with +prevalent training paradigms, our method not only improves data efficiency by +enabling training on continual data streams, but also mitigates catastrophic +forgetting by selectively training AI on significant data samples that might +otherwise be forgotten, outperforming by 15% in Dice score for multi-organ and +tumor segmentation. + The code is available at https://github.com/MrGiovanni/OnlineLearning + +
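+ A toy sketch of the sample-selection rule described above, assuming uncertainty is measured by prediction entropy and uniqueness by distance to previously seen embeddings; the score combination and the 20% keep-rate are illustrative choices, not the paper's exact criterion.
```python
# Score each incoming scan by uncertainty plus uniqueness; keep the top fraction.
import numpy as np

rng = np.random.default_rng(0)
stream_features = rng.standard_normal((200, 64))   # embeddings of incoming scans
pred_entropy = rng.random(200)                      # model uncertainty per scan
memory = rng.standard_normal((500, 64))             # embeddings already trained on

# uniqueness: distance to the nearest previously seen sample
dists = np.linalg.norm(stream_features[:, None] - memory[None], axis=-1).min(axis=1)
score = pred_entropy + dists / dists.max()

keep = np.argsort(score)[-int(0.2 * len(score)):]   # most informative 20%
print(f"selected {len(keep)} of {len(score)} incoming samples")
```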
+
+ comment: Accepted to MICCAI 2024 +
+
+
+
+
+ + ☆ Efficient Betti Matching Enables Topology-Aware 3D Segmentation via + Persistent Homology + + +
+ In this work, we propose an efficient algorithm for the calculation of the +Betti matching, which can be used as a loss function to train topology aware +segmentation networks. Betti matching loss builds on techniques from +topological data analysis, specifically persistent homology. A major challenge +is the computational cost of computing persistence barcodes. In response to +this challenge, we propose a new, highly optimized implementation of Betti +matching, implemented in C++ together with a python interface, which achieves +significant speedups compared to the state-of-the-art implementation Cubical +Ripser. We use Betti matching 3D to train segmentation networks with the Betti +matching loss and demonstrate improved topological correctness of predicted +segmentations across several datasets. The source code is available at +https://github.com/nstucki/Betti-Matching-3D. + +
+
+
+
+
+ + ☆ Rethinking Visual Prompting for Multimodal Large Language Models with + External Knowledge + + +
+ In recent years, multimodal large language models (MLLMs) have made +significant strides by training on vast high-quality image-text datasets, +enabling them to generally understand images well. However, the inherent +difficulty in explicitly conveying fine-grained or spatially dense information +in text, such as masks, poses a challenge for MLLMs, limiting their ability to +answer questions requiring an understanding of detailed or localized visual +elements. Drawing inspiration from the Retrieval-Augmented Generation (RAG) +concept, this paper proposes a new visual prompt approach to integrate +fine-grained external knowledge, gleaned from specialized vision models (e.g., +instance segmentation/OCR models), into MLLMs. This is a promising yet +underexplored direction for enhancing MLLMs' performance. Our approach diverges +from concurrent works, which transform external knowledge into additional text +prompts, necessitating the model to indirectly learn the correspondence between +visual content and text coordinates. Instead, we propose embedding fine-grained +knowledge information directly into a spatial embedding map as a visual prompt. +This design can be effortlessly incorporated into various MLLMs, such as LLaVA +and Mipha, considerably improving their visual understanding performance. +Through rigorous experiments, we demonstrate that our method can enhance MLLM +performance across nine benchmarks, amplifying their fine-grained context-aware +capabilities. + +
+
+
+
+
+ + ☆ Is plantar thermography a valid digital biomarker for characterising + diabetic foot ulceration risk? + + +
+ Background: In the absence of prospective data on diabetic foot ulcers (DFU), +cross-sectional associations with causal risk factors (peripheral neuropathy +and peripheral arterial disease (PAD)) could be used to establish the validity +of plantar thermography for DFU risk stratification. + Methods: First, we investigated the associations between the intrinsic +clusters of plantar thermographic images and several DFU risk factors using an +unsupervised deep-learning framework. We then studied associations between +obtained thermography clusters and DFU risk factors. Second, to identify those +associations with predictive power, we used supervised learning to train +Convolutional Neural Network (CNN) regression/classification models that +predicted the risk factor based on the thermograph (and visual) input. + Findings: Our dataset comprised 282 thermographs from type 2 diabetes +mellitus patients (aged 56.31 ± 9.18 years, 51.42% males). On clustering, we +found two overlapping clusters (silhouette score = 0.10, indicating weak +separation). There was strong evidence for associations between assigned +clusters and several factors related to diabetic foot ulceration such as +peripheral neuropathy, PAD, number of diabetes complications, and composite DFU +risk prediction scores such as Martins-Mendes, PODUS-2020, and SIGN. However, +models predicting said risk factors had poor performance. + Interpretation: The strong associations between intrinsic thermography +clusters and several DFU risk factors support the validity of using +thermography for characterising DFU risk. However, obtained associations did +not prove to be predictive, likely due to spectrum bias, or because +thermography and classical risk factors characterise incompletely overlapping +portions of the DFU risk construct. Our findings highlight the challenges in +standardising ground truths when defining novel digital biomarkers.
+
+ comment: 13 pages, 2 Figures, 1 Table. Supplementary files and link to code to + be uploaded +
+
+
+
+
+ + ☆ Unsupervised 4D Cardiac Motion Tracking with Spatiotemporal Optical Flow + Networks + + +
+ Cardiac motion tracking from echocardiography can be used to estimate and +quantify myocardial motion within a cardiac cycle. It is a cost-efficient and +effective approach for assessing myocardial function. However, ultrasound +imaging has the inherent characteristics of spatially low resolution and +temporally random noise, which leads to difficulties in obtaining reliable +annotation. Thus, it is difficult to perform supervised learning for motion +tracking. In addition, there is no end-to-end unsupervised method currently in +the literature. This paper presents a motion tracking method where unsupervised +optical flow networks are designed with spatial reconstruction loss and +temporal-consistency loss. Our proposed loss functions make use of the +pair-wise and temporal correlation to estimate cardiac motion from a noisy +background. Experiments using a synthetic 4D echocardiography dataset have shown +the effectiveness of our approach, and its superiority over existing methods in +both accuracy and running speed. To the best of our knowledge, this is the +first work that uses an unsupervised end-to-end deep learning optical +flow network for 4D cardiac motion tracking.
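+ The two unsupervised loss terms named above can be sketched on toy tensors: warp a frame with the predicted flow and compare it with its neighbour (spatial reconstruction), and penalise disagreement between flows of consecutive pairs (temporal consistency). The warping scheme, L1 penalties, and loss weight below are assumptions for illustration, not the paper's exact formulation.
```python
import torch
import torch.nn.functional as F

def warp(img, flow):
    # img: (B,1,H,W), flow: (B,2,H,W) in pixels (x, y)
    b, _, h, w = img.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    grid = torch.stack((xs, ys), dim=0).float().expand(b, -1, -1, -1) + flow
    grid = torch.stack((2 * grid[:, 0] / (w - 1) - 1,
                        2 * grid[:, 1] / (h - 1) - 1), dim=-1)
    return F.grid_sample(img, grid, align_corners=True)

frame0, frame1, frame2 = (torch.rand(1, 1, 64, 64) for _ in range(3))
flow01, flow12 = (torch.zeros(1, 2, 64, 64) for _ in range(2))  # placeholder flows

recon_loss = F.l1_loss(warp(frame0, flow01), frame1) + F.l1_loss(warp(frame1, flow12), frame2)
temporal_loss = F.l1_loss(flow01, flow12)   # neighbouring flows should agree smoothly
loss = recon_loss + 0.1 * temporal_loss     # assumed weighting
print(float(loss))
```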
+
+
+
+
+ + ☆ SAM Fewshot Finetuning for Anatomical Segmentation in Medical Images + + +
+ We propose a straightforward yet highly effective few-shot fine-tuning +strategy for adapting the Segment Anything (SAM) to anatomical segmentation +tasks in medical images. Our novel approach revolves around reformulating the +mask decoder within SAM, leveraging few-shot embeddings derived from a limited +set of labeled images (few-shot collection) as prompts for querying anatomical +objects captured in image embeddings. This innovative reformulation greatly +reduces the need for time-consuming online user interactions for labeling +volumetric images, such as exhaustively marking points and bounding boxes to +provide prompts slice by slice. With our method, users can manually segment a +few 2D slices offline, and the embeddings of these annotated image regions +serve as effective prompts for online segmentation tasks. Our method +prioritizes the efficiency of the fine-tuning process by exclusively training +the mask decoder through caching mechanisms while keeping the image encoder +frozen. Importantly, this approach is not limited to volumetric medical images, +but can generically be applied to any 2D/3D segmentation task. To thoroughly +evaluate our method, we conducted extensive validation on four datasets, +covering six anatomical segmentation tasks across two modalities. Furthermore, +we conducted a comparative analysis of different prompting options within SAM +and the fully-supervised nnU-Net. The results demonstrate the superior +performance of our method compared to SAM employing only point prompts +(approximately 50% improvement in IoU) and performs on-par with fully +supervised methods whilst reducing the requirement of labeled data by at least +an order of magnitude. + +
+
+ comment: 9 pages, Proceedings of the IEEE/CVF Winter Conference on + Applications of Computer Vision. 2024 +
+
+
+
+
+ + ☆ Semi-Supervised Segmentation via Embedding Matching + + +
+ Deep convolutional neural networks are widely used in medical image +segmentation but require many labeled images for training. Annotating +three-dimensional medical images is a time-consuming and costly process. To +overcome this limitation, we propose a novel semi-supervised segmentation +method that leverages mostly unlabeled images and a small set of labeled images +in training. Our approach involves assessing prediction uncertainty to identify +reliable predictions on unlabeled voxels from the teacher model. These voxels +serve as pseudo-labels for training the student model. In voxels where the +teacher model produces unreliable predictions, pseudo-labeling is carried out +based on voxel-wise embedding correspondence using reference voxels from +labeled images. We applied this method to automate hip bone segmentation in CT +images, achieving notable results with just 4 CT scans. The proposed approach +yielded a Hausdorff distance with 95th percentile (HD95) of 3.30 and IoU of +0.929, surpassing existing methods achieving HD95 (4.07) and IoU (0.927) at +their best. + +
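+ A compact sketch of the pseudo-labelling rule described above, on random tensors: keep the teacher's label where it is confident, otherwise copy the label of the nearest labelled reference voxel in embedding space. The confidence threshold and distance metric are assumptions.
```python
import torch

n_unlabeled, n_reference, d = 100, 50, 16
teacher_probs = torch.rand(n_unlabeled)              # teacher foreground probability
unlabeled_emb = torch.randn(n_unlabeled, d)
reference_emb = torch.randn(n_reference, d)          # embeddings of labeled voxels
reference_lbl = torch.randint(0, 2, (n_reference,))

confident = (teacher_probs > 0.9) | (teacher_probs < 0.1)   # assumed threshold
pseudo = (teacher_probs > 0.5).long()

# for unreliable voxels, copy the label of the closest reference embedding
dist = torch.cdist(unlabeled_emb[~confident], reference_emb)
pseudo[~confident] = reference_lbl[dist.argmin(dim=1)]
print(pseudo.shape, confident.float().mean())
```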
+
+ comment: 13 pages, MIDL2024 oral +
+
+
+
+
+ + ☆ OneRestore: A Universal Restoration Framework for Composite Degradation + + +
+ In real-world scenarios, image impairments often manifest as composite +degradations, presenting a complex interplay of elements such as low light, +haze, rain, and snow. Despite this reality, existing restoration methods +typically target isolated degradation types, thereby falling short in +environments where multiple degrading factors coexist. To bridge this gap, our +study proposes a versatile imaging model that consolidates four physical +corruption paradigms to accurately represent complex, composite degradation +scenarios. In this context, we propose OneRestore, a novel transformer-based +framework designed for adaptive, controllable scene restoration. The proposed +framework leverages a unique cross-attention mechanism, merging degraded scene +descriptors with image features, allowing for nuanced restoration. Our model +allows versatile input scene descriptors, ranging from manual text embeddings +to automatic extractions based on visual attributes. Our methodology is further +enhanced through a composite degradation restoration loss, using extra degraded +images as negative samples to fortify model constraints. Comparative results on +synthetic and real-world datasets demonstrate OneRestore as a superior +solution, significantly advancing the state-of-the-art in addressing complex, +composite degradations. + +
+
+
+
+
+ + ☆ CountGD: Multi-Modal Open-World Counting + + +
+ The goal of this paper is to improve the generality and accuracy of +open-vocabulary object counting in images. To improve the generality, we +repurpose an open-vocabulary detection foundation model (GroundingDINO) for the +counting task, and also extend its capabilities by introducing modules to +enable specifying the target object to count by visual exemplars. In turn, +these new capabilities - being able to specify the target object by +multi-modalities (text and exemplars) - lead to an improvement in counting +accuracy. + We make three contributions: First, we introduce the first open-world +counting model, CountGD, where the prompt can be specified by a text +description or visual exemplars or both; Second, we show that the performance +of the model significantly improves the state of the art on multiple counting +benchmarks - when using text only, CountGD is comparable to or outperforms all +previous text-only works, and when using both text and visual exemplars, we +outperform all previous models; Third, we carry out a preliminary study into +different interactions between the text and visual exemplar prompts, including +the cases where they reinforce each other and where one restricts the other. +The code and an app to test the model are available at +https://www.robots.ox.ac.uk/~vgg/research/countgd/.
+
+
+
+
+ + ☆ Isomorphic Pruning for Vision Models + + +
+ Structured pruning reduces the computational overhead of deep neural networks +by removing redundant sub-structures. However, assessing the relative +importance of different sub-structures remains a significant challenge, +particularly in advanced vision models featuring novel mechanisms and +architectures like self-attention, depth-wise convolutions, or residual +connections. These heterogeneous substructures usually exhibit diverged +parameter scales, weight distributions, and computational topology, introducing +considerable difficulty to importance comparison. To overcome this, we present +Isomorphic Pruning, a simple approach that demonstrates effectiveness across a +range of network architectures such as Vision Transformers and CNNs, and +delivers competitive performance across different model sizes. Isomorphic +Pruning originates from an observation that, when evaluated under a pre-defined +importance criterion, heterogeneous sub-structures demonstrate significant +divergence in their importance distribution, as opposed to isomorphic +structures that present similar importance patterns. This inspires us to +perform isolated ranking and comparison on different types of sub-structures +for more reliable pruning. Our empirical results on ImageNet-1K demonstrate +that Isomorphic Pruning surpasses several pruning baselines dedicatedly +designed for Transformers or CNNs. For instance, we improve the accuracy of +DeiT-Tiny from 74.52% to 77.50% by pruning an off-the-shelf DeiT-Base model. +And for ConvNext-Tiny, we enhanced performance from 82.06% to 82.18%, while +reducing the number of parameters and memory usage. Code is available at +\url{https://github.com/VainF/Isomorphic-Pruning}. + +
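+ The core idea, ranking pruning candidates only against structures of the same type instead of in one global list, can be sketched as follows; the structure types, importance scores, and pruning ratio are toy values, not the paper's scoring function.
```python
# Rank candidates within each "isomorphic" group so different importance scales
# (e.g., attention heads vs. MLP channels) are never compared directly.
import numpy as np

rng = np.random.default_rng(0)
candidates = (
    [("attention_head", f"head_{i}", rng.random()) for i in range(8)]
    + [("mlp_channel", f"ch_{i}", 10 * rng.random()) for i in range(8)]  # larger scale
)

prune_ratio = 0.25
to_prune = []
for kind in {"attention_head", "mlp_channel"}:
    group = [c for c in candidates if c[0] == kind]
    group.sort(key=lambda c: c[2])                       # least important first
    to_prune += group[: int(prune_ratio * len(group))]   # prune within the group only

print([name for _, name, _ in to_prune])
```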
+
+
+
+
+ + ☆ PartCraft: Crafting Creative Objects by Parts ECCV 2024 + + +
+ This paper propels creative control in generative visual AI by allowing users +to "select". Departing from traditional text or sketch-based methods, we for +the first time allow users to choose visual concepts by parts for their +creative endeavors. The outcome is fine-grained generation that precisely +captures selected visual concepts, ensuring a holistically faithful and +plausible result. To achieve this, we first parse objects into parts through +unsupervised feature clustering. Then, we encode parts into text tokens and +introduce an entropy-based normalized attention loss that operates on them. +This loss design enables our model to learn generic prior topology knowledge +about object's part composition, and further generalize to novel part +compositions to ensure the generation looks holistically faithful. Lastly, we +employ a bottleneck encoder to project the part tokens. This not only enhances +fidelity but also accelerates learning, by leveraging shared knowledge and +facilitating information exchange among instances. Visual results in the paper +and supplementary material showcase the compelling power of PartCraft in +crafting highly customized, innovative creations, exemplified by the "charming" +and creative birds. Code is released at https://github.com/kamwoh/partcraft. + +
+
+ comment: ECCV 2024. arXiv admin note: substantial text overlap with + arXiv:2311.15477 +
+
+
+
+
+ + ☆ AWT: Transferring Vision-Language Models via Augmentation, Weighting, + and Transportation + + +
+ Pre-trained vision-language models (VLMs) have shown impressive results in +various visual classification tasks. However, we often fail to fully unleash +their potential when adapting them for new concept understanding due to limited +information on new classes. To address this limitation, we introduce a novel +adaptation framework, AWT (Augment, Weight, then Transport). AWT comprises +three key components: augmenting inputs with diverse visual perspectives and +enriched class descriptions through image transformations and language models; +dynamically weighting inputs based on the prediction entropy; and employing +optimal transport to mine semantic correlations in the vision-language space. +AWT can be seamlessly integrated into various VLMs, enhancing their zero-shot +capabilities without additional training and facilitating few-shot learning +through an integrated multimodal adapter module. We verify AWT in multiple +challenging scenarios, including zero-shot and few-shot image classification, +zero-shot video action recognition, and out-of-distribution generalization. AWT +consistently outperforms the state-of-the-art methods in each setting. In +addition, our extensive studies further demonstrate AWT's effectiveness and +adaptability across different VLMs, architectures, and scales. + +
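+ One of the three AWT steps, weighting augmented views by their prediction entropy, can be sketched as below; the softmax-of-negative-entropy weighting is an illustrative choice rather than the paper's exact formula.
```python
import torch

views_logits = torch.randn(8, 100)               # 8 augmented views, 100 classes
probs = views_logits.softmax(dim=-1)
entropy = -(probs * probs.clamp_min(1e-9).log()).sum(dim=-1)

weights = (-entropy).softmax(dim=0)               # confident views receive more weight
pooled = (weights[:, None] * probs).sum(dim=0)    # weighted class distribution
print(weights.shape, pooled.sum())                # torch.Size([8]), ~1.0
```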
+
+
+
+
+ + ☆ Feature Attenuation of Defective Representation Can Resolve Incomplete + Masking on Anomaly Detection + + +
+ In unsupervised anomaly detection (UAD) research, while state-of-the-art +models have reached a saturation point with extensive studies on public +benchmark datasets, they adopt large-scale tailor-made neural networks (NN) for +detection performance or pursue unified models for various tasks. Towards edge +computing, it is necessary to develop a computationally efficient and scalable +solution that avoids large-scale complex NNs. Motivated by this, we aim to +optimize the UAD performance with minimal changes to NN settings. Thus, we +revisit the reconstruction-by-inpainting approach and rethink how to improve it by +analyzing its strengths and weaknesses. The strength of the SOTA methods is a +single deterministic masking approach that addresses the challenges of random +multiple masking, namely inference latency and output inconsistency. +Nevertheless, the failure to provide a mask that completely covers +anomalous regions remains a weakness. To mitigate this issue, we propose +Feature Attenuation of Defective Representation (FADeR), which employs only two +MLP layers to attenuate feature information of anomaly reconstruction +during decoding. By leveraging FADeR, features of unseen anomaly patterns are +reconstructed into seen normal patterns, reducing false alarms. Experimental +results demonstrate that FADeR achieves enhanced performance compared to +similar-scale NNs. Furthermore, our approach exhibits scalability in +performance enhancement when integrated with other single deterministic masking +methods in a plug-and-play manner.
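+ A rough sketch of a two-layer MLP that rescales decoder features, in the spirit of the module described above; the hidden width and the sigmoid gating are assumptions, not the paper's exact design.
```python
import torch
import torch.nn as nn

class FeatureAttenuator(nn.Module):
    def __init__(self, channels: int, hidden: int = 64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, hidden),
            nn.ReLU(),
            nn.Linear(hidden, channels),
            nn.Sigmoid(),           # per-channel attenuation weights in (0, 1)
        )

    def forward(self, feat: torch.Tensor) -> torch.Tensor:
        # feat: (batch, channels); damp feature information during decoding
        return feat * self.mlp(feat)

x = torch.randn(4, 128)
print(FeatureAttenuator(128)(x).shape)  # torch.Size([4, 128])
```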
+
+ comment: 11 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ Smell and Emotion: Recognising emotions in smell-related artworks + + +
+ Emotions and smell are underrepresented in digital art history. In this +exploratory work, we show that recognising emotions from smell-related artworks +is technically feasible but has room for improvement. Using style transfer and +hyperparameter optimization, we achieve a minor performance boost and open up +the field for future extensions.
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ SH17: A Dataset for Human Safety and Personal Protective Equipment + Detection in Manufacturing Industry + + +
+ Workplace accidents continue to pose significant risks for human safety, +particularly in industries such as construction and manufacturing, and the +necessity for effective Personal Protective Equipment (PPE) compliance has +become increasingly paramount. Our research focuses on the development of +non-invasive techniques based on Object Detection (OD) and Convolutional +Neural Networks (CNNs) to detect and verify the proper use of various types of +PPE such as helmets, safety glasses, masks, and protective clothing. This study +proposes the SH17 Dataset, consisting of 8,099 annotated images containing +75,994 instances of 17 classes collected from diverse industrial environments, +to train and validate the OD models. We have trained state-of-the-art OD models +for benchmarking, and initial results demonstrate promising accuracy levels +with the You Only Look Once (YOLO) v9-e model variant exceeding 70.9% in PPE +detection. The performance of the model validation on cross-domain datasets +suggests that integrating these technologies can significantly improve safety +management systems, providing a scalable and efficient solution for industries +striving to meet human safety regulations and protect their workforce. The +dataset is available at https://github.com/ahmadmughees/sh17dataset.
+
+
+
+
+ + ☆ Multimodal Classification via Modal-Aware Interactive Enhancement + + +
+ Due to the notorious modality imbalance problem, multimodal learning (MML) +leads to the phenomenon of optimization imbalance, thus struggling to achieve +satisfactory performance. Recently, some representative methods have been +proposed to boost the performance, mainly focusing on adaptively adjusting the +optimization of each modality to rebalance the learning speed of dominant and +non-dominant modalities. To better facilitate the interaction of model +information in multimodal learning, in this paper, we propose a novel +multimodal learning method, called modal-aware interactive enhancement (MIE). +Specifically, we first utilize an optimization strategy based on sharpness-aware +minimization (SAM) to smooth the learning objective during the forward +phase. Then, with the help of the geometry property of SAM, we propose a +gradient modification strategy to impose the influence between different +modalities during the backward phase. Therefore, we can improve the +generalization ability and alleviate the modality forgetting phenomenon +simultaneously for multimodal learning. Extensive experiments on widely used +datasets demonstrate that our proposed method can outperform various +state-of-the-art baselines to achieve the best performance.
+
+
+
+
+ + ☆ Real Time Emotion Analysis Using Deep Learning for Education, + Entertainment, and Beyond + + +
+ The significance of emotion detection is increasing in education, +entertainment, and various other domains. We are developing a system that can +identify and transform facial expressions into emojis to provide immediate +feedback. The project consists of two components. Initially, we will employ +sophisticated image processing techniques and neural networks to construct a +deep learning model capable of precisely categorising facial expressions. Next, +we will develop a basic application that records live video using the camera on +your device. The app will utilise a sophisticated model to promptly analyse +facial expressions and exhibit the corresponding emojis. Our objective is +to develop a dynamic tool that integrates deep learning and real-time video +processing for the purposes of online education, virtual events, gaming, and +enhancing user experience. This tool enhances interactions and introduces novel +emotional intelligence technologies.
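+ A minimal sketch of the planned pipeline: grab webcam frames, classify the expression, and show the matching emoji. The classifier below is a placeholder that always returns "neutral", standing in for the trained CNN, and a real app would composite an emoji image since OpenCV's built-in fonts cannot draw emoji glyphs.
```python
import cv2

# label -> emoji the app would overlay
EMOJI = {"happy": "😀", "sad": "😢", "angry": "😠", "surprised": "😮", "neutral": "😐"}

def classify_expression(frame) -> str:
    """Placeholder for the trained deep-learning expression classifier."""
    return "neutral"

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ok, frame = cap.read()
    if not ok:
        break
    label = classify_expression(frame)
    print(f"{label} {EMOJI[label]}", end="\r")          # emoji shown in the console
    cv2.putText(frame, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 3)
    cv2.imshow("emotion", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()
```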
+
+ comment: 8 pages, 23 figures +
+
+
+
+
+ + ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ☆ Gaussian Eigen Models for Human Heads + + +
+ We present personalized Gaussian Eigen Models (GEMs) for human heads, a novel +method that compresses dynamic 3D Gaussians into low-dimensional linear spaces. +Our approach is inspired by the seminal work of Blanz and Vetter, where a +mesh-based 3D morphable model (3DMM) is constructed from registered meshes. +Based on dynamic 3D Gaussians, we create a lower-dimensional representation of +primitives that applies to most 3DGS head avatars. Specifically, we propose a +universal method to distill the appearance of a mesh-controlled UNet Gaussian +avatar using an ensemble of linear eigenbasis. We replace heavy CNN-based +architectures with a single linear layer improving speed and enabling a range +of real-time downstream applications. To create a particular facial expression, +one simply needs to perform a dot product between the eigen coefficients and +the distilled basis. This efficient method removes the requirement for an input +mesh during testing, enhancing simplicity and speed in expression generation. +This process is highly efficient and supports real-time rendering on everyday +devices, leveraging the effectiveness of standard Gaussian Splatting. In +addition, we demonstrate how the GEM can be controlled using a ResNet-based +regression architecture. We show and compare self-reenactment and cross-person +reenactment to state-of-the-art 3D avatar methods, demonstrating higher quality +and better control. A real-time demo showcases the applicability of the GEM +representation. + +
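+ The linear eigen-model idea, an expression reconstructed as a mean plus a coefficient-weighted sum of basis vectors, reduces to a single matrix product; the NumPy sketch below uses random placeholder data and assumed dimensions, not the distilled basis from the paper.
```python
import numpy as np

n_gaussian_params = 5000     # flattened 3D Gaussian parameters (assumed size)
n_components = 32            # size of the low-dimensional eigen space

rng = np.random.default_rng(0)
mean = rng.standard_normal(n_gaussian_params)
basis = rng.standard_normal((n_components, n_gaussian_params))   # distilled eigenbasis

def decode(coeffs: np.ndarray) -> np.ndarray:
    """A facial expression is just coefficients @ basis + mean (one product)."""
    return mean + coeffs @ basis

expression = decode(rng.standard_normal(n_components))
print(expression.shape)  # (5000,)
```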
+
+ comment: https://zielon.github.io/gem/ +
+
+
+
+
+ + ☆ Rethinking Image Compression on the Web with Generative AI + + +
+ The rapid growth of the Internet, driven by social media, web browsing, and +video streaming, has made images central to the Web experience, resulting in +significant data transfer and increased webpage sizes. Traditional image +compression methods, while reducing bandwidth, often degrade image quality. +This paper explores a novel approach using generative AI to reconstruct images +at the edge or client-side. We develop a framework that leverages text prompts +and provides additional conditioning inputs like Canny edges and color palettes +to a text-to-image model, achieving up to 99.8% bandwidth savings in the best +cases and 92.6% on average, while maintaining high perceptual similarity. +Empirical analysis and a user study show that our method preserves image +meaning and structure more effectively than traditional compression methods, +offering a promising solution for reducing bandwidth usage and improving +Internet affordability with minimal degradation in image quality. + +
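+ A sketch of the sender-side conditioning described above: rather than the full image, a text prompt plus compact hints such as a Canny edge map and a small colour palette would be transmitted for a text-to-image model to reconstruct from. The Canny thresholds, palette size, and use of OpenCV's k-means are assumptions for illustration.
```python
import cv2
import numpy as np

img = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)   # stand-in for a photo
edges = cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 100, 200)

# tiny colour palette via k-means over the pixels
pixels = img.reshape(-1, 3).astype(np.float32)
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
_, labels, centers = cv2.kmeans(pixels, 8, None, criteria, 3, cv2.KMEANS_PP_CENTERS)
palette = centers.astype(np.uint8)

# the edge map and the 8-colour palette are the compact hints that get sent
print(edges.shape, palette)
```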
+
+
+
+
+ + ☆ PDiscoFormer: Relaxing Part Discovery Constraints with Vision + Transformers ECCV + + +
+ Computer vision methods that explicitly detect object parts and reason on +them are a step towards inherently interpretable models. Existing approaches +that perform part discovery driven by a fine-grained classification task make +very restrictive assumptions on the geometric properties of the discovered +parts; they should be small and compact. Although this prior is useful in some +cases, in this paper we show that pre-trained transformer-based vision models, +such as self-supervised DINOv2 ViT, enable the relaxation of these constraints. +In particular, we find that a total variation (TV) prior, which allows for +multiple connected components of any size, substantially outperforms previous +work. We test our approach on three fine-grained classification benchmarks: +CUB, PartImageNet and Oxford Flowers, and compare our results to previously +published methods as well as a re-implementation of the state-of-the-art method +PDiscoNet with a transformer-based backbone. We consistently obtain substantial +improvements across the board, both on part discovery metrics and the +downstream classification task, showing that the strong inductive biases in +self-supervised ViT models require rethinking the geometric priors that can be +used for unsupervised part discovery.
+
+ comment: Accepted as a main conference paper at the European Conference of + Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ Success or Failure? Analyzing Segmentation Refinement with Few-Shot + Segmentation + + +
+ The purpose of segmentation refinement is to enhance the initial coarse masks +generated by segmentation algorithms. The refined masks are expected to capture +the details and contours of the target objects. Research on segmentation +refinement has developed as a response to the need for high-quality initial +masks. However, to our knowledge, no method has been developed that can +determine the success of segmentation refinement. Such a method could ensure +the reliability of segmentation in applications where the outcome of the +segmentation is important, and foster innovation in image processing +technologies. To address this research gap, we propose JFS (Judging From +Support-set), a method to identify the success of segmentation refinement +leveraging a few-shot segmentation (FSS) model. The traditional goal of the +problem in FSS is to find a target object in a query image utilizing target +information given by a support set. However, in our proposed method, we use the +FSS network in a novel way to assess the segmentation refinement. When there +are two masks, a coarse mask and a refined mask from segmentation refinement, +these two masks become support masks. The existing support mask works as a +ground truth mask to judge whether the refined segmentation is +more accurate than the coarse mask. We first obtained a coarse mask and refined +it using SEPL (SAM Enhanced Pseudo-Labels) to get the two masks. Then, these +become input to the FSS model to judge whether the post-processing was successful. +JFS is evaluated on the best and worst cases from SEPL to validate its +effectiveness. The results showed that JFS can determine whether SEPL is a +success or not.
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ LayerShuffle: Enhancing Robustness in Vision Transformers by Randomizing + Layer Execution Order + + +
+ Due to their architecture and how they are trained, artificial neural +networks are typically not robust toward pruning, replacing, or shuffling +layers at test time. However, such properties would be desirable for different +applications, such as distributed neural network architectures where the order +of execution cannot be guaranteed or parts of the network can fail during +inference. In this work, we address these issues through a number of proposed +training approaches for vision transformers whose most important component is +randomizing the execution order of attention modules at training time. We show +that with our proposed approaches, vision transformers are indeed capable of +adapting to arbitrary layer execution orders at test time, assuming one tolerates a +reduction (about 20\%) in accuracy at the same model size. We also find that +our trained models can be randomly merged with each other, resulting in +functional ("Frankenstein") models without loss of performance compared to the +source models. Finally, we layer-prune our models at test time and find that +their performance declines gracefully.
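+ The central training trick, randomizing the execution order of transformer blocks on every forward pass, can be sketched as follows; the toy encoder, block count, and dimensions are illustrative, not the paper's model.
```python
import random
import torch
import torch.nn as nn

class ShuffledEncoder(nn.Module):
    def __init__(self, dim=32, depth=6):
        super().__init__()
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
            for _ in range(depth)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        order = list(range(len(self.blocks)))
        if self.training:
            random.shuffle(order)      # a fresh layer order for every forward pass
        for i in order:
            x = self.blocks[i](x)
        return x

model = ShuffledEncoder()
out = model(torch.randn(2, 10, 32))
print(out.shape)  # torch.Size([2, 10, 32])
```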
+
+
+
+
+ + ☆ Few-Shot Airway-Tree Modeling using Data-Driven Sparse Priors + + +
+ The lack of large annotated datasets in medical imaging is an intrinsic +burden for supervised Deep Learning (DL) segmentation models. Few-shot learning +approaches are cost-effective solutions to transfer pre-trained models using +only limited annotated data. However, such methods can be prone to overfitting +due to limited data diversity especially when segmenting complex, diverse, and +sparse tubular structures like airways. Furthermore, crafting informative image +representations has played a crucial role in medical imaging, enabling +discriminative enhancement of anatomical details. In this paper, we initially +train a data-driven sparsification module to enhance airways efficiently in +lung CT scans. We then incorporate these sparse representations in a standard +supervised segmentation pipeline as a pretraining step to enhance the +performance of the DL models. Results presented on the ATM public challenge +cohort show the effectiveness of using sparse priors in pre-training, leading +to segmentation Dice score increase by 1% to 10% in full-scale and few-shot +learning scenarios, respectively. + +
+
+ comment: Accepted at 21st IEEE International Symposium on Biomedical Imaging + (ISBI) +
+
+
+
+
+ + ☆ Hyperspectral Dataset and Deep Learning methods for Waste from Electric + and Electronic Equipment Identification (WEEE) + + +
+ Hyperspectral imaging, a rapidly evolving field, has witnessed the ascendancy +of deep learning techniques, supplanting classical feature extraction and +classification methods in various applications. However, many researchers +employ arbitrary architectures for hyperspectral image processing, often +without rigorous analysis of the interplay between spectral and spatial +information. This oversight neglects the implications of combining these two +modalities on model performance. + In this paper, we evaluate the performance of diverse deep learning +architectures for hyperspectral image segmentation. Our analysis disentangles +the impact of different architectures, spanning various spectral and spatial +granularities. Specifically, we investigate the effects of spectral resolution +(capturing spectral information) and spatial texture (conveying spatial +details) on segmentation outcomes. Additionally, we explore the transferability +of knowledge from large pre-trained image foundation models, originally +designed for RGB images, to the hyperspectral domain. + Results show that incorporating spatial information alongside spectral data +leads to improved segmentation results, and that it is essential to work +further on novel architectures that combine spectral and spatial information and on +the adaptation of RGB foundation models to the hyperspectral domain. + Furthermore, we contribute to the field by cleaning and publicly releasing +the Tecnalia WEEE Hyperspectral dataset. This dataset contains different +non-ferrous fractions of Waste Electrical and Electronic Equipment (WEEE), +including Copper, Brass, Aluminum, Stainless Steel, and White Copper, spanning +the range of 400 to 1000 nm. + We expect these conclusions to guide new researchers in the field of +hyperspectral imaging.
+
+
+
+
+ + ☆ Segment Any 4D Gaussians + + +
+ Modeling, understanding, and reconstructing the real world are crucial in +XR/VR. Recently, 3D Gaussian Splatting (3D-GS) methods have shown remarkable +success in modeling and understanding 3D scenes. Similarly, various 4D +representations have demonstrated the ability to capture the dynamics of the 4D +world. However, there is a dearth of research focusing on segmentation within +4D representations. In this paper, we propose Segment Any 4D Gaussians (SA4D), +one of the first frameworks to segment anything in the 4D digital world based +on 4D Gaussians. In SA4D, an efficient temporal identity feature field is +introduced to handle Gaussian drifting, with the potential to learn precise +identity features from noisy and sparse input. Additionally, a 4D segmentation +refinement process is proposed to remove artifacts. Our SA4D achieves precise, +high-quality segmentation within seconds in 4D Gaussians and shows the ability +to remove, recolor, compose, and render high-quality anything masks. More demos +are available at: https://jsxzs.github.io/sa4d/. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Micro-gesture Online Recognition using Learnable Query Points IJCAI 2024 + + +
+ In this paper, we briefly introduce the solution developed by our team, +HFUT-VUT, for the Micro-gesture Online Recognition track in the MiGA challenge +at IJCAI 2024. The Micro-gesture Online Recognition task involves identifying +the category and locating the start and end times of micro-gestures in video +clips. Compared to the typical Temporal Action Detection task, the +Micro-gesture Online Recognition task focuses more on distinguishing between +micro-gestures and pinpointing the start and end times of actions. Our solution +ranks 2nd in the Micro-gesture Online Recognition track. + +
+
+ comment: Technical Report of HFUT-VUT for the MiGA challenge at IJCAI 2024 +
+
+
+
+
+ + ☆ Dude: Dual Distribution-Aware Context Prompt Learning For Large + Vision-Language Model + + +
+ Prompt learning methods are gaining increasing attention due to their ability +to customize large vision-language models to new domains using pre-trained +contextual knowledge and minimal training data. However, existing works +typically rely on optimizing unified prompt inputs, often struggling with +fine-grained classification tasks due to insufficient discriminative +attributes. To tackle this, we consider a new framework based on a dual context +of both domain-shared and class-specific contexts, where the latter is +generated by Large Language Models (LLMs) such as GPTs. Such dual prompt +methods enhance the model's feature representation by joining implicit and +explicit factors encoded in LLM knowledge. Moreover, we formulate the +Unbalanced Optimal Transport (UOT) theory to quantify the relationships between +constructed prompts and visual tokens. Through partial matching, UOT can +properly align discrete sets of visual tokens and prompt embeddings under +different mass distributions, which is particularly valuable for handling +irrelevant or noisy elements, ensuring that the preservation of mass does not +restrict transport solutions. Furthermore, UOT's characteristics integrate +seamlessly with image augmentation, expanding the training sample pool while +maintaining a reasonable distance between perturbed images and prompt inputs. +Extensive experiments across few-shot classification and adapter settings +substantiate the superiority of our model over current state-of-the-art +baselines. + +
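+
+ The partial matching step can be pictured with a generic entropic unbalanced-OT scaling loop in the style of Sinkhorn with KL-relaxed marginals; the cost matrix, masses, and hyperparameters below are illustrative assumptions, not the paper's implementation:
+
+    import torch
+
+    def unbalanced_sinkhorn(cost, a, b, eps=0.05, tau=1.0, n_iter=100):
+        # cost: (n, m) pairwise cost between visual tokens and prompt embeddings
+        # a, b: reference masses; with unbalanced OT they need not sum to the same total
+        K = torch.exp(-cost / eps)                 # Gibbs kernel
+        u, v = torch.ones_like(a), torch.ones_like(b)
+        rho = tau / (tau + eps)                    # damping induced by the KL marginal penalty
+        for _ in range(n_iter):
+            u = (a / (K @ v + 1e-12)) ** rho
+            v = (b / (K.T @ u + 1e-12)) ** rho
+        return u[:, None] * K * v[None, :]         # soft, partial transport plan
+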
+
+ comment: Version 1 +
+
+
+
+
+ + ☆ Optimizing the image correction pipeline for pedestrian detection in the + thermal-infrared domain + + +
+ Infrared imagery can help in low-visibility situations such as fog and low-light scenarios, but it is prone to thermal noise and requires further processing and correction. This work studies the effect of different infrared processing pipelines on the performance of pedestrian detection in an urban environment, similar to autonomous driving scenarios. Detection on infrared images is shown to outperform that on visible images, but the infrared correction pipeline is crucial since the models cannot extract information from raw infrared images. Two thermal correction pipelines are studied: the shutter-based and the shutterless pipelines. Experiments show that some correction algorithms, like spatial denoising, are detrimental to performance even if they increase visual quality for a human observer. Other algorithms, like destriping and, to a lesser extent, temporal denoising, increase computational time but have some role to play in increasing detection accuracy. As it stands, the optimal trade-off between speed and accuracy for autonomous driving applications in varied environments is simply to use the shutterless pipeline with only a tonemapping algorithm. + +
+
+ comment: 9 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Rethinking Data Input for Point Cloud Upsampling + + +
+ In recent years, point cloud upsampling has been widely applied in fields such as 3D reconstruction and surface generation. However, existing point cloud upsampling methods all take patch-based inputs, and no prior work has discussed the differences and underlying principles between feeding the full point cloud model and feeding patches. To enable a comparison with patch-based input, this article proposes a new data input method that divides the full point cloud model so as to preserve shape integrity while training PU-GCN. The approach is validated on the PU1K and ABC datasets, and the results show that patch-based input still outperforms the full-model (Average Segment) input. The article therefore further explores the data input factors and model modules that affect point cloud upsampling results. + +
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ☆ VCD-Texture: Variance Alignment based 3D-2D Co-Denoising for Text-Guided + Texturing ECCV 2024 + + +
+ Recent research on texture synthesis for 3D shapes benefits greatly from the rapid development of 2D text-to-image diffusion models, including inpainting-based and optimization-based approaches. However, these methods ignore the modal gap between 2D diffusion models and 3D objects: they primarily render 3D objects into 2D images and texture each image separately. In this paper, we revisit texture synthesis and propose a Variance alignment based 3D-2D Collaborative Denoising framework, dubbed VCD-Texture, to address these issues. Formally, we first unify both 2D and 3D latent feature learning in diffusion self-attention modules with re-projected 3D attention receptive fields. Subsequently, the denoised multi-view 2D latent features are aggregated into 3D space and then rasterized back to formulate more consistent 2D predictions. However, the rasterization process suffers from an intractable variance bias, which is theoretically addressed by the proposed variance alignment, achieving high-fidelity texture synthesis. Moreover, we present an inpainting refinement to further improve the details in conflicting regions. Notably, there is no publicly available benchmark to evaluate texture synthesis, which hinders its development. Thus we construct a new evaluation set built upon three open-source 3D datasets and propose to use four metrics to thoroughly validate the texturing performance. Comprehensive experiments demonstrate that VCD-Texture achieves superior performance against other counterparts. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Robust Multimodal Learning via Representation Decoupling ECCV2024 + + +
+ Multimodal learning robust to missing modalities has attracted increasing attention due to its practicality. Existing methods tend to address it by learning a common subspace representation for different modality combinations. However, we reveal that they are sub-optimal due to their implicit constraint on intra-class representation. Specifically, samples with different modality combinations within the same class are forced to learn representations in the same direction. This hinders the model from capturing modality-specific information, resulting in insufficient learning. To this end, we propose a novel Decoupled Multimodal Representation Network (DMRNet) to assist robust multimodal learning. Specifically, DMRNet models the input from different modality combinations as a probabilistic distribution instead of a fixed point in the latent space, and samples embeddings from the distribution for the prediction module to calculate the task loss. As a result, the direction constraint from the loss minimization is blocked by the sampled representation. This relaxes the constraint on the inference representation and enables the model to capture the specific information of different modality combinations. Furthermore, we introduce a hard combination regularizer that prevents unbalanced training of DMRNet by guiding it to pay more attention to hard modality combinations. Finally, extensive experiments on multimodal classification and segmentation tasks demonstrate that the proposed DMRNet significantly outperforms the state of the art. + +
+
+ comment: ECCV2024 17 pages +
+
+
+
+
+ + ☆ Multi-modal Masked Siamese Network Improves Chest X-Ray Representation + Learning + + +
+ Self-supervised learning methods for medical images primarily rely on the +imaging modality during pretraining. While such approaches deliver promising +results, they do not leverage associated patient or scan information collected +within Electronic Health Records (EHR). Here, we propose to incorporate EHR +data during self-supervised pretraining with a Masked Siamese Network (MSN) to +enhance the quality of chest X-ray representations. We investigate three types +of EHR data, including demographic, scan metadata, and inpatient stay +information. We evaluate our approach on three publicly available chest X-ray +datasets, MIMIC-CXR, CheXpert, and NIH-14, using two vision transformer (ViT) +backbones, specifically ViT-Tiny and ViT-Small. In assessing the quality of the +representations via linear evaluation, our proposed method demonstrates +significant improvement compared to vanilla MSN and state-of-the-art +self-supervised learning baselines. Our work highlights the potential of +EHR-enhanced self-supervised pre-training for medical imaging. The code is +publicly available at: https://github.com/nyuad-cai/CXR-EHR-MSN + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Hard-Attention Gates with Gradient Routing for Endoscopic Image + Computing + + +
+ To address overfitting and enhance model generalization in +gastroenterological polyp size assessment, our study introduces +Feature-Selection Gates (FSG) or Hard-Attention Gates (HAG) alongside Gradient +Routing (GR) for dynamic feature selection. This technique aims to boost +Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) by +promoting sparse connectivity, thereby reducing overfitting and enhancing +generalization. HAG achieves this through sparsification with learnable +weights, serving as a regularization strategy. GR further refines this process +by optimizing HAG parameters via dual forward passes, independently from the +main model, to improve feature re-weighting. Our evaluation spanned multiple +datasets, including CIFAR-100 for a broad impact assessment and specialized +endoscopic datasets (REAL-Colon, Misawa, and SUN) focusing on polyp size +estimation, covering over 200 polyps in more than 370,000 frames. The findings +indicate that our HAG-enhanced networks substantially enhance performance in +both binary and triclass classification tasks related to polyp sizing. +Specifically, CNNs experienced an F1 Score improvement to 87.8% in binary +classification, while in triclass classification, the ViT-T model reached an F1 +Score of 76.5%, outperforming traditional CNNs and ViT-T models. To facilitate +further research, we are releasing our codebase, which includes implementations +for CNNs, multistream CNNs, ViT, and HAG-augmented variants. This resource aims +to standardize the use of endoscopic datasets, providing public +training-validation-testing splits for reliable and comparable research in +gastroenterological polyp size estimation. The codebase is available at +github.com/cosmoimd/feature-selection-gates. + +
+
+ comment: Attention Gates, Hard-Attention Gates, Gradient Routing, Feature + Selection Gates, Endoscopy, Medical Image Processing, Computer Vision +
+
+
+
+
+ + ☆ Graph-Guided Test-Time Adaptation for Glaucoma Diagnosis using Fundus + Photography MICCAI + + +
+ Glaucoma is a leading cause of irreversible blindness worldwide. While deep +learning approaches using fundus images have largely improved early diagnosis +of glaucoma, variations in images from different devices and locations (known +as domain shifts) challenge the use of pre-trained models in real-world +settings. To address this, we propose a novel Graph-guided Test-Time Adaptation +(GTTA) framework to generalize glaucoma diagnosis models to unseen test +environments. GTTA integrates the topological information of fundus images into +the model training, enhancing the model's transferability and reducing the risk +of learning spurious correlation. During inference, GTTA introduces a novel +test-time training objective to make the source-trained classifier +progressively adapt to target patterns with reliable class conditional +estimation and consistency regularization. Experiments on cross-domain glaucoma +diagnosis benchmarks demonstrate the superiority of the overall framework and +individual components under different backbone networks. + +
+
+ comment: 11 pages, 3 figures, 3 tables, submitted to MICCAI +
+
+
+
+
+ + ☆ Unsupervised Learning of Category-Level 3D Pose from Object-Centric + Videos + + +
+ Category-level 3D pose estimation is a fundamentally important problem in computer vision and robotics, e.g. for embodied agents or to train 3D generative models. However, existing methods that estimate the category-level object pose require either large amounts of human annotations, CAD models, or input from RGB-D sensors. In contrast, we tackle the problem of learning to estimate the category-level 3D pose only from casually taken object-centric videos without human supervision. We propose a two-step pipeline: First, we introduce a multi-view alignment procedure that determines canonical camera poses across videos with a novel and robust cyclic distance formulation for geometric and appearance matching using reconstructed coarse meshes and DINOv2 features. In a second step, the canonical poses and reconstructed meshes enable us to train a model for 3D pose estimation from a single image. In particular, our model learns to estimate dense correspondences between images and a prototypical 3D template by predicting, for each pixel in a 2D image, a feature vector of the corresponding vertex in the template mesh. We demonstrate that our method outperforms all baselines at the unsupervised alignment of object-centric videos by a large margin and provides faithful and robust predictions in-the-wild. Our code and data are available at https://github.com/GenIntel/uns-obj-pose3d. + +
+
+
+
+
+ + ☆ Self-Supervised Representation Learning for Adversarial Attack Detection ECCV 2024 + + +
+ Supervised learning-based adversarial attack detection methods rely on a +large number of labeled data and suffer significant performance degradation +when applying the trained model to new domains. In this paper, we propose a +self-supervised representation learning framework for the adversarial attack +detection task to address this drawback. Firstly, we map the pixels of +augmented input images into an embedding space. Then, we employ the +prototype-wise contrastive estimation loss to cluster prototypes as latent +variables. Additionally, drawing inspiration from the concept of memory banks, +we introduce a discrimination bank to distinguish and learn representations for +each individual instance that shares the same or a similar prototype, +establishing a connection between instances and their associated prototypes. We +propose a parallel axial-attention (PAA)-based encoder to facilitate the +training process by parallel training over height- and width-axis of attention +maps. Experimental results show that, compared to various benchmark +self-supervised vision learning models and supervised adversarial attack +detection methods, the proposed model achieves state-of-the-art performance on +the adversarial attack detection task across a wide range of images. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Multi-Branch Auxiliary Fusion YOLO with Re-parameterization + Heterogeneous Convolutional for accurate object detection + + +
+ Due to the effective performance of multi-scale feature fusion, Path +Aggregation FPN (PAFPN) is widely employed in YOLO detectors. However, it +cannot efficiently and adaptively integrate high-level semantic information +with low-level spatial information simultaneously. We propose a new model named +MAF-YOLO in this paper, which is a novel object detection framework with a +versatile neck named Multi-Branch Auxiliary FPN (MAFPN). Within MAFPN, the +Superficial Assisted Fusion (SAF) module is designed to combine the output of +the backbone with the neck, preserving an optimal level of shallow information +to facilitate subsequent learning. Meanwhile, the Advanced Assisted Fusion +(AAF) module deeply embedded within the neck conveys a more diverse range of +gradient information to the output layer. + Furthermore, our proposed Re-parameterized Heterogeneous Efficient Layer +Aggregation Network (RepHELAN) module ensures that both the overall model +architecture and convolutional design embrace the utilization of heterogeneous +large convolution kernels. Therefore, this guarantees the preservation of +information related to small targets while simultaneously achieving the +multi-scale receptive field. Finally, taking the nano version of MAF-YOLO for +example, it can achieve 42.4% AP on COCO with only 3.76M learnable parameters +and 10.51G FLOPs, and approximately outperforms YOLOv8n by about 5.1%. The +source code of this work is available at: +https://github.com/yang-0201/MAF-YOLO. + +
+
+
+
+
+ + ☆ ZARRIO @ Ego4D Short Term Object Interaction Anticipation Challenge: + Leveraging Affordances and Attention-based models for STA + + +
+ Short-Term object-interaction Anticipation (STA) consists of detecting the +location of the next-active objects, the noun and verb categories of the +interaction, and the time to contact from the observation of egocentric video. +We propose STAformer, a novel attention-based architecture integrating +frame-guided temporal pooling, dual image-video attention, and multi-scale +feature fusion to support STA predictions from an image-input video pair. +Moreover, we introduce two novel modules to ground STA predictions on human +behavior by modeling affordances. First, we integrate an environment affordance +model which acts as a persistent memory of interactions that can take place in +a given physical scene. Second, we predict interaction hotspots from the +observation of hands and object trajectories, increasing confidence in STA +predictions localized around the hotspot. On the test set, our results obtain a +final 33.5 N mAP, 17.25 N+V mAP, 11.77 N+{\delta} mAP and 6.75 Overall top-5 +mAP metric when trained on the v2 training dataset. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.01194 +
+
+
+
+
+ + ☆ Towards Context-aware Support for Color Vision Deficiency: An Approach + Integrating LLM and AR + + +
+ People with color vision deficiency often face challenges in distinguishing +colors such as red and green, which can complicate daily tasks and require the +use of assistive tools or environmental adjustments. Current support tools +mainly focus on presentation-based aids, like the color vision modes found in +iPhone accessibility settings. However, offering context-aware support, like +indicating the doneness of meat, remains a challenge since task-specific +solutions are not cost-effective for all possible scenarios. To address this, +our paper proposes an application that provides contextual and autonomous +assistance. This application is mainly composed of: (i) an augmented reality +interface that efficiently captures context; and (ii) a multi-modal large +language model-based reasoner that serves to cognitize the context and then +reason about the appropriate support contents. Preliminary user experiments +with two color vision deficient users across five different scenarios have +demonstrated the effectiveness and universality of our application. + +
+
+
+
+
+ + ☆ Shape Prior Segmentation Guided by Harmonic Beltrami Signature + + +
+ This paper presents a novel shape prior segmentation method guided by the +Harmonic Beltrami Signature (HBS). The HBS is a shape representation fully +capturing 2D simply connected shapes, exhibiting resilience against +perturbations and invariance to translation, rotation, and scaling. The +proposed method integrates the HBS within a quasi-conformal topology preserving +segmentation framework, leveraging shape prior knowledge to significantly +enhance segmentation performance, especially for low-quality or occluded +images. The key innovation lies in the bifurcation of the optimization process +into two iterative stages: 1) The computation of a quasi-conformal deformation +map, which transforms the unit disk into the targeted segmentation area, driven +by image data and other regularization terms; 2) The subsequent refinement of +this map is contingent upon minimizing the $L_2$ distance between its Beltrami +coefficient and the reference HBS. This shape-constrained refinement ensures +that the segmentation adheres to the reference shape(s) by exploiting the +inherent invariance, robustness, and discerning shape discriminative +capabilities afforded by the HBS. Extensive experiments on synthetic and +real-world images validate the method's ability to improve segmentation +accuracy over baselines, eliminate preprocessing requirements, resist noise +corruption, and flexibly acquire and apply shape priors. Overall, the HBS +segmentation framework offers an efficient strategy to robustly incorporate the +shape prior knowledge, thereby advancing critical low-level vision tasks. + +
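+
+ For readers unfamiliar with Beltrami coefficients, the quantity whose L2 distance to the reference HBS is minimized in the second stage can be approximated from a discretized map by finite differences, as in this small numpy sketch (the regular-grid sampling and epsilon guard are assumptions, not the authors' code):
+
+    import numpy as np
+
+    def beltrami_coefficient(fx, fy):
+        # f = fx + i*fy sampled on a regular grid; returns mu = f_zbar / f_z
+        fx_y, fx_x = np.gradient(fx)           # derivatives along rows (y) and columns (x)
+        fy_y, fy_x = np.gradient(fy)
+        f_x = fx_x + 1j * fy_x
+        f_y = fx_y + 1j * fy_y
+        f_z = 0.5 * (f_x - 1j * f_y)           # Wirtinger derivative d/dz
+        f_zbar = 0.5 * (f_x + 1j * f_y)        # d/dzbar
+        return f_zbar / (f_z + 1e-12)
+
+    # shape-prior term of the refinement stage, conceptually:
+    # E_shape = np.mean(np.abs(beltrami_coefficient(fx, fy) - mu_hbs) ** 2)
+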
+
+ comment: 34 pages, 15 figures +
+
+
+
+
+ + ☆ Data-Driven Tissue- and Subject-Specific Elastic Regularization for + Medical Image Registration MICCAI 2024 + + +
+ Physics-inspired regularization is desired for intra-patient image +registration since it can effectively capture the biomechanical characteristics +of anatomical structures. However, a major challenge lies in the reliance on +physical parameters: Parameter estimations vary widely across the literature, +and the physical properties themselves are inherently subject-specific. In this +work, we introduce a novel data-driven method that leverages hypernetworks to +learn the tissue-dependent elasticity parameters of an elastic regularizer. +Notably, our approach facilitates the estimation of patient-specific parameters +without the need to retrain the network. We evaluate our method on three +publicly available 2D and 3D lung CT and cardiac MR datasets. We find that with +our proposed subject-specific tissue-dependent regularization, a higher +registration quality is achieved across all datasets compared to using a global +regularizer. The code is available at +https://github.com/compai-lab/2024-miccai-reithmeir. + +
+
+ comment: Accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ Segmenting Medical Images: From UNet to Res-UNet and nnUNet + + +
+ This study provides a comparative analysis of deep learning models including +UNet, Res-UNet, Attention Res-UNet, and nnUNet, and evaluates their performance +in brain tumour, polyp, and multi-class heart segmentation tasks. The analysis +focuses on precision, accuracy, recall, Dice Similarity Coefficient (DSC), and +Intersection over Union (IoU) to assess their clinical applicability. In brain +tumour segmentation, Res-UNet and nnUNet significantly outperformed UNet, with +Res-UNet leading in DSC and IoU scores, indicating superior accuracy in tumour +delineation. Meanwhile, nnUNet excelled in recall and accuracy, which are +crucial for reliable tumour detection in clinical diagnosis and planning. In +polyp detection, nnUNet was the most effective, achieving the highest metrics +across all categories and proving itself as a reliable diagnostic tool in +endoscopy. In the complex task of heart segmentation, Res-UNet and Attention +Res-UNet were outstanding in delineating the left ventricle, with Res-UNet also +leading in right ventricle segmentation. nnUNet was unmatched in myocardium +segmentation, achieving top scores in precision, recall, DSC, and IoU. The +conclusion notes that although Res-UNet occasionally outperforms nnUNet in +specific metrics, the differences are quite small. Moreover, nnUNet +consistently shows superior overall performance across the experiments. +Particularly noted for its high recall and accuracy, which are crucial in +clinical settings to minimize misdiagnosis and ensure timely treatment, +nnUNet's robust performance in crucial metrics across all tested categories +establishes it as the most effective model for these varied and complex +segmentation tasks. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ MobileFlow: A Multimodal LLM For Mobile GUI Agent + + +
+ Currently, mobile Graphical User Interfaces (GUIs) are ubiquitous in most people's daily lives, and the ongoing evolution of multimodal large-scale models, such as GPT-4v and Qwen-VL-Max, has significantly bolstered GUI comprehension and user action analysis, showcasing the potential of intelligent GUI assistants. However, current GUI Agents often need to access page layout information by calling system APIs, which may pose privacy risks. Fixing GUIs (such as mobile interfaces) to a certain low resolution can also result in the loss of fine-grained image details. At the same time, the multimodal large models built for GUI Agents currently have poor understanding and decision-making abilities for Chinese GUI interfaces, making them difficult to apply to a large number of Chinese apps. This paper introduces MobileFlow, a multimodal large language model meticulously crafted for mobile GUI agents. Adapted from the open-source model Qwen-VL-Chat to the GUI domain, MobileFlow contains approximately 21 billion parameters and is equipped with novel hybrid visual encoders, enabling variable-resolution image inputs and good support for multilingual GUIs. By incorporating Mixture of Experts (MoE) expansions and pioneering alignment training strategies, MobileFlow has the capacity to fully interpret image data and comprehend user instructions for GUI interaction tasks. Finally, MobileFlow outperforms Qwen-VL-Max and GPT-4v in terms of task execution by GUI agents on both public and our proposed evaluation metrics, and has been successfully deployed in real-world business contexts, proving its effectiveness for practical applications. + +
+
+
+
+
+ + ☆ CanonicalFusion: Generating Drivable 3D Human Avatars from Multiple + Images ECCV 2024 + + +
+ We present a novel framework for reconstructing animatable human avatars from +multiple images, termed CanonicalFusion. Our central concept involves +integrating individual reconstruction results into the canonical space. To be +specific, we first predict Linear Blend Skinning (LBS) weight maps and depth +maps using a shared-encoder-dual-decoder network, enabling direct +canonicalization of the 3D mesh from the predicted depth maps. Here, instead of +predicting high-dimensional skinning weights, we infer compressed skinning +weights, i.e., 3-dimensional vector, with the aid of pre-trained MLP networks. +We also introduce a forward skinning-based differentiable rendering scheme to +merge the reconstructed results from multiple images. This scheme refines the +initial mesh by reposing the canonical mesh via the forward skinning and by +minimizing photometric and geometric errors between the rendered and the +predicted results. Our optimization scheme considers the position and color of +vertices as well as the joint angles for each image, thereby mitigating the +negative effects of pose errors. We conduct extensive experiments to +demonstrate the effectiveness of our method and compare our CanonicalFusion +with state-of-the-art methods. Our source codes are available at +https://github.com/jsshin98/CanonicalFusion. + +
+
+ comment: ECCV 2024 Accepted (18 pages, 9 figures) +
+
+
+
+
+ + ☆ Learning Geometric Invariant Features for Classification of Vector + Polygons with Graph Message-passing Neural Network + + +
+ Geometric shape classification of vector polygons remains a non-trivial +learning task in spatial analysis. Previous studies mainly focus on devising +deep learning approaches for representation learning of rasterized vector +polygons, whereas the study of discrete representations of polygons and +subsequent deep learning approaches have not been fully investigated. In this +study, we investigate a graph representation of vector polygons and propose a +novel graph message-passing neural network (PolyMP) to learn the +geometric-invariant features for shape classification of polygons. Through +extensive experiments, we show that the graph representation of polygons +combined with a permutation-invariant graph message-passing neural network +achieves highly robust performances on benchmark datasets (i.e., synthetic +glyph and real-world building footprint datasets) as compared to baseline +methods. We demonstrate that the proposed graph-based PolyMP network enables +the learning of expressive geometric features invariant to geometric +transformations of polygons (i.e., translation, rotation, scaling and shearing) +and is robust to trivial vertex removals of polygons. We further show the +strong generalizability of PolyMP, which enables generalizing the learned +geometric features from the synthetic glyph polygons to the real-world building +footprints. + +
+
+
+
+
+ + ☆ TF-SASM: Training-free Spatial-aware Sparse Memory for Multi-object + Tracking + + +
+ Multi-object tracking (MOT) in computer vision remains a significant +challenge, requiring precise localization and continuous tracking of multiple +objects in video sequences. This task is crucial for various applications, +including action recognition and behavior analysis. Key challenges include +occlusion, reidentification, tracking fast-moving objects, and handling camera +motion artifacts. Past research has explored tracking-by-detection methods and +end-to-end models, with recent attention on tracking-by-attention approaches +leveraging transformer architectures. The emergence of data sets that emphasize +robust reidentification, such as DanceTrack, has highlighted the need for +effective solutions. While memory-based approaches have shown promise, they +often suffer from high computational complexity and memory usage. We propose a +novel sparse memory approach that selectively stores critical features based on +object motion and overlapping awareness, aiming to enhance efficiency while +minimizing redundancy. Building upon the MOTRv2 model, a hybrid of +tracking-by-attention and tracking-by-detection, we introduce a training-free +memory designed to bolster reidentification capabilities and preserve the +model's flexibility. Our memory approach achieves significant improvements over +MOTRv2 in the DanceTrack test set, demonstrating a gain of 1.1\% in HOTA +metrics and 2.1\% in IDF1 score. + +
+
+
+
+
+ + ☆ LMSeg: A deep graph message-passing network for efficient and accurate + semantic segmentation of large-scale 3D landscape meshes + + +
+ Semantic segmentation of large-scale 3D landscape meshes is pivotal for +various geospatial applications, including spatial analysis, automatic mapping +and localization of target objects, and urban planning and development. This +requires an efficient and accurate 3D perception system to understand and +analyze real-world environments. However, traditional mesh segmentation methods +face challenges in accurately segmenting small objects and maintaining +computational efficiency due to the complexity and large size of 3D landscape +mesh datasets. This paper presents an end-to-end deep graph message-passing +network, LMSeg, designed to efficiently and accurately perform semantic +segmentation on large-scale 3D landscape meshes. The proposed approach takes +the barycentric dual graph of meshes as inputs and applies deep message-passing +neural networks to hierarchically capture the geometric and spatial features +from the barycentric graph structures and learn intricate semantic information +from textured meshes. The hierarchical and local pooling of the barycentric +graph, along with the effective geometry aggregation modules of LMSeg, enable +fast inference and accurate segmentation of small-sized and irregular mesh +objects in various complex landscapes. Extensive experiments on two benchmark +datasets (natural and urban landscapes) demonstrate that LMSeg significantly +outperforms existing learning-based segmentation methods in terms of object +segmentation accuracy and computational efficiency. Furthermore, our method +exhibits strong generalization capabilities across diverse landscapes and +demonstrates robust resilience against varying mesh densities and landscape +topologies. + +
+
+
+
+
+ + ☆ SSP-GNN: Learning to Track via Bilevel Optimization + + +
+ We propose a graph-based tracking formulation for multi-object tracking (MOT) +where target detections contain kinematic information and re-identification +features (attributes). Our method applies a successive shortest paths (SSP) +algorithm to a tracking graph defined over a batch of frames. The edge costs in +this tracking graph are computed via a message-passing network, a graph neural +network (GNN) variant. The parameters of the GNN, and hence, the tracker, are +learned end-to-end on a training set of example ground-truth tracks and +detections. Specifically, learning takes the form of bilevel optimization +guided by our novel loss function. We evaluate our algorithm on simulated +scenarios to understand its sensitivity to scenario aspects and model +hyperparameters. Across varied scenario complexities, our method compares +favorably to a strong baseline. + +
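+
+ The tracking-graph formulation can be illustrated with a generic min-cost-flow stand-in (successive shortest paths is one standard way such problems are solved). Here pair_cost is a placeholder for the learned message-passing edge cost, and the integer cost scaling, entry cost, and networkx solver are assumptions rather than the paper's pipeline:
+
+    import networkx as nx
+
+    def track_min_cost_flow(det_frames, pair_cost, n_tracks, entry_cost=500):
+        # det_frames: list of per-frame detection lists; pair_cost(d1, d2) -> float (learned cost)
+        G = nx.DiGraph()
+        G.add_node("S", demand=-n_tracks)
+        G.add_node("T", demand=n_tracks)
+        for t, dets in enumerate(det_frames):
+            for i, d in enumerate(dets):
+                u_in, u_out = (t, i, "in"), (t, i, "out")
+                G.add_edge(u_in, u_out, capacity=1, weight=0)    # each detection used at most once
+                G.add_edge("S", u_in, capacity=1, weight=entry_cost)
+                G.add_edge(u_out, "T", capacity=1, weight=entry_cost)
+                if t + 1 < len(det_frames):
+                    for j, d2 in enumerate(det_frames[t + 1]):
+                        w = int(round(1000 * pair_cost(d, d2)))  # integer costs for the solver
+                        G.add_edge(u_out, (t + 1, j, "in"), capacity=1, weight=w)
+        flow = nx.min_cost_flow(G)
+        # keep matched transitions (frame t, det i) -> (frame t+1, det j)
+        return [(u[:2], v[:2]) for u, nbrs in flow.items() if u not in ("S", "T")
+                for v, f in nbrs.items() if f > 0 and v != "T" and u[2] == "out"]
+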
+
+
+
+
+ + ☆ Towards Stable 3D Object Detection + + +
+ In autonomous driving, the temporal stability of 3D object detection greatly impacts driving safety. However, detection stability cannot be assessed with existing metrics such as mAP and MOTA, and consequently is less explored by the community. To bridge this gap, this work proposes Stability Index (SI), a new metric that can comprehensively evaluate the stability of 3D detectors in terms of confidence, box localization, extent, and heading. By benchmarking state-of-the-art object detectors on the Waymo Open Dataset, SI reveals interesting properties of object stability that have not been previously discovered by other metrics. To help models improve their stability, we further introduce a general and effective training strategy, called Prediction Consistency Learning (PCL). PCL essentially encourages the prediction consistency of the same objects under different timestamps and augmentations, leading to enhanced detection stability. Furthermore, we examine the effectiveness of PCL with the widely-used CenterPoint, and achieve a remarkable SI of 86.00 for the vehicle class, surpassing the baseline by 5.48. We hope our work can serve as a reliable baseline and draw the community's attention to this crucial issue in 3D object detection. Codes will be made publicly available. + +
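+
+ A consistency objective in this spirit can be sketched as follows; the matching of predictions across timestamps and augmentations and the exact loss terms are assumptions for illustration, not the paper's definition of PCL:
+
+    import torch.nn.functional as F
+
+    def prediction_consistency_loss(boxes_a, boxes_b, scores_a, scores_b):
+        # boxes_*: (N, 7) matched 3D boxes (x, y, z, l, w, h, yaw); scores_*: (N,) confidences
+        geom = F.smooth_l1_loss(boxes_a, boxes_b)   # agree on localization, extent, heading
+        conf = F.l1_loss(scores_a, scores_b)        # agree on confidence
+        return geom + conf
+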
+
+
+
+
+ + ☆ MARS: Paying more attention to visual attributes for text-based person + search + + +
+ Text-based person search (TBPS) is a problem that gained significant interest +within the research community. The task is that of retrieving one or more +images of a specific individual based on a textual description. The multi-modal +nature of the task requires learning representations that bridge text and image +data within a shared latent space. Existing TBPS systems face two major +challenges. One is defined as inter-identity noise that is due to the inherent +vagueness and imprecision of text descriptions and it indicates how +descriptions of visual attributes can be generally associated to different +people; the other is the intra-identity variations, which are all those +nuisances e.g. pose, illumination, that can alter the visual appearance of the +same textual attributes for a given subject. To address these issues, this +paper presents a novel TBPS architecture named MARS +(Mae-Attribute-Relation-Sensitive), which enhances current state-of-the-art +models by introducing two key components: a Visual Reconstruction Loss and an +Attribute Loss. The former employs a Masked AutoEncoder trained to reconstruct +randomly masked image patches with the aid of the textual description. In doing +so the model is encouraged to learn more expressive representations and +textual-visual relations in the latent space. The Attribute Loss, instead, +balances the contribution of different types of attributes, defined as +adjective-noun chunks of text. This loss ensures that every attribute is taken +into consideration in the person retrieval process. Extensive experiments on +three commonly used datasets, namely CUHK-PEDES, ICFG-PEDES, and RSTPReid, +report performance improvements, with significant gains in the mean Average +Precision (mAP) metric w.r.t. the current state of the art. + +
+
+
+
+
+ + ☆ Research, Applications and Prospects of Event-Based Pedestrian + Detection: A Survey + + +
+ Event-based cameras, inspired by the biological retina, have evolved into cutting-edge sensors distinguished by their minimal power requirements, negligible latency, superior temporal resolution, and expansive dynamic range. At present, cameras used for pedestrian detection are mainly frame-based imaging sensors, which suffer from slow response times and heavy data redundancy. In contrast, event-based cameras address these limitations by eschewing extraneous data transmissions and obviating motion blur in high-speed imaging scenarios. This paper offers an exhaustive review of research on and applications of pedestrian detection via event-based cameras, particularly in the autonomous driving context. Through methodically scrutinizing relevant literature, the paper outlines the foundational principles, developmental trajectory, and the comparative merits and demerits of event-based detection relative to traditional frame-based methodologies. This review conducts thorough analyses of various event stream inputs and their corresponding network models to evaluate their applicability across diverse operational environments. It also delves into pivotal elements such as crucial datasets and data acquisition techniques essential for advancing this technology, as well as advanced algorithms for processing event stream data. Culminating with a synthesis of the extant landscape, the review accentuates the unique advantages and persistent challenges inherent in event-based pedestrian detection, offering a prognostic view on potential future developments in this fast-progressing field. + +
+
+
+
+
+ + ☆ Fine-grained Dynamic Network for Generic Event Boundary Detection ECCV 2024 + + +
+ Generic event boundary detection (GEBD) aims at pinpointing event boundaries +naturally perceived by humans, playing a crucial role in understanding +long-form videos. Given the diverse nature of generic boundaries, spanning +different video appearances, objects, and actions, this task remains +challenging. Existing methods usually detect various boundaries by the same +protocol, regardless of their distinctive characteristics and detection +difficulties, resulting in suboptimal performance. Intuitively, a more +intelligent and reasonable way is to adaptively detect boundaries by +considering their special properties. In light of this, we propose a novel +dynamic pipeline for generic event boundaries named DyBDet. By introducing a +multi-exit network architecture, DyBDet automatically learns the subnet +allocation to different video snippets, enabling fine-grained detection for +various boundaries. Besides, a multi-order difference detector is also proposed +to ensure generic boundaries can be effectively identified and adaptively +processed. Extensive experiments on the challenging Kinetics-GEBD and TAPOS +datasets demonstrate that adopting the dynamic strategy significantly benefits +GEBD tasks, leading to obvious improvements in both performance and efficiency +compared to the current state-of-the-art. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Variational Partial Group Convolutions for Input-Aware Partial + Equivariance of Rotations and Color-Shifts ICML2024 + + +
+ Group Equivariant CNNs (G-CNNs) have shown promising efficacy in various +tasks, owing to their ability to capture hierarchical features in an +equivariant manner. However, their equivariance is fixed to the symmetry of the +whole group, limiting adaptability to diverse partial symmetries in real-world +datasets, such as limited rotation symmetry of handwritten digit images and +limited color-shift symmetry of flower images. Recent efforts address this +limitation, one example being Partial G-CNN which restricts the output group +space of convolution layers to break full equivariance. However, such an +approach still fails to adjust equivariance levels across data. In this paper, +we propose a novel approach, Variational Partial G-CNN (VP G-CNN), to capture +varying levels of partial equivariance specific to each data instance. VP G-CNN +redesigns the distribution of the output group elements to be conditioned on +input data, leveraging variational inference to avoid overfitting. This enables +the model to adjust its equivariance levels according to the needs of +individual data points. Additionally, we address training instability inherent +in discrete group equivariance models by redesigning the reparametrizable +distribution. We demonstrate the effectiveness of VP G-CNN on both toy and +real-world datasets, including MNIST67-180, CIFAR10, ColorMNIST, and +Flowers102. Our results show robust performance, even in uncertainty metrics. + +
+
+ comment: ICML2024 +
+
+
+
+
+ + ☆ Parametric Curve Segment Extraction by Support Regions + + +
+ We introduce a method to extract curve segments in parametric form from the +image directly using the Laplacian of Gaussian (LoG) filter response. Our +segmentation gives convex and concave curves. To do so, we form curve support +regions by grouping pixels of the thresholded filter response. Then, we model +each support region boundary by Fourier series and extract the corresponding +parametric curve segment. + +
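+
+ The pipeline maps onto a few standard operations; the sketch below approximates it with scipy and scikit-image (the filter scale, threshold, and number of retained Fourier coefficients are illustrative assumptions, not the authors' implementation):
+
+    import numpy as np
+    from scipy import ndimage
+    from skimage import measure
+
+    def extract_curve_segments(image, sigma=2.0, thresh=0.02, n_coeff=12):
+        log = ndimage.gaussian_laplace(image.astype(float), sigma=sigma)
+        support = log > thresh                          # one polarity of the LoG response
+        labels, n = ndimage.label(support)              # curve support regions
+        curves = []
+        for k in range(1, n + 1):
+            contours = measure.find_contours((labels == k).astype(float), 0.5)
+            if not contours:
+                continue
+            boundary = max(contours, key=len)           # ordered boundary points (row, col)
+            z = boundary[:, 1] + 1j * boundary[:, 0]    # complex parametrization z(t)
+            coeff = np.fft.fft(z)
+            coeff[n_coeff:-n_coeff] = 0                 # truncated Fourier-series boundary model
+            curves.append(np.fft.ifft(coeff))           # smooth parametric curve samples
+        return curves
+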
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient Detection of Long Consistent Cycles and its Application to + Distributed Synchronization + + +
+ Group synchronization plays a crucial role in global pipelines for Structure +from Motion (SfM). Its formulation is nonconvex and it is faced with highly +corrupted measurements. Cycle consistency has been effective in addressing +these challenges. However, computationally efficient solutions are needed for +cycles longer than three, especially in practical scenarios where 3-cycles are +unavailable. To overcome this computational bottleneck, we propose an algorithm +for group synchronization that leverages information from cycles of lengths +ranging from three to six with a time complexity of order $O(n^3)$ (or +$O(n^{2.373})$ when using a faster matrix multiplication algorithm). We +establish non-trivial theory for this and related methods that achieves +competitive sample complexity, assuming the uniform corruption model. To +advocate the practical need for our method, we consider distributed group +synchronization, which requires at least 4-cycles, and we illustrate +state-of-the-art performance by our method in this context. + +
+
+
+
+
+ + ☆ Unsupervised Video Summarization via Reinforcement Learning and a + Trained Evaluator + + +
+ This paper presents a novel approach for unsupervised video summarization +using reinforcement learning. It aims to address the existing limitations of +current unsupervised methods, including unstable training of adversarial +generator-discriminator architectures and reliance on hand-crafted reward +functions for quality evaluation. The proposed method is based on the concept +that a concise and informative summary should result in a reconstructed video +that closely resembles the original. The summarizer model assigns an importance +score to each frame and generates a video summary. In the proposed scheme, +reinforcement learning, coupled with a unique reward generation pipeline, is +employed to train the summarizer model. The reward generation pipeline trains +the summarizer to create summaries that lead to improved reconstructions. It +comprises a generator model capable of reconstructing masked frames from a +partially masked video, along with a reward mechanism that compares the +reconstructed video from the summary against the original. The video generator +is trained in a self-supervised manner to reconstruct randomly masked frames, +enhancing its ability to generate accurate summaries. This training pipeline +results in a summarizer model that better mimics human-generated video +summaries compared to methods relying on hand-crafted rewards. The training +process consists of two stable and isolated training steps, unlike adversarial +architectures. Experimental results demonstrate promising performance, with +F-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively. +Additionally, the inference stage is 300 times faster than our previously +reported state-of-the-art method. + +
+
+
+
+
+ + ☆ Second Place Solution of WSDM2023 Toloka Visual Question Answering + Challenge WSDM2023 + + +
+ In this paper, we present our solution for the WSDM2023 Toloka Visual +Question Answering Challenge. Inspired by the application of multimodal +pre-trained models to various downstream tasks(e.g., visual question answering, +visual grounding, and cross-modal retrieval), we approached this competition as +a visual grounding task, where the input is an image and a question, guiding +the model to answer the question and display the answer as a bounding box on +the image. We designed a three-stage solution for this task. Specifically, we +used the visual-language pre-trained model OFA as the foundation. In the first +stage, we constructed a large-scale synthetic dataset similar to the +competition dataset and coarse-tuned the model to learn generalized semantic +information. In the second stage, we treated the competition task as a visual +grounding task, loaded the weights from the previous stage, and continued to +fine-tune the model on the competition dataset, transferring the semantic +information learned in the first stage to the competition task. Finally, we +designed a bounding box matching and replacing post-processing strategy to +correct the model's prediction results. Our team achieved a score of 76.342 on +the final leaderboard, ranking second. + +
+
+ comment: Second Place of WSDM2023 Toloka Visual Question Answering Challenge +
+
+
+
+
+ + ☆ FeatureSORT: Essential Features for Effective Tracking + + +
+ In this work, we introduce a novel tracker designed for online multiple object tracking with a focus on being simple yet effective. We provide multiple feature modules, each of which captures a particular type of appearance information. By integrating distinct appearance features, including clothing color, style, and target direction, alongside a ReID network for robust embedding extraction, our tracker significantly enhances online tracking accuracy. Additionally, we incorporate a stronger detector and provide advanced post-processing methods that further elevate the tracker's performance. During real-time operation, we define a measurement-to-track association distance function that combines IoU, direction, color, style, and ReID feature similarity, where each metric is calculated separately. With this feature-based distance function, it is possible to track objects through longer periods of occlusion while keeping the number of identity switches comparatively low. Extensive experimental evaluation demonstrates notable improvements in tracking accuracy and reliability, as evidenced by reduced identity switches and enhanced occlusion handling. These advancements not only contribute to the state of the art in object tracking but also open new avenues for future research and practical applications demanding high precision and reliability. + +
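+
+ A combined measurement-to-track distance of this kind could look like the following sketch (the weights, the omission of a separate style term, and the specific similarity definitions are illustrative assumptions, not the released tracker):
+
+    import numpy as np
+
+    def box_iou(a, b):
+        # axis-aligned boxes given as (x1, y1, x2, y2)
+        x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+        x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+        inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+        union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
+        return inter / (union + 1e-12)
+
+    def association_cost(track, det, w=(0.4, 0.2, 0.2, 0.2)):
+        d_iou = 1.0 - box_iou(track["box"], det["box"])
+        d_dir = 0.5 * (1.0 - np.dot(track["direction"], det["direction"]))  # unit direction vectors
+        d_col = 0.5 * np.abs(track["hist"] - det["hist"]).sum()             # L1 on normalized color histograms
+        d_id = 1.0 - np.dot(track["reid"], det["reid"]) / (
+            np.linalg.norm(track["reid"]) * np.linalg.norm(det["reid"]) + 1e-12)
+        return w[0] * d_iou + w[1] * d_dir + w[2] * d_col + w[3] * d_id
+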
+
+
+
+
+ + ☆ ArAIEval Shared Task: Propagandistic Techniques Detection in Unimodal + and Multimodal Arabic Content + + +
+ We present an overview of the second edition of the ArAIEval shared task, +organized as part of the ArabicNLP 2024 conference co-located with ACL 2024. In +this edition, ArAIEval offers two tasks: (i) detection of propagandistic +textual spans with persuasion techniques identification in tweets and news +articles, and (ii) distinguishing between propagandistic and non-propagandistic +memes. A total of 14 teams participated in the final evaluation phase, with 6 +and 9 teams participating in Tasks 1 and 2, respectively. Finally, 11 teams +submitted system description papers. Across both tasks, we observed that +fine-tuning transformer models such as AraBERT was at the core of the majority +of the participating systems. We provide a description of the task setup, +including a description of the dataset construction and the evaluation setup. +We further provide a brief overview of the participating systems. All datasets +and evaluation scripts are released to the research community +(https://araieval.gitlab.io/). We hope this will enable further research on +these important tasks in Arabic. + +
+
+ comment: propaganda, span detection, disinformation, misinformation, fake + news, LLMs, GPT-4, multimodality, multimodal LLMs +
+
+
+
+
+ + ☆ Every Pixel Has its Moments: Ultra-High-Resolution Unpaired + Image-to-Image Translation via Dense Normalization ECCV 2024 + + +
+ Recent advancements in ultra-high-resolution unpaired image-to-image +translation have aimed to mitigate the constraints imposed by limited GPU +memory through patch-wise inference. Nonetheless, existing methods often +compromise between the reduction of noticeable tiling artifacts and the +preservation of color and hue contrast, attributed to the reliance on global +image- or patch-level statistics in the instance normalization layers. In this +study, we introduce a Dense Normalization (DN) layer designed to estimate +pixel-level statistical moments. This approach effectively diminishes tiling +artifacts while concurrently preserving local color and hue contrasts. To +address the computational demands of pixel-level estimation, we further propose +an efficient interpolation algorithm. Moreover, we invent a parallelism +strategy that enables the DN layer to operate in a single pass. Through +extensive experiments, we demonstrate that our method surpasses all existing +approaches in performance. Notably, our DN layer is hyperparameter-free and can +be seamlessly integrated into most unpaired image-to-image translation +frameworks without necessitating retraining. Overall, our work paves the way +for future exploration in handling images of arbitrary resolutions within the +realm of unpaired image-to-image translation. Code is available at: +https://github.com/Kaminyou/Dense-Normalization. + +
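+
+ The per-pixel moment idea can be approximated by computing instance statistics on a coarse grid of patches and interpolating them back to full resolution, as in this simplified PyTorch sketch (the grid size and interpolation mode are assumptions; the linked repository contains the actual layer):
+
+    import torch
+    import torch.nn.functional as F
+
+    def dense_normalize(x, grid=4, eps=1e-5):
+        # x: (B, C, H, W); per-pixel moments from interpolated patch statistics
+        B, C, H, W = x.shape
+        mean_g = F.adaptive_avg_pool2d(x, grid)              # per-patch first moment
+        sq_g = F.adaptive_avg_pool2d(x * x, grid)            # per-patch second moment
+        mean = F.interpolate(mean_g, size=(H, W), mode="bilinear", align_corners=False)
+        var = F.interpolate(sq_g, size=(H, W), mode="bilinear", align_corners=False) - mean ** 2
+        return (x - mean) / torch.sqrt(var.clamp_min(0) + eps)
+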
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Exploration of Class Center for Fine-Grained Visual Classification + + +
+ Different from large-scale classification tasks, fine-grained visual classification is a challenging task due to two critical problems: 1) evident intra-class variances and subtle inter-class differences, and 2) overfitting owing to fewer training samples in datasets. Most existing methods extract key features to reduce intra-class variances, but pay no attention to subtle inter-class differences in fine-grained visual classification. To address this issue, we propose a loss function named exploration of class center, which consists of a multiple class-center constraint and a class-center label generation. This loss function fully utilizes the information of the class center from the perspective of features and labels. From the feature perspective, the multiple class-center constraint pulls samples closer to the target class center, and pushes samples away from the most similar non-target class center. Thus, the constraint reduces intra-class variances and enlarges inter-class differences. From the label perspective, the class-center label generation utilizes class-center distributions to generate soft labels to alleviate overfitting. Our method can be easily integrated with existing fine-grained visual classification approaches as a loss function, to further boost performance at only a slight additional training cost. Extensive experiments are conducted to demonstrate consistent improvements achieved by our method on four widely-used fine-grained visual classification datasets. In particular, our method achieves state-of-the-art performance on the FGVC-Aircraft and CUB-200-2011 datasets. + +
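+
+ One way to picture the two components is the sketch below, which keeps exponential-moving-average class centers, applies a pull/push margin between the target center and the most similar non-target center, and derives soft labels from center distances (the margin, temperature, and EMA update are assumptions, not the paper's exact loss):
+
+    import torch
+    import torch.nn.functional as F
+
+    class ClassCenterLoss(torch.nn.Module):
+        def __init__(self, n_classes, dim, momentum=0.9, margin=0.5, temp=10.0):
+            super().__init__()
+            self.register_buffer("centers", torch.zeros(n_classes, dim))
+            self.momentum, self.margin, self.temp = momentum, margin, temp
+
+        def forward(self, feats, labels):
+            d = torch.cdist(F.normalize(feats, dim=1), F.normalize(self.centers, dim=1))
+            d_pos = d.gather(1, labels[:, None]).squeeze(1)                     # distance to target center
+            d_neg = d.scatter(1, labels[:, None], float("inf")).min(1).values   # most similar non-target center
+            center_loss = F.relu(d_pos - d_neg + self.margin).mean()            # pull target, push nearest non-target
+            soft_labels = F.softmax(-self.temp * d, dim=1)                      # class-center soft labels
+            with torch.no_grad():                                               # EMA update of the centers
+                for k in labels.unique():
+                    self.centers[k] = self.momentum * self.centers[k] + \
+                        (1 - self.momentum) * feats[labels == k].mean(0)
+            return center_loss, soft_labels
+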
+
+ comment: Accepted by TCSVT. Code and trained models are available at https://github.com/hyao1/ECC +
+
+
+
+
+ + ☆ Fine-grained Context and Multi-modal Alignment for Freehand 3D + Ultrasound Reconstruction MICCAI 2024 + + +
+ Fine-grained spatio-temporal learning is crucial for freehand 3D ultrasound reconstruction. Previous works mainly relied on coarse-grained spatial features and learned temporal dependencies separately, and therefore struggle with fine-grained spatio-temporal learning. Mining spatio-temporal information at fine-grained scales is extremely challenging due to the difficulty of learning long-range dependencies. In this context, we propose a novel method to exploit the long-range dependency management capabilities of the state space model (SSM) to address the above challenge. Our contribution is three-fold. First, we propose ReMamba, which mines multi-scale spatio-temporal information by devising a multi-directional SSM. Second, we propose an adaptive fusion strategy that introduces multiple inertial measurement units as auxiliary temporal information to enhance spatio-temporal perception. Last, we design an online alignment strategy that encodes the temporal information as pseudo labels for multi-modal alignment to further improve reconstruction performance. Extensive experimental validations on two large-scale datasets show remarkable improvement of our method over competitors. + +
+
+ comment: Accepted at MICCAI 2024. This is the submitted manuscript and the + preprint has not undergone peer review (when applicable) or any + post-submission improvements or corrections +
+
+
+
+
+ + ☆ AnySR: Realizing Image Super-Resolution as Any-Scale, Any-Resource + + +
+ In an effort to improve the efficiency and scalability of single-image super-resolution (SISR) applications, we introduce AnySR to rebuild existing arbitrary-scale SR methods into an any-scale, any-resource implementation. In contrast to off-the-shelf methods that solve SR tasks across various scales with the same computing costs, our AnySR innovates in: 1) building arbitrary-scale tasks as an any-resource implementation, reducing resource requirements for smaller scales without additional parameters; 2) enhancing any-scale performance in a feature-interweaving fashion, inserting scale pairs into features at regular intervals and ensuring correct feature/scale processing. The efficacy of our AnySR is fully demonstrated by rebuilding most existing arbitrary-scale SISR methods and validating on five popular SISR test datasets. The results show that our AnySR implements SISR tasks in a computing-more-efficient fashion, and performs on par with existing arbitrary-scale SISR methods. For the first time in the literature, we realize SISR tasks not only as any-scale, but also as any-resource. Code is available at https://github.com/CrispyFeSo4/AnySR. + +
+
+
+
+
+ + ☆ GSD: View-Guided Gaussian Splatting Diffusion for 3D Reconstruction ECCV 2024 + + +
+ We present GSD, a diffusion model approach based on Gaussian Splatting (GS) +representation for 3D object reconstruction from a single view. Prior works +suffer from inconsistent 3D geometry or mediocre rendering quality due to +improper representations. We take a step towards resolving these shortcomings +by utilizing the recent state-of-the-art 3D explicit representation, Gaussian +Splatting, and an unconditional diffusion model. This model learns to generate +3D objects represented by sets of GS ellipsoids. With these strong generative +3D priors, though learning unconditionally, the diffusion model is ready for +view-guided reconstruction without further model fine-tuning. This is achieved +by propagating fine-grained 2D features through the efficient yet flexible +splatting function and the guided denoising sampling process. In addition, a 2D +diffusion model is further employed to enhance rendering fidelity, and improve +reconstructed GS quality by polishing and re-using the rendered images. The +final reconstructed objects explicitly come with high-quality 3D structure and +texture, and can be efficiently rendered in arbitrary views. Experiments on the +challenging real-world CO3D dataset demonstrate the superiority of our +approach. + +
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ☆ Efficient GANs for Document Image Binarization Based on DWT and + Normalization + + +
+ For the document image binarization task, generative adversarial networks (GANs) +can generate images where shadows and noise are effectively removed, which +allows for text information extraction. The current state-of-the-art (SOTA) +method proposes a three-stage network architecture that utilizes six GANs. +Despite its excellent model performance, the SOTA network architecture requires +long training and inference times. To overcome this problem, this work +introduces an efficient GAN method based on the three-stage network +architecture that incorporates the Discrete Wavelet Transformation and +normalization to reduce the input image size, which, in turn, decreases both +training and inference times. In addition, this work presents novel generators, +discriminators, and loss functions to improve the model's performance. +Experimental results show that the proposed method reduces the training time by +10% and the inference time by 26% when compared to the SOTA method while +maintaining the model performance at an Avg-Score of 73.79. Our implementation +code is available on GitHub at +https://github.com/RuiyangJu/Efficient_Document_Image_Binarization. + +
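The size-reduction step via the Discrete Wavelet Transformation can be pictured with PyWavelets. The snippet below is a minimal sketch, assuming a single-level Haar transform that keeps only the low-frequency approximation band; the paper's full pipeline, normalization scheme, and network details are not reproduced here.

```python
import numpy as np
import pywt

def dwt_reduce(image: np.ndarray, wavelet: str = "haar") -> np.ndarray:
    """Halve the spatial size of a grayscale document image with a single-level 2D DWT,
    keeping only the low-frequency approximation band (illustrative only)."""
    approx, _details = pywt.dwt2(image.astype(np.float32), wavelet)
    # Rescale the approximation band to [0, 1] before feeding it to the GAN
    approx -= approx.min()
    return approx / (approx.max() + 1e-8)
```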
+
+
+
+
+ + ☆ A Physical Model-Guided Framework for Underwater Image Enhancement and + Depth Estimation + + +
+ Due to the selective absorption and scattering of light by diverse aquatic +media, underwater images usually suffer from various visual degradations. +Existing underwater image enhancement (UIE) approaches that combine underwater +physical imaging models with neural networks often fail to accurately estimate +imaging model parameters such as depth and veiling light, resulting in poor +performance in certain scenarios. To address this issue, we propose a physical +model-guided framework for jointly training a Deep Degradation Model (DDM) with +any advanced UIE model. DDM includes three well-designed sub-networks to +accurately estimate various imaging parameters: a veiling light estimation +sub-network, a factors estimation sub-network, and a depth estimation +sub-network. Based on the estimated parameters and the underwater physical +imaging model, we impose physical constraints on the enhancement process by +modeling the relationship between underwater images and desired clean images, +i.e., outputs of the UIE model. Moreover, while our framework is compatible +with any UIE model, we design a simple yet effective fully convolutional UIE +model, termed UIEConv. UIEConv utilizes both global and local features for +image enhancement through a dual-branch structure. UIEConv trained within our +framework achieves remarkable enhancement results across diverse underwater +scenes. Furthermore, as a byproduct of UIE, the trained depth estimation +sub-network enables accurate underwater scene depth estimation. Extensive +experiments conducted in various real underwater imaging scenarios, including +deep-sea environments with artificial light sources, validate the effectiveness +of our framework and the UIEConv model. + +
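The kind of physical constraint described here can be illustrated with the standard underwater image formation model I = J * t + B * (1 - t), where the transmission t depends on depth and a per-channel attenuation factor. The sketch below is a hedged reading of such a consistency term; the tensor shapes, parameter names, and loss choice are assumptions rather than the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def physics_consistency_loss(raw, clean_pred, depth, beta, veiling_light):
    """Re-synthesize the degraded image from the predicted clean image and the
    estimated imaging parameters, then compare it with the observed raw image.
    raw, clean_pred: (B, 3, H, W); depth: (B, 1, H, W);
    beta: (B, 3, 1, 1) per-channel attenuation; veiling_light: (B, 3, 1, 1)."""
    transmission = torch.exp(-beta * depth)                    # broadcasts to (B, 3, H, W)
    resynth = clean_pred * transmission + veiling_light * (1.0 - transmission)
    return F.l1_loss(resynth, raw)
```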
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Batch Transformer: Look for Attention in Batch + + +
+ Facial expression recognition (FER) has received considerable attention in +computer vision, particularly in "in-the-wild" settings such as human-computer +interaction. However, FER images contain uncertainties such as occlusion, low +resolution, pose variation, illumination variation, and subjectivity, which +includes some expressions that do not match the target label. Consequently, +little trustworthy information can be obtained from a single noisy image. +This could significantly degrade the performance of the FER task. To address +this issue, we propose a batch transformer (BT), which consists of the proposed +class batch attention (CBA) module, to prevent overfitting in noisy data and +extract trustworthy information by training on features reflected from several +images in a batch, rather than information from a single image. We also propose +multi-level attention (MLA) to prevent overfitting to specific features by +capturing correlations between each level. In this paper, we present a batch +transformer network (BTN) that combines the above proposals. Experimental +results on various FER benchmark datasets show that the proposed BTN +consistently outperforms the state-of-the-art on FER datasets. Representative +results demonstrate the promise of the proposed BTN for FER. + +
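One plausible way to read the class batch attention idea, aggregating information over same-class samples within a batch, is sketched below in PyTorch; the actual CBA module in the paper is more elaborate, so treat this purely as an illustration of the mechanism.

```python
import torch
import torch.nn.functional as F

def class_batch_attention(feats: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """For each sample, attend over the features of batch-mates that share its label,
    so a noisy single-image feature is smoothed by other images of the same class
    (a plausible sketch of the CBA idea, not the paper's module).
    feats: (B, D), labels: (B,)."""
    scores = feats @ feats.T / feats.shape[1] ** 0.5          # (B, B) similarity
    same_class = labels.unsqueeze(0) == labels.unsqueeze(1)
    scores = scores.masked_fill(~same_class, float("-inf"))   # only same-class pairs attend
    attn = F.softmax(scores, dim=1)
    return attn @ feats                                       # refined per-sample features
```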
+
+
+
+
+ + ☆ T2IShield: Defending Against Backdoors on Text-to-Image Diffusion Models ECCV2024 + + +
+ While text-to-image diffusion models demonstrate impressive generation +capabilities, they also exhibit vulnerability to backdoor attacks, which +involve the manipulation of model outputs through malicious triggers. In this +paper, for the first time, we propose a comprehensive defense method named +T2IShield to detect, localize, and mitigate such attacks. Specifically, we find +the "Assimilation Phenomenon" on the cross-attention maps caused by the +backdoor trigger. Based on this key insight, we propose two effective backdoor +detection methods: Frobenius Norm Threshold Truncation and Covariance +Discriminant Analysis. Besides, we introduce a binary-search approach to +localize the trigger within a backdoor sample and assess the efficacy of +existing concept editing methods in mitigating backdoor attacks. Empirical +evaluations on two advanced backdoor attack scenarios show the effectiveness of +our proposed defense method. For backdoor sample detection, T2IShield achieves +a detection F1 score of 88.9$\%$ with low computational cost. Furthermore, +T2IShield achieves a localization F1 score of 86.4$\%$ and invalidates 99$\%$ +poisoned samples. Codes are released at https://github.com/Robin-WZQ/T2IShield. + +
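The Frobenius-norm-based detector builds on the observation that, for backdoored prompts, the per-token cross-attention maps collapse onto a nearly identical pattern (the "Assimilation Phenomenon"). A hedged sketch of such a check is shown below; the exact statistic, truncation step, and threshold used in T2IShield may differ.

```python
import torch

def frobenius_norm_flag(attn_maps: torch.Tensor, threshold: float) -> bool:
    """Flag a prompt as a potential backdoor sample when its cross-attention maps are
    abnormally uniform across tokens. attn_maps: (num_tokens, H, W). The aggregation
    below is a simplified guess, not the paper's exact statistic."""
    mean_map = attn_maps.mean(dim=0, keepdim=True)
    residual = attn_maps - mean_map                     # deviation of each token map from the mean
    score = torch.linalg.matrix_norm(residual).mean().item()
    # A very small residual Frobenius norm means all token maps collapsed onto one pattern
    return score < threshold
```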
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ AMD: Automatic Multi-step Distillation of Large-scale Vision Models + + +
+ Transformer-based architectures have become the de-facto standard models for +diverse vision tasks owing to their superior performance. As the size of the +models continues to scale up, model distillation becomes extremely important in +various real applications, particularly on devices limited by computational +resources. However, prevailing knowledge distillation methods exhibit +diminished efficacy when confronted with a large capacity gap between the +teacher and the student, e.g., a 10x compression rate. In this paper, we present a +novel approach named Automatic Multi-step Distillation (AMD) for large-scale +vision model compression. In particular, our distillation process unfolds +across multiple steps. Initially, the teacher undergoes distillation to form an +intermediate teacher-assistant model, which is subsequently distilled further +to the student. An efficient and effective optimization framework is introduced +to automatically identify the optimal teacher-assistant that leads to the +maximal student performance. We conduct extensive experiments on multiple image +classification datasets, including CIFAR-10, CIFAR-100, and ImageNet. The +findings consistently reveal that our approach outperforms several established +baselines, paving a path for future knowledge distillation methods on +large-scale vision models. + +
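The multi-step part reduces to chaining standard knowledge-distillation runs through an intermediate teacher-assistant. The sketch below shows a two-step version with a temperature-scaled KL loss; the automatic search for the optimal teacher-assistant, which is the paper's core contribution, is not reproduced, and all names are placeholders.

```python
import torch
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, T: float = 4.0):
    """Standard temperature-scaled KL knowledge-distillation loss."""
    return F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)

def distill_step(teacher, model, loader, optimizer, device="cuda"):
    """One epoch of distilling `model` from a frozen `teacher`; called first with the
    teacher-assistant as `model`, then again with the student (a two-step sketch of
    the multi-step idea, not the authors' automatic TA search)."""
    teacher.eval()
    model.train()
    for images, _ in loader:
        images = images.to(device)
        with torch.no_grad():
            t_logits = teacher(images)
        loss = kd_loss(model(images), t_logits)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```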
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ Elevating All Zero-Shot Sketch-Based Image Retrieval Through Multimodal + Prompt Learning ECCV 2024 + + +
+ We address the challenges inherent in sketch-based image retrieval (SBIR) +across various settings, including zero-shot SBIR, generalized zero-shot SBIR, +and fine-grained zero-shot SBIR, by leveraging the vision-language foundation +model, CLIP. While recent endeavors have employed CLIP to enhance SBIR, these +approaches predominantly follow uni-modal prompt processing and fail to +fully exploit CLIP's integrated visual and textual capabilities. To bridge this +gap, we introduce SpLIP, a novel multi-modal prompt learning scheme designed to +operate effectively with frozen CLIP backbones. We diverge from existing +multi-modal prompting methods that either treat visual and textual prompts +independently or integrate them in a limited fashion, leading to suboptimal +generalization. SpLIP implements a bi-directional prompt-sharing strategy that +enables mutual knowledge exchange between CLIP's visual and textual encoders, +fostering a more cohesive and synergistic prompt processing mechanism that +significantly reduces the semantic gap between the sketch and photo embeddings. +In addition to pioneering multi-modal prompt learning, we propose two +innovative strategies for further refining the embedding space. The first is an +adaptive margin generation for the sketch-photo triplet loss, regulated by +CLIP's class textual embeddings. The second introduces a novel task, termed +conditional cross-modal jigsaw, aimed at enhancing fine-grained sketch-photo +alignment, by focusing on implicitly modelling the viable patch arrangement of +sketches using knowledge of unshuffled photos. Our comprehensive experimental +evaluations across multiple benchmarks demonstrate the superior performance of +SpLIP in all three SBIR scenarios. Code is available at +https://github.com/mainaksingha01/SpLIP. + +
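Of the two embedding-space refinements, the adaptive margin for the sketch-photo triplet loss is the easier one to sketch. The snippet below is one plausible reading in which the margin scales with the dissimilarity of the two classes' CLIP text embeddings; the official SpLIP definition may differ.

```python
import torch
import torch.nn.functional as F

def adaptive_margin_triplet(sketch, photo_pos, photo_neg, cls_pos_txt, cls_neg_txt,
                            base_margin: float = 0.2):
    """Triplet loss whose margin grows with the distance between the textual embeddings
    of the positive and negative classes (an assumed reading of 'adaptive margin
    generation', not the paper's exact rule). All inputs: L2-normalized (B, D) embeddings."""
    margin = base_margin * (1.0 - F.cosine_similarity(cls_pos_txt, cls_neg_txt))
    d_pos = (sketch - photo_pos).pow(2).sum(dim=-1)
    d_neg = (sketch - photo_neg).pow(2).sum(dim=-1)
    return F.relu(d_pos - d_neg + margin).mean()
```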
+
+ comment: Accepted in ECCV 2024 +
+
+
+
+
+ + ☆ HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for + Ultrasound Image Segmentation + + +
+ Accurate ultrasound segmentation is pursued because it aids clinicians in +achieving a comprehensive diagnosis. Due to the presence of low image quality +and high costs associated with annotation, two primary concerns arise: (1) +enhancing the understanding of multi-scale features, and (2) improving the +resistance to data dependency. To mitigate these concerns, we propose HCS-TNAS, +a novel neural architecture search (NAS) method that automatically designs the +network. For the first concern, we employ multi-level searching encompassing +cellular, layer, and module levels. Specifically, we design an Efficient +NAS-ViT module that searches for multi-scale tokens in the vision Transformer +(ViT) to capture context and local information, rather than relying solely on +simple combinations of operations. For the second concern, we propose a hybrid +constraint-driven semi-supervised learning method that considers additional +network independence and incorporates contrastive loss in a NAS formulation. By +further developing a stage-wise optimization strategy, a rational network +structure can be identified. Extensive experiments on three publicly available +ultrasound image datasets demonstrate that HCS-TNAS effectively improves +segmentation accuracy and outperforms state-of-the-art methods. + +
+
+
+
+
+ + ☆ Computer Vision for Clinical Gait Analysis: A Gait Abnormality Video + Dataset + + +
+ Clinical gait analysis (CGA) using computer vision is an emerging field in +artificial intelligence that faces barriers in terms of accessible, real-world data and +clear task objectives. This paper lays the foundation for current developments +in CGA as well as vision-based methods and datasets suitable for gait analysis. +We introduce The Gait Abnormality in Video Dataset (GAVD) in response to our +review of over 150 current gait-related computer vision datasets, which +highlighted the need for a large and accessible gait dataset clinically +annotated for CGA. GAVD stands out as the largest video gait dataset, +comprising 1874 sequences of normal, abnormal and pathological gaits. +Additionally, GAVD includes clinically annotated RGB data sourced from publicly +available content on online platforms. It also encompasses over 400 subjects +who have undergone clinical grade visual screening to represent a diverse range +of abnormal gait patterns, captured in various settings, including hospital +clinics and urban uncontrolled outdoor environments. We demonstrate the +validity of the dataset and the utility of action recognition models for CGA using +the pretrained models Temporal Segment Networks (TSN) and SlowFast, achieving +video abnormality detection of 94% and 92%, respectively, when tested on +the GAVD dataset. A GitHub repository (https://github.com/Rahmyyy/GAVD), consisting of +convenient URL links and clinically relevant annotations for CGA, is provided +for over 450 online videos, featuring diverse subjects performing a range of +normal, pathological, and abnormal gait patterns. + +
+
+
+
+
+ + ♻ ☆ Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis ECCV 2024 + + +
+ Accurate reconstruction of complex dynamic scenes from just a single +viewpoint continues to be a challenging task in computer vision. Current +dynamic novel view synthesis methods typically require videos from many +different camera viewpoints, necessitating careful recording setups, and +significantly restricting their utility in the wild as well as in terms of +embodied AI applications. In this paper, we propose $\textbf{GCD}$, a +controllable monocular dynamic view synthesis pipeline that leverages +large-scale diffusion priors to, given a video of any scene, generate a +synchronous video from any other chosen perspective, conditioned on a set of +relative camera pose parameters. Our model does not require depth as input, and +does not explicitly model 3D scene geometry, instead performing end-to-end +video-to-video translation in order to achieve its goal efficiently. Despite +being trained on synthetic multi-view video data only, zero-shot real-world +generalization experiments show promising results in multiple domains, +including robotics, object permanence, and driving environments. We believe our +framework can potentially unlock powerful applications in rich dynamic scene +understanding, perception for robotics, and interactive 3D video viewing +experiences for virtual reality. + +
+
+ comment: Accepted to ECCV 2024. Project webpage is available at: + https://gcd.cs.columbia.edu/ +
+
+
+
+
+ + ♻ ☆ Research on target detection method of distracted driving behavior based + on improved YOLOv8 + + +
+ With the development of deep learning technology, the detection and +classification of distracted driving behaviour requires higher accuracy. +Existing deep learning-based methods are computationally intensive and +parameter redundant, limiting the efficiency and accuracy in practical +applications. To solve this problem, this study proposes an improved YOLOv8 +detection method based on the original YOLOv8 model by integrating the BoTNet +module, GAM attention mechanism and EIoU loss function. By optimising the +feature extraction and multi-scale feature fusion strategies, the training and +inference processes are simplified, and the detection accuracy and efficiency +are significantly improved. Experimental results show that the improved model +performs well in both detection speed and accuracy, with an accuracy rate of +99.4%, and the model is smaller and easy to deploy, which is able to identify +and classify distracted driving behaviours in real time, provide timely +warnings, and enhance driving safety. + +
+
+ comment: Major revision on content, no replacement available soon +
+
+
+
+
+ + ♻ ☆ From Pixel to Cancer: Cellular Automata in Computed Tomography MICCAI 2024 + + +
+ AI for cancer detection encounters the bottleneck of data scarcity, +annotation difficulty, and low prevalence of early tumors. Tumor synthesis +seeks to create artificial tumors in medical images, which can greatly +diversify the data and annotations for AI training. However, current tumor +synthesis approaches are not applicable across different organs due to their +need for specific expertise and design. This paper establishes a set of generic +rules to simulate tumor development. Each cell (pixel) is initially assigned a +state between zero and ten to represent the tumor population, and a tumor can +be developed based on three rules to describe the process of growth, invasion, +and death. We apply these three generic rules to simulate tumor +development--from pixel to cancer--using cellular automata. We then integrate +the tumor state into the original computed tomography (CT) images to generate +synthetic tumors across different organs. This tumor synthesis approach allows +for sampling tumors at multiple stages and analyzing tumor-organ interaction. +Clinically, a reader study involving three expert radiologists reveals that the +synthetic tumors and their developing trajectories are convincingly realistic. +Technically, we analyze and simulate tumor development at various stages using +9,262 raw, unlabeled CT images sourced from 68 hospitals worldwide. The +performance in segmenting tumors in the liver, pancreas, and kidneys exceeds +prevailing literature benchmarks, underlining the immense potential of tumor +synthesis, especially for earlier cancer detection. + The code and models are available at +https://github.com/MrGiovanni/Pixel2Cancer + +
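The three generic rules, growth, invasion, and death, lend themselves to a compact cellular-automaton sketch. The toy 2D version below uses made-up probabilities and the population cap of ten mentioned in the abstract, purely to illustrate the mechanism; the released Pixel2Cancer rules operate on 3D CT volumes with organ-specific constraints.

```python
import numpy as np

def grow_tumor(grid: np.ndarray, steps: int, p_invade: float = 0.3,
               p_die: float = 0.02, seed: int = 0) -> np.ndarray:
    """Toy 2D cellular automaton with growth/invasion/death rules; each cell holds a
    tumor population in [0, 10]. Illustrative only, not the Pixel2Cancer rule set."""
    rng = np.random.default_rng(seed)
    grid = grid.copy()
    neighbors = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    for _ in range(steps):
        # Growth: occupied cells increase their population up to the cap of 10
        grid[grid > 0] = np.minimum(grid[grid > 0] + 1, 10)
        # Invasion: saturated cells seed a random empty 4-neighbor with some probability
        for y, x in zip(*np.where(grid >= 10)):
            if rng.random() < p_invade:
                dy, dx = neighbors[rng.integers(4)]
                ny, nx = y + dy, x + dx
                if 0 <= ny < grid.shape[0] and 0 <= nx < grid.shape[1] and grid[ny, nx] == 0:
                    grid[ny, nx] = 1
        # Death: occupied cells occasionally lose part of their population
        dying = (grid > 0) & (rng.random(grid.shape) < p_die)
        grid[dying] -= 1
    return grid
```

For example, starting from a 64x64 integer grid with a single seed cell set to 1 and running a few hundred steps yields an irregular tumor mask that could then be blended into a CT slice.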
+
+ comment: Early accepted to MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion + + +
+ Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent +technique in computer vision and graphics for reconstructing 3D scenes. GS +represents a scene as a set of 3D Gaussians with varying opacities and employs +a computationally efficient splatting operation along with analytical +derivatives to compute the 3D Gaussian parameters given scene images captured +from various viewpoints. Unfortunately, capturing surround view ($360^{\circ}$ +viewpoint) images is impossible or impractical in many real-world imaging +scenarios, including underwater imaging, rooms inside a building, and +autonomous navigation. In these restricted baseline imaging scenarios, the GS +algorithm suffers from a well-known 'missing cone' problem, which results in +poor reconstruction along the depth axis. In this manuscript, we demonstrate +that using transient data (from sonars) allows us to address the missing cone +problem by sampling high-frequency data along the depth axis. We extend the +Gaussian splatting algorithms for two commonly used sonars and propose fusion +algorithms that simultaneously utilize RGB camera data and sonar data. Through +simulations, emulations, and hardware experiments across various imaging +scenarios, we show that the proposed fusion algorithms lead to significantly +better novel view synthesis (5 dB improvement in PSNR) and 3D geometry +reconstruction (60% lower Chamfer distance). + +
+
+
+
+
+ + ♻ ☆ Improving Semantic Correspondence with Viewpoint-Guided Spherical Maps + + +
+ Recent progress in self-supervised representation learning has resulted in +models that are capable of extracting image features that are effective at +encoding not only image-level but also pixel-level semantics. These +features have been shown to be effective for dense visual semantic +correspondence estimation, even outperforming fully-supervised methods. +Nevertheless, current self-supervised approaches still fail in the presence of +challenging image characteristics such as symmetries and repeated parts. To +address these limitations, we propose a new approach for semantic +correspondence estimation that supplements discriminative self-supervised +features with 3D understanding via a weak geometric spherical prior. Compared +to more involved 3D pipelines, our model only requires weak viewpoint +information, and the simplicity of our spherical representation enables us to +inject informative geometric priors into the model during training. We propose +a new evaluation metric that better accounts for repeated-part and +symmetry-induced mistakes. We present results on the challenging SPair-71k +dataset, where we show that our approach is capable of +distinguishing between symmetric views and repeated parts across many object +categories, and we also demonstrate that it generalizes to unseen classes on +the AwA dataset. + +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for Spatio-Spectral Compression of + Hyperspectral Images + + +
+ The development of deep learning-based models for the compression of +hyperspectral images (HSIs) has recently attracted great attention in remote +sensing due to the sharp growth of hyperspectral data archives. Most of the +existing models achieve either spectral or spatial compression, and do not +jointly consider the spatio-spectral redundancies present in HSIs. To address +this problem, in this paper we focus our attention on the High Fidelity +Compression (HiFiC) model (which is proven to be highly effective for spatial +compression problems) and adapt it to perform spatio-spectral compression of +HSIs. In detail, we introduce two new models: i) HiFiC using Squeeze and +Excitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC with 3D +convolutions (denoted as HiFiC$_{3D}$) in the framework of compression of HSIs. +We analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$ in compressing +the spatio-spectral redundancies with channel attention and inter-dependency +analysis. Experimental results show the efficacy of the proposed models in +performing spatio-spectral compression, while reconstructing images at reduced +bitrates with higher reconstruction quality. The code of the proposed models is +publicly available at https://git.tu-berlin.de/rsim/HSI-SSC . + +
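The HiFiC$_{SE}$ variant builds on the standard Squeeze-and-Excitation block, which is sketched below in PyTorch for reference; where and how the authors place it inside HiFiC, and the reduction ratio they use, are not shown here.

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Standard Squeeze-and-Excitation block: global-average-pool the feature map,
    pass it through a small bottleneck MLP, and rescale the channels.
    Shown as the generic building block, not the authors' exact configuration."""
    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        weights = self.fc(x.mean(dim=(2, 3))).view(b, c, 1, 1)  # squeeze, then excite
        return x * weights                                      # channel-wise rescaling
```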
+
+
+
+
+ + ♻ ☆ Learning from Mistakes: Iterative Prompt Relabeling for Text-to-Image + Diffusion Model Training + + +
+ Diffusion models have shown impressive performance in many domains, including +image generation, time series prediction, and reinforcement learning. The +algorithm demonstrates superior performance over the traditional GAN and +transformer-based methods. However, the model's capability to follow natural +language instructions (e.g., spatial relationships between objects, generating +complex scenes) is still unsatisfactory. It has been an important research area +to enhance such capability. Prior works have shown that using Reinforcement +Learning can effectively train diffusion models to enhance fidelity on specific +objectives. However, existing RL methods require collecting a large amount of +data to train an effective reward model. They also don't receive feedback when +the generated image is incorrect. In this work, we propose Iterative Prompt +Relabeling (IPR), a novel algorithm that aligns images to text through +iterative image sampling and prompt relabeling. IPR first samples a batch of +images conditioned on the text then relabels the text prompts of unmatched +text-image pairs with classifier feedback. We conduct thorough experiments on +SDv2 and SDXL, testing their capability to follow instructions on spatial +relations. With IPR, we improved up to 15.22% (absolute improvement) on the +challenging spatial relation VISOR benchmark, demonstrating superior +performance compared to previous RL methods. + +
+
+
+
+
+ + ♻ ☆ A Lightweight Video Anomaly Detection Model with Weak Supervision and + Adaptive Instance Selection + + +
+ Video anomaly detection aims to determine whether there are any abnormal +events, behaviors or objects in a given video, which enables effective and +intelligent public safety management. As video anomaly labeling is both +time-consuming and expensive, most existing works employ unsupervised or weakly +supervised learning methods. This paper focuses on weakly supervised video +anomaly detection, in which the training videos are labeled according to whether or not they +contain any anomalies, but there is no information about which frames the +anomalies are located in. However, the uncertainty of weakly labeled data and the +large model size prevent existing methods from wide deployment in real +scenarios, especially in resource-limited situations such as edge computing. In +this paper, we develop a lightweight video anomaly detection model. On the one +hand, we propose an adaptive instance selection strategy, which is based on the +model's current status to select confident instances, thereby mitigating the +uncertainty of weakly labeled data and subsequently promoting the model's +performance. On the other hand, we design a lightweight multi-level temporal +correlation attention module and an hourglass-shaped fully connected layer to +construct the model, which can reduce the model parameters to only 0.56\% of +the existing methods (e.g., RTFM). Our extensive experiments on two public +datasets, UCF-Crime and ShanghaiTech, show that our model can achieve a comparable +or even superior AUC score compared to the state-of-the-art methods, with a +significantly reduced number of model parameters. + +
+
+
+
+
+ + ♻ ☆ Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics + Instability Detection + + +
+ Recent advancements in non-invasive detection of cardiac hemodynamic +instability (CHDI) primarily focus on applying machine learning techniques to a +single data modality, e.g. cardiac magnetic resonance imaging (MRI). Despite +their potential, these approaches often fall short especially when the size of +labeled patient data is limited, a common challenge in the medical domain. +Furthermore, only a few studies have explored multimodal methods to study CHDI, +which mostly rely on costly modalities such as cardiac MRI and echocardiogram. +In response to these limitations, we propose a novel multimodal variational +autoencoder ($\text{CardioVAE}_\text{X,G}$) to integrate low-cost chest X-ray +(CXR) and electrocardiogram (ECG) modalities with pre-training on a large +unlabeled dataset. Specifically, $\text{CardioVAE}_\text{X,G}$ introduces a +novel tri-stream pre-training strategy to learn both shared and +modality-specific features, thus enabling fine-tuning with both unimodal and +multimodal datasets. We pre-train $\text{CardioVAE}_\text{X,G}$ on a large, +unlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then +fine-tune the pre-trained model on a labeled dataset of $795$ subjects from the +ASPIRE registry. Comprehensive evaluations against existing methods show that +$\text{CardioVAE}_\text{X,G}$ offers promising performance (AUROC $=0.79$ and +Accuracy $=0.77$), representing a significant step forward in non-invasive +prediction of CHDI. Our model also excels in producing fine interpretations of +predictions directly associated with clinical features, thereby supporting +clinical decision-making. + +
+
+
+
+
+ + ♻ ☆ Planetary Causal Inference: Implications for the Geography of Poverty + + +
+ Earth observation data such as satellite imagery can, when combined with +machine learning, have far-reaching impacts on our understanding of the +geography of poverty through the prediction of living conditions, especially +where government-derived economic indicators are either unavailable or +potentially untrustworthy. Recent work has progressed in using Earth +Observation (EO) data not only to predict spatial economic outcomes but also to +explore cause and effect, an understanding which is critical for downstream +policy analysis. In this review, we first document the growth of interest in +using satellite images together with EO data in causal analysis. We then trace +the relationship between spatial statistics and machine learning methods before +discussing four ways in which EO data has been used in causal machine learning +pipelines -- (1.) poverty outcome imputation for downstream causal analysis, +(2.) EO image deconfounding, (3.) EO-based treatment effect heterogeneity, and +(4.) EO-based transportability analysis. We conclude by providing a +step-by-step workflow for how researchers can incorporate EO data in causal ML +analysis going forward, outlining major choices of data, models, and evaluation +metrics. + +
+
+ comment: For a full list of the papers found in the quantitative literature + search, see https://github.com/AIandGlobalDevelopmentLab/eo-poverty-review +
+
+
+
+
+ + ♻ ☆ Low-Resource Crop Classification from Multi-Spectral Time Series Using + Lossless Compressors + + +
+ Deep learning has significantly improved the accuracy of crop classification +using multispectral temporal data. However, these models have complex +structures with numerous parameters, requiring large amounts of data and costly +training. In low-resource situations with fewer labeled samples, deep learning +models perform poorly due to insufficient data. Conversely, compressors are +data-type agnostic, and non-parametric methods do not introduce underlying +assumptions. Inspired by this insight, we propose a non-training alternative to +deep learning models, aiming to address these situations. Specifically, the +Symbolic Representation Module is proposed to convert the reflectivity into +symbolic representations. The symbolic representations are then +cross-transformed in both the channel and time dimensions to generate symbolic +embeddings. Next, the Multi-scale Normalised Compression Distance (MNCD) is +designed to measure the correlation between any two symbolic embeddings. +Finally, based on the MNCDs, high quality crop classification can be achieved +using only a k-nearest-neighbor (kNN) classifier. The entire framework is +ready-to-use and lightweight. Without any training, it outperformed, on +average, 7 advanced deep learning models trained at scale on three benchmark +datasets. It also outperforms more than half of these models in the few-shot +setting with sparse crop labels. Therefore, the high performance and robustness +of our non-training framework make it truly applicable to real-world crop +mapping. Codes are available at: +https://github.com/qinfengsama/Compressor-Based-Crop-Mapping. + +
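The core building block, a compressor-based distance combined with a kNN classifier, fits in a few lines. The example below uses plain gzip and a single-scale Normalized Compression Distance; the paper's symbolic representation and multi-scale MNCD aggregation are omitted, so this is only a sketch of the underlying idea.

```python
import gzip

def ncd(x: bytes, y: bytes) -> float:
    """Normalized Compression Distance with gzip: the building block behind the paper's
    multi-scale variant (symbolization and multi-scale aggregation are omitted)."""
    cx, cy = len(gzip.compress(x)), len(gzip.compress(y))
    cxy = len(gzip.compress(x + y))
    return (cxy - min(cx, cy)) / max(cx, cy)

def knn_predict(query: bytes, train: list, k: int = 3) -> int:
    """Classify a symbolized time series by majority vote over its k nearest training
    series under NCD. `train` is a list of (bytes, label) pairs."""
    nearest = sorted(train, key=lambda item: ncd(query, item[0]))[:k]
    labels = [label for _, label in nearest]
    return max(set(labels), key=labels.count)
```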
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Advanced Smart City Monitoring: Real-Time Identification of Indian + Citizen Attributes + + +
+ This project focuses on creating a smart surveillance system for Indian +cities that can identify and analyze people's attributes in real time. Using +advanced technologies like artificial intelligence and machine learning, the +system can recognize attributes such as upper-body color, clothing, +accessories, headgear, etc., and analyze behavior +through cameras installed around the city. + +
+
+ comment: 6 pages, 8 figures; the title was changed and some alignment issues were + resolved, but the other content remains the same +
+
+
+
+
+ + ♻ ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) generates new images by combining the style of +one image with the content of another. However, unauthorized NST can exploit +artwork, raising concerns about artists' rights and motivating the development +of proactive protection methods. We propose Locally Adaptive Adversarial Color +Attack (LAACA), empowering artists to protect their artwork from unauthorized +style transfer by processing before public release. By delving into the +intricacies of human visual perception and the role of different frequency +components, our method strategically introduces frequency-adaptive +perturbations in the image. These perturbations significantly degrade the +generation quality of NST while maintaining an acceptable level of visual +change in the original image, ensuring that potential infringers are +discouraged from using the protected artworks because of the resulting poor NST +generation quality. Additionally, existing metrics often overlook the +importance of color fidelity in evaluating color-mattered tasks, such as the +quality of NST-generated images, which is crucial in the context of artistic +works. To comprehensively assess the color-mattered tasks, we propose the +Adversarial Color Distance Metric (ACDM), designed to quantify the color +difference of images pre- and post-manipulation. Experimental results confirm +that attacking NST using LAACA results in visually inferior style transfer, and +the ACDM can efficiently measure color-mattered tasks. By providing artists +with a tool to safeguard their intellectual property, our work helps to relieve the +socio-technical challenges posed by the misuse of NST in the art community. + +
+
+ comment: 9 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Graph Theory and GNNs to Unravel the Topographical Organization of Brain + Lesions in Variants of Alzheimer's Disease Progression + + +
+ In this study, we proposed and evaluated a graph-based framework to assess +variations in Alzheimer's disease (AD) neuropathologies, focusing on classic +(cAD) and rapid (rpAD) progression forms. Histopathological images are +converted into tau-pathology-based (i.e., amyloid plaques and tau tangles) +graphs, and derived metrics are used in a machine-learning classifier. This +classifier incorporates SHAP value explainability to differentiate between cAD +and rpAD. Furthermore, we tested graph neural networks (GNNs) to extract +topological embeddings from the graphs and use them in classifying the +progression forms of AD. The analysis demonstrated denser networks in rpAD and +a distinctive impact on brain cortical layers: rpAD predominantly affects +middle layers, whereas cAD influences both superficial and deep layers of the +same cortical regions. These results suggest a unique neuropathological network +organization for each AD variant. + +
+
+
+
+
+ + ♻ ☆ AnyMaker: Zero-shot General Object Customization via Decoupled + Dual-Level ID Injection + + +
+ Text-to-image based object customization, aiming to generate images with the +same identity (ID) as objects of interest in accordance with text prompts and +reference images, has made significant progress. However, recent customizing +research is dominated by specialized tasks, such as human customization or +virtual try-on, leaving a gap in general object customization. To this end, we +introduce AnyMaker, an innovative zero-shot object customization framework +capable of generating general objects with high ID fidelity and flexible text +editability. The efficacy of AnyMaker stems from its novel general ID +extraction, dual-level ID injection, and ID-aware decoupling. Specifically, the +general ID extraction module extracts sufficient ID information with an +ensemble of self-supervised models to tackle the diverse customization tasks +for general objects. Then, to provide the diffusion UNet with as much of the extracted ID +information as possible while not damaging the text editability in the generation process, we +design a global-local dual-level ID injection module, in which the global-level +semantic ID is injected into text descriptions while the local-level ID details +are injected directly into the model through newly added cross-attention +modules. In addition, we propose an ID-aware decoupling module to disentangle +ID-related information from non-ID elements in the extracted representations +for high-fidelity generation of both identity and text descriptions. To +validate our approach and boost the research of general object customization, +we create the first large-scale general ID dataset, Multi-Category +ID-Consistent (MC-IDC) dataset, with 315k text-image samples and 10k +categories. Experiments show that AnyMaker presents remarkable performance in +general object customization and outperforms specialized methods in +corresponding tasks. Code and dataset will be released soon. + +
+
+
+
+
+ + ♻ ☆ MaTe3D: Mask-guided Text-based 3D-aware Portrait Editing + + +
+ 3D-aware portrait editing has a wide range of applications in multiple +fields. However, current approaches are limited in that they can only perform +mask-guided or text-based editing. Even by fusing the two procedures into a +model, the editing quality and stability cannot be ensured. To address this +limitation, we propose \textbf{MaTe3D}: mask-guided text-based 3D-aware +portrait editing. In this framework, first, we introduce a new SDF-based 3D +generator which learns local and global representations with proposed SDF and +density consistency losses. This enhances mask-based editing in local areas; +second, we present a novel distillation strategy: Conditional Distillation on +Geometry and Texture (CDGT). Compared to existing distillation strategies, it +mitigates visual ambiguity and avoids mismatch between texture and geometry, +thereby producing stable texture and convincing geometry while editing. +Additionally, we create the CatMask-HQ dataset, a large-scale high-resolution +cat face annotation for exploration of model generalization and expansion. We +perform extensive experiments on both the FFHQ and CatMask-HQ datasets to +demonstrate the editing quality and stability of the proposed method. Our +method faithfully generates a 3D-aware edited face image based on a modified +mask and a text prompt. Our code and models will be publicly released. + +
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ EasyAnimate: A High-Performance Long Video Generation Method based on + Transformer Architecture + + +
+ This paper presents EasyAnimate, an advanced method for video generation that +leverages the power of transformer architecture for high-performance outcomes. +We have expanded the DiT framework originally designed for 2D image synthesis +to accommodate the complexities of 3D video generation by incorporating a +motion module block. It is used to capture temporal dynamics, thereby ensuring +the production of consistent frames and seamless motion transitions. The motion +module can be adapted to various DiT baseline methods to generate video with +different styles. It can also generate videos with different frame rates and +resolutions during both training and inference phases, suitable for both images +and videos. Moreover, we introduce slice VAE, a novel approach to condense the +temporal axis, facilitating the generation of long duration videos. Currently, +EasyAnimate exhibits the proficiency to generate videos with 144 frames. We +provide a holistic ecosystem for video production based on DiT, encompassing +aspects such as data pre-processing, VAE training, DiT models training (both +the baseline model and LoRA model), and end-to-end video inference. Code is +available at: https://github.com/aigc-apps/EasyAnimate. We are continuously +working to enhance the performance of our method. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Conceptual Codebook Learning for Vision-Language Models ECCV 2024 + + +
+ In this paper, we propose Conceptual Codebook Learning (CoCoLe), a novel +fine-tuning method for vision-language models (VLMs) to address the challenge +of improving the generalization capability of VLMs while fine-tuning them on +downstream tasks in a few-shot setting. We recognize that visual concepts, such +as textures, shapes, and colors are naturally transferable across domains and +play a crucial role in generalization tasks. Motivated by this interesting +finding, we learn a conceptual codebook consisting of visual concepts as keys +and conceptual prompts as values, which serves as a link between the image +encoder's outputs and the text encoder's inputs. Specifically, for a given +image, we leverage the codebook to identify the most relevant conceptual +prompts associated with the class embeddings to perform the classification. +Additionally, we incorporate a handcrafted concept cache as a regularization to +alleviate the overfitting issues in low-shot scenarios. We observe that this +conceptual codebook learning method is able to achieve enhanced alignment +between visual and linguistic modalities. Extensive experimental results +demonstrate that our CoCoLe method remarkably outperforms the existing +state-of-the-art methods across various evaluation settings, including +base-to-new generalization, cross-dataset evaluation, and domain generalization +tasks. Detailed ablation studies further confirm the efficacy of each component +in CoCoLe. + +
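The classification-time lookup in a conceptual codebook can be pictured as a nearest-key retrieval followed by prompt selection. The snippet below is a simplified sketch under that reading; the real CoCoLe keys, values, and scoring are learned jointly with the VLM, so the shapes and the similarity measure here are assumptions.

```python
import torch
import torch.nn.functional as F

def retrieve_conceptual_prompts(image_feat: torch.Tensor,
                                codebook_keys: torch.Tensor,
                                codebook_values: torch.Tensor,
                                top_k: int = 3) -> torch.Tensor:
    """Select the conceptual prompts whose visual-concept keys are most similar to the
    image feature. image_feat: (D,); codebook_keys: (K, D);
    codebook_values: (K, P, D) prompt embeddings. Illustrative sketch only."""
    sims = F.cosine_similarity(image_feat.unsqueeze(0), codebook_keys, dim=-1)  # (K,)
    idx = sims.topk(top_k).indices
    return codebook_values[idx]   # (top_k, P, D) prompts passed on to the text encoder
```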
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast + Cancer Segmentation and Identification + + +
+ Breast cancer poses a profound threat to lives globally, claiming numerous +lives each year. Therefore, timely detection is crucial for early intervention +and improved chances of survival. Accurately diagnosing and classifying breast +tumors using ultrasound images is a persistent challenge in medicine, demanding +cutting-edge solutions for improved treatment strategies. This research +introduces multiattention-enhanced deep learning (DL) frameworks designed for +the classification and segmentation of breast cancer tumors from ultrasound +images. A spatial channel attention mechanism is proposed for segmenting tumors +from ultrasound images, utilizing a novel LinkNet DL framework with an +InceptionResNet backbone. Following this, the paper proposes a deep +convolutional neural network with an integrated multi-attention framework +(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal. +From experimental results, it is observed that the segmentation model has +recorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also +achieved high Intersection over Union (IoU) and Dice Coefficient scores of +96.9% and 97.2%, respectively. Similarly, the classification model has attained +an accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification +framework has achieved outstanding F1-Score, precision, and recall values of +99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early +detection and accurate classification of breast cancer, this proposed work +significantly advances the field of medical image analysis, potentially +improving diagnostic precision and patient outcomes. + +
+
+ comment: 29 pages, 15 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ MUSES: The Multi-Sensor Semantic Perception Dataset for Driving under + Uncertainty + + +
+ Achieving level-5 driving automation in autonomous vehicles necessitates a +robust semantic visual perception system capable of parsing data from different +sensors across diverse conditions. However, existing semantic perception +datasets often lack important non-camera modalities typically used in +autonomous vehicles, or they do not exploit such modalities to aid and improve +semantic annotations in challenging conditions. To address this, we introduce +MUSES, the MUlti-SEnsor Semantic perception dataset for driving in adverse +conditions under increased uncertainty. MUSES includes synchronized multimodal +recordings with 2D panoptic annotations for 2500 images captured under diverse +weather and illumination. The dataset integrates a frame camera, a lidar, a +radar, an event camera, and an IMU/GNSS sensor. Our new two-stage panoptic +annotation protocol captures both class-level and instance-level uncertainty in +the ground truth and enables the novel task of uncertainty-aware panoptic +segmentation we introduce, along with standard semantic and panoptic +segmentation. MUSES proves both effective for training and challenging for +evaluating models under diverse visual conditions, and it opens new avenues for +research in multimodal and uncertainty-aware dense semantic perception. Our +dataset and benchmark are publicly available at +https://muses.vision.ee.ethz.ch. + +
+
+ comment: Dataset available at http://muses.vision.ee.ethz.ch +
+
+
+
+
+ + ♻ ☆ Source Prompt Disentangled Inversion for Boosting Image Editability with + Diffusion Models + + +
+ Text-driven diffusion models have significantly advanced the image editing +performance by using text prompts as inputs. One crucial step in text-driven +image editing is to invert the original image into a latent noise code +conditioned on the source prompt. While previous methods have achieved +promising results by refactoring the image synthesizing process, the inverted +latent noise code is tightly coupled with the source prompt, limiting the image +editability by target text prompts. To address this issue, we propose a novel +method called Source Prompt Disentangled Inversion (SPDInv), which aims at +reducing the impact of source prompt, thereby enhancing the text-driven image +editing performance by employing diffusion models. To make the inverted noise +code be independent of the given source prompt as much as possible, we indicate +that the iterative inversion process should satisfy a fixed-point constraint. +Consequently, we transform the inversion problem into a searching problem to +find the fixed-point solution, and utilize the pre-trained diffusion models to +facilitate the searching process. The experimental results show that our +proposed SPDInv method can effectively mitigate the conflicts between the +target editing prompt and the source prompt, leading to a significant decrease +in editing artifacts. In addition to text-driven image editing, with SPDInv we +can easily adapt customized image generation models to localized editing tasks +and produce promising performance. The source code is available at +https://github.com/leeruibin/SPDInv. + +
+
+
+
+
+ + ♻ ☆ UDA4Inst: Unsupervised Domain Adaptation for Instance Segmentation + + +
+ Unsupervised Domain Adaptation (UDA) aims to transfer knowledge learned from +a labeled source domain to an unlabeled target domain. While UDA methods for +synthetic to real-world domains (synth-to-real) show remarkable performance in +tasks such as semantic segmentation and object detection, very few were +proposed for instance segmentation in the field of vision-based autonomous +driving, and the existing ones are based on a suboptimal baseline, which +severely limits the performance. In this paper, we introduce UDA4Inst, a strong +baseline of synth-to-real UDA for instance segmentation. UDA4Inst adopts +cross-domain bidirectional data mixing at the instance level to effectively +utilize data from both source and target domains. Rare-class balancing and +category module training are also employed to further improve the performance. +It is worth noting that we are the first to demonstrate results on two new +synth-to-real instance segmentation benchmarks, with 39.0 mAP on +UrbanSyn->Cityscapes and 35.7 mAP on Synscapes->Cityscapes. Our method +outperforms the source-only Mask2Former model by +7 mAP and +7.6 mAP, +respectively. On SYNTHIA->Cityscapes, our method improves the source-only +Mask2Former by +6.7 mAP, achieving state-of-the-art results. Our code will be +released soon. + +
+
+
+
+
+ + ♻ ☆ UltraCortex: Submillimeter Ultra-High Field 9.4 T1 Brain MR Image + Collection and Manual Cortical Segmentations + + +
+ The UltraCortex repository (https://www.ultracortex.org) houses magnetic +resonance imaging data of the human brain obtained at an ultra-high field +strength of 9.4 T. It contains 86 structural MR images with spatial resolutions +ranging from 0.6 to 0.8 mm. Additionally, the repository includes segmentations +of 12 brains into gray and white matter compartments. These segmentations have +been independently validated by two expert neuroradiologists, thus establishing +them as a reliable gold standard. This resource provides researchers with +access to high-quality brain imaging data and validated segmentations, +facilitating neuroimaging studies and advancing our understanding of brain +structure and function. Existing repositories do not accommodate field +strengths beyond 7 T, nor do they offer validated segmentations, underscoring +the significance of this new resource. + +
+
+
+
+
+ + ♻ ☆ Read Between the Layers: Leveraging Multi-Layer Representations for + Rehearsal-Free Continual Learning with Pre-Trained Models + + +
+ We address the Continual Learning (CL) problem, wherein a model must learn a +sequence of tasks from non-stationary distributions while preserving prior +knowledge upon encountering new experiences. With the advancement of foundation +models, CL research has pivoted from the initial learning-from-scratch paradigm +towards utilizing generic features from large-scale pre-training. However, +existing approaches to CL with pre-trained models primarily focus on separating +class-specific features from the final representation layer and neglect the +potential of intermediate representations to capture low- and mid-level +features, which are more invariant to domain shifts. In this work, we propose +LayUP, a new prototype-based approach to CL that leverages second-order feature +statistics from multiple intermediate layers of a pre-trained network. Our +method is conceptually simple, does not require access to prior data, and works +out of the box with any foundation model. LayUP surpasses the state of the art +in four of the seven class-incremental learning benchmarks, all three +domain-incremental learning benchmarks and in six of the seven online continual +learning benchmarks, while significantly reducing memory and computational +requirements compared to existing baselines. Our results demonstrate that fully +exhausting the representational capacities of pre-trained models in CL goes +well beyond their final embeddings. + +
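A rough sketch of gathering multi-layer representations from a frozen backbone and forming a second-order (Gram) statistic is shown below; the layer choice, pooling, and how LayUP turns these statistics into class prototypes are assumptions rather than the paper's implementation.

```python
import torch

def collect_layer_features(model, layers, images):
    """Grab intermediate activations from several layers of a frozen backbone with
    forward hooks and concatenate their pooled features (assumes (B, C, H, W)
    activations; a sketch of the multi-layer idea, not the LayUP prototype code)."""
    feats, handles = [], []
    for layer in layers:
        handles.append(layer.register_forward_hook(
            lambda _m, _inp, out: feats.append(out.flatten(2).mean(-1))))
    with torch.no_grad():
        model(images)
    for h in handles:
        h.remove()
    return torch.cat(feats, dim=1)            # (B, sum of layer widths)

def gram_statistics(features: torch.Tensor) -> torch.Tensor:
    """Second-order statistic over the concatenated multi-layer features."""
    return features.T @ features / features.shape[0]
```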
+
+ comment: Accepted for publication in Transactions of Machine Learning Research + (TMLR) journal +
+
+
+
+
+ + ♻ ☆ Multi-Modal Video Dialog State Tracking in the Wild ECCV 2024 + + +
+ We present MST-MIXER - a novel video dialog model operating over a generic +multi-modal state tracking scheme. Current models that claim to perform +multi-modal state tracking fall short of two major aspects: (1) They either +track only one modality (mostly the visual input) or (2) they target synthetic +datasets that do not reflect the complexity of real-world, in-the-wild +scenarios. Our model addresses these two limitations in an attempt to close +this crucial research gap. Specifically, MST-MIXER first tracks the most +important constituents of each input modality. Then, it predicts the missing +underlying structure of the selected constituents of each modality by learning +local latent graphs using a novel multi-modal graph structure learning method. +Subsequently, the learned local graphs and features are parsed together to form +a global graph operating on the mix of all modalities which further refines its +structure and node embeddings. Finally, the fine-grained graph node features +are used to enhance the hidden states of the backbone Vision-Language Model +(VLM). MST-MIXER achieves new state-of-the-art results on five challenging +benchmarks. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Learning Rate Curriculum + + +
+ Most curriculum learning methods require an approach to sort the data samples +by difficulty, which is often cumbersome to perform. In this work, we propose a +novel curriculum learning approach termed Learning Rate Curriculum (LeRaC), +which leverages the use of a different learning rate for each layer of a neural +network to create a data-agnostic curriculum during the initial training +epochs. More specifically, LeRaC assigns higher learning rates to neural layers +closer to the input, gradually decreasing the learning rates as the layers are +placed farther away from the input. The learning rates increase at various +paces during the first training iterations, until they all reach the same +value. From this point on, the neural model is trained as usual. This creates a +model-level curriculum learning strategy that does not require sorting the +examples by difficulty and is compatible with any neural network, generating +higher performance levels regardless of the architecture. We conduct +comprehensive experiments on 12 data sets from the computer vision (CIFAR-10, +CIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC), +language (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering +various convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5), +recurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare +our approach with the conventional training regime, as well as with Curriculum +by Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning +approach. Unlike CBS, our performance improvements over the standard training +regime are consistent across all data sets and models. Furthermore, we +significantly surpass CBS in terms of training time (there is no additional +cost over the standard training regime for LeRaC). Our code is freely available +at: https://github.com/CroitoruAlin/LeRaC. + +
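A minimal sketch of the layer-wise learning-rate idea, using one parameter group per top-level module and annealing all groups toward a common rate, is given below; the exact initial rates, pacing, and layer granularity used by LeRaC differ, so the numbers are placeholders.

```python
import torch

def build_lerac_optimizer(model, base_lr=1e-3, decay=0.9):
    """Assign a higher initial learning rate to layers closer to the input and
    progressively lower ones to deeper layers, one parameter group per top-level
    module (a sketch of the LeRaC idea, not the official schedule)."""
    groups = []
    for depth, (_name, module) in enumerate(model.named_children()):
        params = list(module.parameters())
        if params:
            groups.append({"params": params, "lr": base_lr * (decay ** depth)})
    return torch.optim.SGD(groups, lr=base_lr, momentum=0.9)

def anneal_to_common_lr(optimizer, step, warmup_steps, target_lr=1e-3):
    """Linearly raise every group's learning rate toward the shared target during the
    first `warmup_steps` iterations; afterwards all groups train at the same rate."""
    t = min(step / warmup_steps, 1.0)
    for group in optimizer.param_groups:
        group.setdefault("init_lr", group["lr"])   # remember the initial per-group rate
        group["lr"] = group["init_lr"] + t * (target_lr - group["init_lr"])
```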
+
+ comment: Accepted at the International Journal of Computer Vision +
+
+
+
+
+ + ♻ ☆ Interpretable Representation Learning of Cardiac MRI via Attribute + Regularization + + +
+ Interpretability is essential in medical imaging to ensure that clinicians +can comprehend and trust artificial intelligence models. Several approaches +have been recently considered to encode attributes in the latent space to +enhance its interpretability. Notably, attribute regularization aims to encode +a set of attributes along the dimensions of a latent representation. However, +this approach is based on Variational AutoEncoder and suffers from blurry +reconstruction. In this paper, we propose an Attributed-regularized Soft +Introspective Variational Autoencoder that combines attribute regularization of +the latent space within the framework of an adversarially trained variational +autoencoder. We demonstrate on short-axis cardiac Magnetic Resonance images of +the UK Biobank the ability of the proposed method to address blurry +reconstruction issues of variational autoencoder methods while preserving the +latent space interpretability. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.08915 +
+
+
+
+
+ + ♻ ☆ FUTURE-AI: International consensus guideline for trustworthy and + deployable artificial intelligence in healthcare + + +
+ Despite major advances in artificial intelligence (AI) for medicine and +healthcare, the deployment and adoption of AI technologies remain limited in +real-world clinical practice. In recent years, concerns have been raised about +the technical, clinical, ethical and legal risks associated with medical AI. To +increase real world adoption, it is essential that medical AI tools are trusted +and accepted by patients, clinicians, health organisations and authorities. +This work describes the FUTURE-AI guideline as the first international +consensus framework for guiding the development and deployment of trustworthy +AI tools in healthcare. The FUTURE-AI consortium was founded in 2021 and +currently comprises 118 inter-disciplinary experts from 51 countries +representing all continents, including AI scientists, clinicians, ethicists, +and social scientists. Over a two-year period, the consortium defined guiding +principles and best practices for trustworthy AI through an iterative process +comprising an in-depth literature review, a modified Delphi survey, and online +consensus meetings. The FUTURE-AI framework was established based on 6 guiding +principles for trustworthy AI in healthcare, i.e. Fairness, Universality, +Traceability, Usability, Robustness and Explainability. Through consensus, a +set of 28 best practices were defined, addressing technical, clinical, legal +and socio-ethical dimensions. The recommendations cover the entire lifecycle of +medical AI, from design, development and validation to regulation, deployment, +and monitoring. FUTURE-AI is a risk-informed, assumption-free guideline which +provides a structured approach for constructing medical AI tools that will be +trusted, deployed and adopted in real-world practice. Researchers are +encouraged to take the recommendations into account in proof-of-concept stages +to facilitate future translation towards clinical practice of medical AI. + +
+
+
+
+
+ + ♻ ☆ Multi-Task Domain Adaptation for Language Grounding with 3D Objects + + +
+ Existing works on object-level language grounding with 3D objects mostly
+focus on improving performance by utilizing off-the-shelf pre-trained models to
+capture features, such as viewpoint selection or geometric priors. However,
+they do not explore cross-modal representations for language-vision alignment
+in the cross-domain setting. To address this problem, we propose a novel method
+called Domain Adaptation for Language Grounding (DA4LG) with 3D objects.
+Specifically, the proposed DA4LG consists of a visual adapter module with
+multi-task learning that realizes vision-language alignment through
+comprehensive multimodal feature representation. Experimental results
+demonstrate that DA4LG performs competitively across visual and non-visual
+language descriptions, independent of the completeness of observation. DA4LG
+achieves state-of-the-art performance in the single-view and multi-view
+settings, with accuracies of 83.8% and 86.8% respectively, on the language
+grounding benchmark SNARE. Simulation experiments show the practical and
+generalizable performance of DA4LG compared to existing methods. Our project is
+available at https://sites.google.com/view/da4lg.
+
+
+
+
+
+ + ♻ ☆ Explicit Abnormality Extraction for Unsupervised Motion Artifact + Reduction in Magnetic Resonance Imaging + + +
+ Motion artifacts compromise the quality of magnetic resonance imaging (MRI)
+and pose challenges to achieving diagnostic outcomes and image-guided
+therapies. In recent years, supervised deep learning approaches have emerged as
+successful solutions for motion artifact reduction (MAR). One disadvantage of
+these methods is their dependency on acquiring paired sets of motion
+artifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images
+for training purposes. Obtaining such image pairs is difficult and therefore
+limits the application of supervised training. In this paper, we propose a
+novel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this
+problem. Our network is capable of working with unpaired MA-corrupted and
+MA-free images. It converts the MA-corrupted images to MA-reduced images by
+extracting abnormalities from the MA-corrupted images using a proposed artifact
+extractor, which explicitly intercepts the residual artifact maps from the
+MA-corrupted MR images, and a reconstructor to restore the original input from
+the MA-reduced images. The performance of UNAEN was assessed by experimenting
+with various publicly available MRI datasets and comparing it with
+state-of-the-art methods. The quantitative evaluation demonstrates the
+superiority of UNAEN over alternative MAR methods, and the visual results
+exhibit fewer residual artifacts. Our results substantiate the potential of
+UNAEN as a promising solution applicable in real-world clinical environments,
+with the capability to enhance diagnostic accuracy and facilitate image-guided
+therapies. Our code is publicly available at
+https://github.com/YuSheng-Zhou/UNAEN.
+
+
+
+
+
+ + ♻ ☆ MRPD: Undersampled MRI reconstruction by prompting a large latent + diffusion model + + +
+ Implicit visual knowledge in a large latent diffusion model (LLDM) +pre-trained on natural images is rich and hypothetically universal to natural +and medical images. To test this hypothesis from a practical perspective, we +propose a novel framework for undersampled MRI Reconstruction by Prompting a +large latent Diffusion model (MRPD). While the existing methods trained on MRI +datasets are typically of limited generalizability toward diverse data +acquisition scenarios, MRPD supports unsupervised and universally adaptive MRI +reconstruction. For unsupervised reconstruction, MRSampler guides LLDM with a +random-phase-modulated hard-to-soft control. With any single- or +multiple-source MRI dataset, MRPD's performance is boosted universally by a +lightweight MRAdapter that only finetunes the LLDM's autoencoder. Experiments +on FastMRI and IXI show that MRPD is the only model that supports both MRI +database-free and database-available scenarios and attains the best +generalizability towards out-of-domain (OOD) samplings, contrasts, and organs +among compared unsupervised, supervised, and MRI diffusion methods. To our +knowledge, MRPD is the first method that empirically shows the universal +prowess of an LLDM pre-trained on vast natural images for MRI. Our official +implementation is at https://github.com/Z7Gao/MRPD. + +
+
+ comment: 10 pages, 5 figures, 7 tables, 1 pseudocode +
+
+
+
+
+ + ♻ ☆ Latent Fingerprint Matching via Dense Minutia Descriptor + + +
+ Latent fingerprint matching is a daunting task, primarily due to the poor +quality of latent fingerprints. In this study, we propose a deep-learning based +dense minutia descriptor (DMD) for latent fingerprint matching. A DMD is +obtained by extracting the fingerprint patch aligned by its central minutia, +capturing detailed minutia information and texture information. Our dense +descriptor takes the form of a three-dimensional representation, with two +dimensions associated with the original image plane and the other dimension +representing the abstract features. Additionally, the extraction process +outputs the fingerprint segmentation map, ensuring that the descriptor is only +valid in the foreground region. The matching between two descriptors occurs in +their overlapping regions, with a score normalization strategy to reduce the +impact brought by the differences outside the valid area. Our descriptor +achieves state-of-the-art performance on several latent fingerprint datasets. +Overall, our DMD is more representative and interpretable compared to previous +methods. + +
+
+ comment: accepted by IJCB 2024 +
+
+
+
+
+ + ♻ ☆ MT-HCCAR: Multi-Task Deep Learning with Hierarchical Classification and + Attention-based Regression for Cloud Property Retrieval ECML + + +
+ In the realm of Earth science, effective cloud property retrieval, +encompassing cloud masking, cloud phase classification, and cloud optical +thickness (COT) prediction, remains pivotal. Traditional methodologies +necessitate distinct models for each sensor instrument due to their unique +spectral characteristics. Recent strides in Earth Science research have +embraced machine learning and deep learning techniques to extract features from +satellite datasets' spectral observations. However, prevailing approaches lack +novel architectures accounting for hierarchical relationships among retrieval +tasks. Moreover, considering the spectral diversity among existing sensors, the +development of models with robust generalization capabilities over different +sensor datasets is imperative. Surprisingly, there is a dearth of methodologies +addressing the selection of an optimal model for diverse datasets. In response, +this paper introduces MT-HCCAR, an end-to-end deep learning model employing +multi-task learning to simultaneously tackle cloud masking, cloud phase +retrieval (classification tasks), and COT prediction (a regression task). The +MT-HCCAR integrates a hierarchical classification network (HC) and a +classification-assisted attention-based regression network (CAR), enhancing +precision and robustness in cloud labeling and COT prediction. Additionally, a +comprehensive model selection method rooted in K-fold cross-validation, one +standard error rule, and two introduced performance scores is proposed to +select the optimal model over three simulated satellite datasets OCI, VIIRS, +and ABI. The experiments comparing MT-HCCAR with baseline methods, the ablation +studies, and the model selection affirm the superiority and the generalization +capabilities of MT-HCCAR. + +
+
+ comment: 14 pages, 3 figures, accepted by ECML PKDD 2024 +
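The model-selection ingredients named above (K-fold cross-validation combined with the one-standard-error rule) can be illustrated with a small, generic sketch; the two performance scores introduced in the paper are not reproduced here, and the function below simply assumes models are listed from simplest to most complex.

```python
import numpy as np

def one_standard_error_rule(cv_errors):
    """cv_errors: dict mapping model name -> array of per-fold errors (lower is
    better), assumed ordered from simplest to most complex model. Returns the
    simplest model whose mean error is within one standard error of the best."""
    means = {m: float(np.mean(e)) for m, e in cv_errors.items()}
    best = min(means, key=means.get)
    folds = np.asarray(cv_errors[best], dtype=float)
    se = folds.std(ddof=1) / np.sqrt(len(folds))
    threshold = means[best] + se
    for model in cv_errors:                 # iterate simplest -> most complex
        if means[model] <= threshold:
            return model
```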
+
+
+
+
+ + ♻ ☆ Shedding the Bits: Pushing the Boundaries of Quantization with + Minifloats on FPGAs + + +
+ Post-training quantization (PTQ) is a powerful technique for model
+compression, reducing the numerical precision in neural networks without
+additional training overhead. Recent works have investigated adopting 8-bit
+floating-point formats (FP8) in the context of PTQ for model inference.
+However, floating-point formats smaller than 8 bits, and their comparison with
+integers in terms of accuracy-hardware cost, remain unexplored on FPGAs. In
+this work, we present minifloats, which are reduced-precision floating-point
+formats capable of further reducing the memory footprint, latency, and energy
+cost of a model while approaching full-precision model accuracy. We implement a
+custom FPGA-based multiply-accumulate operator library and explore the vast
+design space, comparing minifloat and integer representations across 3 to 8
+bits for both weights and activations. We also examine the applicability of
+various integer-based quantization techniques to minifloats. Our experiments
+show that minifloats offer a promising alternative for emerging workloads such
+as vision transformers.
+
+
+ comment: Accepted in FPL (International Conference on Field-Programmable Logic + and Applications) 2024 conference. Revised with updated results +
+
+
+
+
+ + ♻ ☆ Rejuvenating image-GPT as Strong Visual Representation Learners ICML2024 + + +
+ This paper enhances image-GPT (iGPT), one of the pioneering works that +introduce autoregressive pretraining to predict the next pixels for visual +representation learning. Two simple yet essential changes are made. First, we +shift the prediction target from raw pixels to semantic tokens, enabling a +higher-level understanding of visual content. Second, we supplement the +autoregressive modeling by instructing the model to predict not only the next +tokens but also the visible tokens. This pipeline is particularly effective +when semantic tokens are encoded by discriminatively trained models, such as +CLIP. We introduce this novel approach as D-iGPT. Extensive experiments +showcase that D-iGPT excels as a strong learner of visual representations: A +notable achievement is its compelling performance on the ImageNet-1K dataset -- +by training on publicly available datasets, D-iGPT unprecedentedly achieves +\textbf{90.0\%} top-1 accuracy with a vanilla ViT-H. Additionally, D-iGPT shows +strong generalization on the downstream task. Code is available at +https://github.com/OliverRensu/D-iGPT. + +
+
+ comment: This paper is accepted by ICML2024 +
+
+
+
+
+ + ♻ ☆ Towards Multimodal Sentiment Analysis Debiasing via Bias Purification ECCV 2024 + + +
+ Multimodal Sentiment Analysis (MSA) aims to understand human intentions by +integrating emotion-related clues from diverse modalities, such as visual, +language, and audio. Unfortunately, the current MSA task invariably suffers +from unplanned dataset biases, particularly multimodal utterance-level label +bias and word-level context bias. These harmful biases potentially mislead +models to focus on statistical shortcuts and spurious correlations, causing +severe performance bottlenecks. To alleviate these issues, we present a +Multimodal Counterfactual Inference Sentiment (MCIS) analysis framework based +on causality rather than conventional likelihood. Concretely, we first +formulate a causal graph to discover harmful biases from already-trained +vanilla models. In the inference phase, given a factual multimodal input, MCIS +imagines two counterfactual scenarios to purify and mitigate these biases. +Then, MCIS can make unbiased decisions from biased observations by comparing +factual and counterfactual outcomes. We conduct extensive experiments on +several standard MSA benchmarks. Qualitative and quantitative results show the +effectiveness of the proposed framework. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ TalkingGaussian: Structure-Persistent 3D Talking Head Synthesis via + Gaussian Splatting ECCV 2024 + + +
+ Radiance fields have demonstrated impressive performance in synthesizing +lifelike 3D talking heads. However, due to the difficulty in fitting steep +appearance changes, the prevailing paradigm that presents facial motions by +directly modifying point appearance may lead to distortions in dynamic regions. +To tackle this challenge, we introduce TalkingGaussian, a deformation-based +radiance fields framework for high-fidelity talking head synthesis. Leveraging +the point-based Gaussian Splatting, facial motions can be represented in our +method by applying smooth and continuous deformations to persistent Gaussian +primitives, without requiring to learn the difficult appearance change like +previous methods. Due to this simplification, precise facial motions can be +synthesized while keeping a highly intact facial feature. Under such a +deformation paradigm, we further identify a face-mouth motion inconsistency +that would affect the learning of detailed speaking motions. To address this +conflict, we decompose the model into two branches separately for the face and +inside mouth areas, therefore simplifying the learning tasks to help +reconstruct more accurate motion and structure of the mouth region. Extensive +experiments demonstrate that our method renders high-quality lip-synchronized +talking head videos, with better facial fidelity and higher efficiency compared +with previous methods. + +
+
+ comment: Accepted at ECCV 2024. Project page: + https://fictionarry.github.io/TalkingGaussian/ +
+
+
+
+
+ + ♻ ☆ Parameter Efficient Fine-tuning of Self-supervised ViTs without + Catastrophic Forgetting CVPR + + +
+ Artificial neural networks often suffer from catastrophic forgetting, where
+learning new concepts leads to a complete loss of previously acquired
+knowledge. We observe that this issue is particularly magnified in vision
+transformers (ViTs), where post-pre-training and fine-tuning on new tasks can
+significantly degrade the model's original general abilities. For instance, a
+DINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on
+ImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming
+this stability-plasticity dilemma is crucial for enabling ViTs to continuously
+learn and adapt to new domains while preserving their initial knowledge. In
+this work, we study two new parameter-efficient fine-tuning strategies:
+(1) Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal
+that using either Block Expansion or LoRA on self-supervised pre-trained ViTs
+surpasses fully fine-tuned ViTs in new domains while offering significantly
+greater parameter efficiency. Notably, we find that Block Expansion experiences
+only a minimal performance drop in the pre-training domain, thereby effectively
+mitigating catastrophic forgetting in pre-trained ViTs.
+
+
+ comment: Accepted at eLVM Workshop, CVPR, 2024 +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Graph Enhanced DETR Towards Multi-Frame 3D Object + Detection + + +
+ The Detection Transformer (DETR) has revolutionized the design of CNN-based
+object detection systems, showcasing impressive performance. However, its
+potential in the domain of multi-frame 3D object detection remains largely
+unexplored. In this paper, we present STEMD, a novel end-to-end framework that
+enhances the DETR-like paradigm for multi-frame 3D object detection by
+addressing three key aspects specifically tailored for this task. First, to
+model the inter-object spatial interaction and complex temporal dependencies,
+we introduce the spatial-temporal graph attention network, which represents
+queries as nodes in a graph and enables effective modeling of object
+interactions within a social context. Second, to solve the problem of hard
+cases missing from the encoder's output in the current frame, we incorporate
+the output of the previous frame to initialize the query input of the decoder.
+Finally, the network struggles to distinguish the positive query from other
+highly similar queries that are not the best match; such similar queries are
+insufficiently suppressed and turn into redundant prediction boxes. To address
+this issue, our proposed IoU regularization term encourages similar queries to
+be distinct during the refinement. Through extensive experiments, we
+demonstrate the effectiveness of our approach in handling challenging
+scenarios, while incurring only a minor additional computational overhead. The
+code is publicly available at https://github.com/Eaphan/STEMD.
+
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Identification of Novel Modes in Generative Models via Fourier-based + Differential Clustering + + +
+ An interpretable comparison of generative models requires the identification +of sample types produced more frequently by each of the involved models. While +several quantitative scores have been proposed in the literature to rank +different generative models, such score-based evaluations do not reveal the +nuanced differences between the generative models in capturing various sample +types. In this work, we attempt to solve a differential clustering problem to +detect sample types expressed differently by two generative models. To solve +the differential clustering problem, we propose a method called Fourier-based +Identification of Novel Clusters (FINC) to identify modes produced by a +generative model with a higher frequency in comparison to a reference +distribution. FINC provides a scalable stochastic algorithm based on random +Fourier features to estimate the eigenspace of kernel covariance matrices of +two generative models and utilize the principal eigendirections to detect the +sample types present more dominantly in each model. We demonstrate the +application of the FINC method to large-scale computer vision datasets and +generative model frameworks. Our numerical results suggest the scalability of +the developed Fourier-based method in highlighting the sample types produced +with different frequencies by widely-used generative models. Code is available +at \url{https://github.com/buyeah1109/FINC} + +
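The random-Fourier-feature machinery that FINC builds on can be sketched generically: map samples from each generative model through the same random feature map, and compare the eigen-structure of the two feature covariance matrices. The kernel bandwidth, feature count, and the plain eigendecomposition below are illustrative assumptions rather than the authors' implementation.

```python
import numpy as np

def random_fourier_features(X, n_features=2000, sigma=1.0, seed=0):
    """Feature map phi(x) whose inner products approximate a Gaussian kernel."""
    rng = np.random.default_rng(seed)
    W = rng.normal(scale=1.0 / sigma, size=(X.shape[1], n_features))
    b = rng.uniform(0.0, 2.0 * np.pi, size=n_features)
    return np.sqrt(2.0 / n_features) * np.cos(X @ W + b)

def differential_directions(X_model, X_reference, top=5, **kw):
    """Eigen-directions of the difference between the two kernel covariance
    matrices; large positive eigenvalues point to sample modes expressed more
    strongly by the first model than by the reference."""
    Pm = random_fourier_features(X_model, **kw)
    Pr = random_fourier_features(X_reference, **kw)
    diff = Pm.T @ Pm / len(Pm) - Pr.T @ Pr / len(Pr)
    vals, vecs = np.linalg.eigh(diff)
    order = np.argsort(vals)[::-1][:top]
    return vals[order], vecs[:, order]
```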
+
+
+
+
+ + ♻ ☆ 360 in the Wild: Dataset for Depth Prediction and View Synthesis + + +
+ The large abundance of perspective camera datasets facilitated the emergence +of novel learning-based strategies for various tasks, such as camera +localization, single image depth estimation, or view synthesis. However, +panoramic or omnidirectional image datasets, including essential information, +such as pose and depth, are mostly made with synthetic scenes. In this work, we +introduce a large scale 360$^{\circ}$ videos dataset in the wild. This dataset +has been carefully scraped from the Internet and has been captured from various +locations worldwide. Hence, this dataset exhibits very diversified environments +(e.g., indoor and outdoor) and contexts (e.g., with and without moving +objects). Each of the 25K images constituting our dataset is provided with its +respective camera's pose and depth map. We illustrate the relevance of our +dataset for two main tasks, namely, single image depth estimation and view +synthesis. + +
+
+
+
+
+ + ♻ ☆ Attention-Challenging Multiple Instance Learning for Whole Slide Image + Classification ECCV2024 + + +
+ In the application of Multiple Instance Learning (MIL) methods for Whole +Slide Image (WSI) classification, attention mechanisms often focus on a subset +of discriminative instances, which are closely linked to overfitting. To +mitigate overfitting, we present Attention-Challenging MIL (ACMIL). ACMIL +combines two techniques based on separate analyses for attention value +concentration. Firstly, UMAP of instance features reveals various patterns +among discriminative instances, with existing attention mechanisms capturing +only some of them. To remedy this, we introduce Multiple Branch Attention (MBA) +to capture more discriminative instances using multiple attention branches. +Secondly, the examination of the cumulative value of Top-K attention scores +indicates that a tiny number of instances dominate the majority of attention. +In response, we present Stochastic Top-K Instance Masking (STKIM), which masks +out a portion of instances with Top-K attention values and allocates their +attention values to the remaining instances. The extensive experimental results +on three WSI datasets with two pre-trained backbones reveal that our ACMIL +outperforms state-of-the-art methods. Additionally, through heatmap +visualization and UMAP visualization, this paper extensively illustrates +ACMIL's effectiveness in suppressing attention value concentration and +overcoming the overfitting challenge. The source code is available at +\url{https://github.com/dazhangyu123/ACMIL}. + +
+
+ comment: Accepted by ECCV2024 +
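Stochastic Top-K Instance Masking, as summarised in the abstract above, can be illustrated in a few lines: some of the Top-K attention values are randomly zeroed and the remaining scores are renormalised so that the released attention mass flows to other instances. The masking probability and the renormalisation below are a hedged reading of the description, not the official ACMIL code.

```python
import torch

def stochastic_topk_instance_masking(attn, k=10, mask_prob=0.5):
    """attn: (n_instances,) non-negative attention scores summing to one.
    Randomly mask a subset of the Top-K instances and renormalise."""
    k = min(k, attn.numel())
    topk_idx = torch.topk(attn, k).indices
    drop = topk_idx[torch.rand(k, device=attn.device) < mask_prob]
    masked = attn.clone()
    masked[drop] = 0.0
    return masked / masked.sum().clamp_min(1e-12)
```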
+
+
+
+
+ + ♻ ☆ HeartBeat: Towards Controllable Echocardiography Video Synthesis with + Multimodal Conditions-Guided Diffusion Models MICCAI 2024 + + +
+ Echocardiography (ECHO) video is widely used for cardiac examination. In
+clinical practice, this procedure heavily relies on operator experience, which
+requires years of training and may benefit from the assistance of deep
+learning-based systems for enhanced accuracy and efficiency. However, this is
+challenging since acquiring sufficient customized data (e.g., abnormal cases)
+for novice training and deep model development is clinically unrealistic.
+Hence, controllable ECHO video synthesis is highly desirable. In this paper, we
+propose a novel diffusion-based framework named HeartBeat towards controllable
+and high-fidelity ECHO video synthesis. Our highlight is three-fold. First,
+HeartBeat serves as a unified framework that enables perceiving multimodal
+conditions simultaneously to guide controllable generation. Second, we
+factorize the multimodal conditions into local and global ones, with two
+insertion strategies that separately provide fine- and coarse-grained controls
+in a composable and flexible manner. In this way, users can synthesize ECHO
+videos that conform to their mental imagery by combining multimodal control
+signals. Third, we propose to decouple the visual concepts and temporal
+dynamics learning using a two-stage training scheme to simplify the model
+training. Notably, HeartBeat can easily generalize to mask-guided cardiac MRI
+synthesis in a few shots, showcasing its scalability to broader applications.
+Extensive experiments on two public datasets show the efficacy of the proposed
+HeartBeat.
+
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ ASY-VRNet: Waterway Panoptic Driving Perception Model based on + Asymmetric Fair Fusion of Vision and 4D mmWave Radar IROS 2024 + + +
+ Panoptic Driving Perception (PDP) is critical for the autonomous navigation +of Unmanned Surface Vehicles (USVs). A PDP model typically integrates multiple +tasks, necessitating the simultaneous and robust execution of various +perception tasks to facilitate downstream path planning. The fusion of visual +and radar sensors is currently acknowledged as a robust and cost-effective +approach. However, most existing research has primarily focused on fusing +visual and radar features dedicated to object detection or utilizing a shared +feature space for multiple tasks, neglecting the individual representation +differences between various tasks. To address this gap, we propose a pair of +Asymmetric Fair Fusion (AFF) modules with favorable explainability designed to +efficiently interact with independent features from both visual and radar +modalities, tailored to the specific requirements of object detection and +semantic segmentation tasks. The AFF modules treat image and radar maps as +irregular point sets and transform these features into a crossed-shared feature +space for multitasking, ensuring equitable treatment of vision and radar point +cloud features. Leveraging AFF modules, we propose a novel and efficient PDP +model, ASY-VRNet, which processes image and radar features based on irregular +super-pixel point sets. Additionally, we propose an effective multitask +learning method specifically designed for PDP models. Compared to other +lightweight models, ASY-VRNet achieves state-of-the-art performance in object +detection, semantic segmentation, and drivable-area segmentation on the +WaterScenes benchmark. Our project is publicly available at +https://github.com/GuanRunwei/ASY-VRNet. + +
+
+ comment: Accepted by IROS 2024 +
+
+
+
+
+ + ♻ ☆ Instance-dependent Noisy-label Learning with Graphical Model Based + Noise-rate Estimation ECCV 2024 + + +
+ Deep learning faces a formidable challenge when handling noisy labels, as +models tend to overfit samples affected by label noise. This challenge is +further compounded by the presence of instance-dependent noise (IDN), a +realistic form of label noise arising from ambiguous sample information. To +address IDN, Label Noise Learning (LNL) incorporates a sample selection stage +to differentiate clean and noisy-label samples. This stage uses an arbitrary +criterion and a pre-defined curriculum that initially selects most samples as +noisy and gradually decreases this selection rate during training. Such +curriculum is sub-optimal since it does not consider the actual label noise +rate in the training set. This paper addresses this issue with a new noise-rate +estimation method that is easily integrated with most state-of-the-art (SOTA) +LNL methods to produce a more effective curriculum. Synthetic and real-world +benchmark results demonstrate that integrating our approach with SOTA LNL +methods improves accuracy in most cases. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Minimalist and High-Quality Panoramic Imaging with PSF-aware + Transformers + + +
+ High-quality panoramic images with a Field of View (FoV) of 360{\deg} are +essential for contemporary panoramic computer vision tasks. However, +conventional imaging systems come with sophisticated lens designs and heavy +optical components. This disqualifies their usage in many mobile and wearable +applications where thin and portable, minimalist imaging systems are desired. +In this paper, we propose a Panoramic Computational Imaging Engine (PCIE) to +achieve minimalist and high-quality panoramic imaging. With less than three +spherical lenses, a Minimalist Panoramic Imaging Prototype (MPIP) is +constructed based on the design of the Panoramic Annular Lens (PAL), but with +low-quality imaging results due to aberrations and small image plane size. We +propose two pipelines, i.e. Aberration Correction (AC) and Super-Resolution and +Aberration Correction (SR&AC), to solve the image quality problems of MPIP, +with imaging sensors of small and large pixel size, respectively. To leverage +the prior information of the optical system, we propose a Point Spread Function +(PSF) representation method to produce a PSF map as an additional modality. A +PSF-aware Aberration-image Recovery Transformer (PART) is designed as a +universal network for the two pipelines, in which the self-attention +calculation and feature extraction are guided by the PSF map. We train PART on +synthetic image pairs from simulation and put forward the PALHQ dataset to fill +the gap of real-world high-quality PAL images for low-level vision. A +comprehensive variety of experiments on synthetic and real-world benchmarks +demonstrates the impressive imaging results of PCIE and the effectiveness of +the PSF representation. We further deliver heuristic experimental findings for +minimalist and high-quality panoramic imaging. Our dataset and code will be +available at https://github.com/zju-jiangqi/PCIE-PART. + +
+
+ comment: Accepted to IEEE Transactions on Image Processing (TIP). The dataset + and code will be available at https://github.com/zju-jiangqi/PCIE-PART +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Optimizing Nepali PDF Extraction: A Comparative Study of Parser and OCR + Technologies + + +
+ This research compares PDF parsing and Optical Character Recognition (OCR) +methods for extracting Nepali content from PDFs. PDF parsing offers fast and +accurate extraction but faces challenges with non-Unicode Nepali fonts. OCR, +specifically PyTesseract, overcomes these challenges, providing versatility for +both digital and scanned PDFs. The study reveals that while PDF parsers are +faster, their accuracy fluctuates based on PDF types. In contrast, OCRs, with a +focus on PyTesseract, demonstrate consistent accuracy at the expense of +slightly longer extraction times. Considering the project's emphasis on Nepali +PDFs, PyTesseract emerges as the most suitable library, balancing extraction +speed and accuracy. + +
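For reference, the OCR route described above typically looks like the sketch below: rasterise each PDF page and pass it to Tesseract with the Nepali language pack. The helper name and DPI are illustrative; running this requires the tesseract-ocr binary, its 'nep' traineddata, and the pdf2image and pytesseract packages.

```python
from pdf2image import convert_from_path
import pytesseract

def ocr_nepali_pdf(path, dpi=300):
    """Rasterise each page of the PDF and run Tesseract OCR with the Nepali model."""
    pages = convert_from_path(path, dpi=dpi)
    return "\n".join(pytesseract.image_to_string(page, lang="nep") for page in pages)
```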
+
+
+
+
+ + ☆ VRSD: Rethinking Similarity and Diversity for Retrieval in Large + Language Models + + +
+ Vector retrieval algorithms are vital for semantic queries in the evolving
+landscape of Large Language Models (LLMs). Retrieving vectors that
+simultaneously meet criteria for both similarity and diversity significantly
+enhances the capabilities of LLM-based agents. Despite the widespread use of
+the Maximal Marginal Relevance (MMR) in retrieval scenarios with relevance and
+diversity requirements, fluctuations caused by variations in the parameter
+$\lambda$ within the MMR complicate the determination of the optimization
+trajectory in vector spaces, thus obscuring the direction of enhancement.
+Moreover, there is a lack of a robust theoretical analysis for the constraints
+of similarity and diversity in retrieval processes. This paper introduces a
+novel approach to characterizing both constraints through the relationship
+between the sum vector and the query vector. The proximity of these vectors
+addresses the similarity constraint, while necessitating that individual
+vectors within the sum vector divergently align with the query vector to
+satisfy the diversity constraint. We also formulate a new combinatorial
+optimization challenge, selecting $k$ vectors from a set of candidates such
+that their sum vector maximally aligns with the query vector, a problem we
+demonstrate to be NP-complete. This establishes the profound difficulty of
+pursuing similarity and diversity simultaneously in vector retrieval and lays a
+theoretical groundwork for further research. Additionally, we present the
+heuristic algorithm Vectors Retrieval with Similarity and Diversity (VRSD),
+which not only has a definitive optimization goal and eschews the need for
+preset parameters but also offers a modest reduction in time complexity
+compared to MMR. Empirical validation further confirms that VRSD significantly
+surpasses MMR across various datasets.
+
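The combinatorial problem stated above, choosing $k$ vectors whose sum aligns best with the query, admits a simple greedy heuristic, sketched below for intuition. This greedy routine is an illustrative baseline and is not claimed to be the VRSD algorithm itself.

```python
import numpy as np

def greedy_sum_alignment(query, candidates, k):
    """Greedily add the candidate that most increases the cosine similarity
    between the running sum vector and the query."""
    total = np.zeros_like(query, dtype=float)
    remaining = list(range(len(candidates)))
    chosen = []
    for _ in range(k):
        def gain(i):
            s = total + candidates[i]
            return float(s @ query) / (np.linalg.norm(s) * np.linalg.norm(query) + 1e-12)
        best = max(remaining, key=gain)
        chosen.append(best)
        total = total + candidates[best]
        remaining.remove(best)
    return chosen
```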
+
+
+
+
+ + ☆ GPT vs RETRO: Exploring the Intersection of Retrieval and + Parameter-Efficient Fine-Tuning + + +
+ Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation
+(RAG) have become popular methods for adapting large language models while
+minimizing compute requirements. In this paper, we apply PEFT methods
+(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer
+(RETRO) and a baseline GPT model across several sizes, ranging from 823 million
+to 48 billion parameters. We show that RETRO models outperform GPT models in
+zero-shot settings due to their unique pre-training process, but GPT models
+have higher performance potential with PEFT. Additionally, our study indicates
+that 8B-parameter models strike an optimal balance between cost and performance
+and that P-tuning lags behind the other PEFT techniques. We further provide a
+comparative analysis between applying PEFT to an instruction-tuned RETRO model
+and to a base RETRO model. This work presents the first comprehensive
+comparison of various PEFT methods integrated with RAG, applied to both GPT and
+RETRO models, highlighting their relative performance.
+
+
+
+
+
+ + ☆ EventChat: Implementation and user-centric evaluation of a large + language model-driven conversational recommender system for exploring leisure + events in an SME context + + +
+ Large language models (LLMs) present an enormous evolution in the strategic
+potential of conversational recommender systems (CRS). Yet to date, research
+has predominantly focused on technical frameworks to implement LLM-driven CRS,
+rather than end-user evaluations or strategic implications for firms,
+particularly from the perspective of small and medium-sized enterprises (SMEs),
+which make up the bedrock of the global economy. In the current paper, we
+detail the design of an LLM-driven CRS in an SME setting, and its subsequent
+performance in the field using both objective system metrics and subjective
+user evaluations. While doing so, we additionally outline a short-form revised
+ResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly
+evolving field. Our results reveal good system performance from a user
+experience perspective (85.5% recommendation accuracy) but underscore latency,
+cost, and quality issues challenging business viability. Notably, with a median
+cost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and
+response time emerge as crucial areas for achieving a more user-friendly and
+economically viable LLM-driven CRS for SME settings. One major driver of these
+costs is the use of an advanced LLM as a ranker within the retrieval-augmented
+generation (RAG) technique. Our results additionally indicate that relying
+solely on approaches such as prompt-based learning with ChatGPT as the
+underlying LLM makes it challenging to achieve satisfying quality in a
+production environment. Strategic considerations for SMEs deploying an
+LLM-driven CRS are outlined, particularly considering trade-offs in the current
+technical landscape.
+
+
+ comment: 27 pages, 3 tables, 5 figures, pre-print manuscript +
+
+
+
+
+ + ☆ An Interactive Multi-modal Query Answering System with + Retrieval-Augmented Large Language Models VLDB 2024 + + +
+ Retrieval-augmented Large Language Models (LLMs) have reshaped traditional +query-answering systems, offering unparalleled user experiences. However, +existing retrieval techniques often struggle to handle multi-modal query +contexts. In this paper, we present an interactive Multi-modal Query Answering +(MQA) system, empowered by our newly developed multi-modal retrieval framework +and navigation graph index, integrated with cutting-edge LLMs. It comprises +five core components: Data Preprocessing, Vector Representation, Index +Construction, Query Execution, and Answer Generation, all orchestrated by a +dedicated coordinator to ensure smooth data flow from input to answer +generation. One notable aspect of MQA is its utilization of contrastive +learning to assess the significance of different modalities, facilitating +precise measurement of multi-modal information similarity. Furthermore, the +system achieves efficient retrieval through our advanced navigation graph +index, refined using computational pruning techniques. Another highlight of our +system is its pluggable processing framework, allowing seamless integration of +embedding models, graph indexes, and LLMs. This flexibility provides users +diverse options for gaining insights from their multi-modal knowledge base. A +preliminary video introduction of MQA is available at +https://youtu.be/xvUuo2ZIqWk. + +
+
+ comment: This demo paper has been accepted by VLDB 2024 +
+
+
+
+
+ + ♻ ☆ Simple Domain Adaptation for Sparse Retrievers ECIR 2024 + + +
+ In Information Retrieval, and more generally in Natural Language Processing, +adapting models to specific domains is conducted through fine-tuning. Despite +the successes achieved by this method and its versatility, the need for +human-curated and labeled data makes it impractical to transfer to new tasks, +domains, and/or languages when training data doesn't exist. Using the model +without training (zero-shot) is another option that however suffers an +effectiveness cost, especially in the case of first-stage retrievers. Numerous +research directions have emerged to tackle these issues, most of them in the +context of adapting to a task or a language. However, the literature is scarcer +for domain (or topic) adaptation. In this paper, we address this issue of +cross-topic discrepancy for a sparse first-stage retriever by transposing a +method initially designed for language adaptation. By leveraging pre-training +on the target data to learn domain-specific knowledge, this technique +alleviates the need for annotated data and expands the scope of domain +adaptation. Despite their relatively good generalization ability, we show that +even sparse retrievers can benefit from our simple domain adaptation method. + +
+
+ comment: Accepted at ECIR 2024 +
+
+
+
+
+ + ♻ ☆ Probabilistic Rank and Reward: A Scalable Model for Slate Recommendation + + +
+ We introduce Probabilistic Rank and Reward (PRR), a scalable probabilistic +model for personalized slate recommendation. Our approach allows off-policy +estimation of the reward in the scenario where the user interacts with at most +one item from a slate of K items. We show that the probability of a slate being +successful can be learned efficiently by combining the reward, whether the user +successfully interacted with the slate, and the rank, the item that was +selected within the slate. PRR outperforms existing off-policy reward +optimizing methods and is far more scalable to large action spaces. Moreover, +PRR allows fast delivery of recommendations powered by maximum inner product +search (MIPS), making it suitable in low latency domains such as computational +advertising. + +
+
+
+
+
+ + ♻ ☆ Which Neurons Matter in IR? Applying Integrated Gradients-based Methods + to Understand Cross-Encoders ICTIR 2024 + + +
+ With the recent addition of Retrieval-Augmented Generation (RAG), the scope +and importance of Information Retrieval (IR) has expanded. As a result, the +importance of a deeper understanding of IR models also increases. However, +interpretability in IR remains under-explored, especially when it comes to the +models' inner mechanisms. In this paper, we explore the possibility of adapting +Integrated Gradient-based methods in an IR context to identify the role of +individual neurons within the model. In particular, we provide new insights +into the role of what we call "relevance" neurons, as well as how they deal +with unseen data. Finally, we carry out an in-depth pruning study to validate +our findings. + +
+
+ comment: Accepted at ICTIR 2024 +
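Integrated Gradients itself is standard, and a plain version over an input embedding is sketched below for readers new to the method; the paper's contribution is adapting IG-style attribution to individual neurons inside a cross-encoder, which this generic snippet does not reproduce.

```python
import torch

def integrated_gradients(scalar_fn, x, baseline, steps=50):
    """Approximate IG attributions for input x: (x - baseline) times the average
    gradient of a scalar-valued scalar_fn along the straight path from baseline to x."""
    accumulated = torch.zeros_like(x)
    for i in range(1, steps + 1):
        point = (baseline + (i / steps) * (x - baseline)).detach().requires_grad_(True)
        scalar_fn(point).backward()
        accumulated += point.grad
    return (x - baseline) * accumulated / steps
```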
+
+
+
+
+ + ♻ ☆ Detecting LLM-Assisted Writing in Scientific Communication: Are We There + Yet? + + +
+ Large Language Models (LLMs), exemplified by ChatGPT, have significantly +reshaped text generation, particularly in the realm of writing assistance. +While ethical considerations underscore the importance of transparently +acknowledging LLM use, especially in scientific communication, genuine +acknowledgment remains infrequent. A potential avenue to encourage accurate +acknowledging of LLM-assisted writing involves employing automated detectors. +Our evaluation of four cutting-edge LLM-generated text detectors reveals their +suboptimal performance compared to a simple ad-hoc detector designed to +identify abrupt writing style changes around the time of LLM proliferation. We +contend that the development of specialized detectors exclusively dedicated to +LLM-assisted writing detection is necessary. Such detectors could play a +crucial role in fostering more authentic recognition of LLM involvement in +scientific communication, addressing the current challenges in acknowledgment +practices. + +
+
+
+
+
+ + ♻ ☆ Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and + unfairness in dyadic regression models + + +
+ Dyadic regression models, which predict real-valued outcomes for pairs of +entities, are fundamental in many domains (e.g. predicting the rating of a user +to a product in Recommender Systems) and promising and under exploration in +many others (e.g. approximating the adequate dosage of a drug for a patient in +personalized pharmacology). In this work, we demonstrate that non-uniformity in +the observed value distributions of individual entities leads to severely +biased predictions in state-of-the-art models, skewing predictions towards the +average of observed past values for the entity and providing worse-than-random +predictive power in eccentric yet equally important cases. We show that the +usage of global error metrics like Root Mean Squared Error (RMSE) and Mean +Absolute Error (MAE) is insufficient to capture this phenomenon, which we name +eccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as +a new complementary metric that can quantify it in all studied models and +datasets. We also prove the adequateness of EAUC by using naive de-biasing +corrections to demonstrate that a lower model bias correlates with a lower EAUC +and vice-versa. This work contributes a bias-aware evaluation of dyadic +regression models to avoid potential unfairness and risks in critical +real-world applications of such systems. + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Me, Myself, and AI: The Situational Awareness Dataset (SAD) for LLMs + + +
+ AI assistants such as ChatGPT are trained to respond to users by saying, "I +am a large language model". This raises questions. Do such models know that +they are LLMs and reliably act on this knowledge? Are they aware of their +current circumstances, such as being deployed to the public? We refer to a +model's knowledge of itself and its circumstances as situational awareness. To +quantify situational awareness in LLMs, we introduce a range of behavioral +tests, based on question answering and instruction following. These tests form +the $\textbf{Situational Awareness Dataset (SAD)}$, a benchmark comprising 7 +task categories and over 13,000 questions. The benchmark tests numerous +abilities, including the capacity of LLMs to (i) recognize their own generated +text, (ii) predict their own behavior, (iii) determine whether a prompt is from +internal evaluation or real-world deployment, and (iv) follow instructions that +depend on self-knowledge. + We evaluate 16 LLMs on SAD, including both base (pretrained) and chat models. +While all models perform better than chance, even the highest-scoring model +(Claude 3 Opus) is far from a human baseline on certain tasks. We also observe +that performance on SAD is only partially predicted by metrics of general +knowledge (e.g. MMLU). Chat models, which are finetuned to serve as AI +assistants, outperform their corresponding base models on SAD but not on +general knowledge tasks. The purpose of SAD is to facilitate scientific +understanding of situational awareness in LLMs by breaking it down into +quantitative abilities. Situational awareness is important because it enhances +a model's capacity for autonomous planning and action. While this has potential +benefits for automation, it also introduces novel risks related to AI safety +and control. Code and latest results available at +https://situational-awareness-dataset.org . + +
+
+ comment: 11 page main body, 98 page appendix, 58 figures +
+
+
+
+
+ + ☆ Missed Causes and Ambiguous Effects: Counterfactuals Pose Challenges for + Interpreting Neural Networks + + +
+ Interpretability research takes counterfactual theories of causality for +granted. Most causal methods rely on counterfactual interventions to inputs or +the activations of particular model components, followed by observations of the +change in models' output logits or behaviors. While this yields more faithful +evidence than correlational methods, counterfactuals nonetheless have key +problems that bias our findings in specific and predictable ways. Specifically, +(i) counterfactual theories do not effectively capture multiple independently +sufficient causes of the same effect, which leads us to miss certain causes +entirely; and (ii) counterfactual dependencies in neural networks are generally +not transitive, which complicates methods for extracting and interpreting +causal graphs from neural networks. We discuss the implications of these +challenges for interpretability researchers and propose concrete suggestions +for future work. + +
+
+
+
+
+ + ☆ Rethinking Visual Prompting for Multimodal Large Language Models with + External Knowledge + + +
+ In recent years, multimodal large language models (MLLMs) have made +significant strides by training on vast high-quality image-text datasets, +enabling them to generally understand images well. However, the inherent +difficulty in explicitly conveying fine-grained or spatially dense information +in text, such as masks, poses a challenge for MLLMs, limiting their ability to +answer questions requiring an understanding of detailed or localized visual +elements. Drawing inspiration from the Retrieval-Augmented Generation (RAG) +concept, this paper proposes a new visual prompt approach to integrate +fine-grained external knowledge, gleaned from specialized vision models (e.g., +instance segmentation/OCR models), into MLLMs. This is a promising yet +underexplored direction for enhancing MLLMs' performance. Our approach diverges +from concurrent works, which transform external knowledge into additional text +prompts, necessitating the model to indirectly learn the correspondence between +visual content and text coordinates. Instead, we propose embedding fine-grained +knowledge information directly into a spatial embedding map as a visual prompt. +This design can be effortlessly incorporated into various MLLMs, such as LLaVA +and Mipha, considerably improving their visual understanding performance. +Through rigorous experiments, we demonstrate that our method can enhance MLLM +performance across nine benchmarks, amplifying their fine-grained context-aware +capabilities. + +
+
+
+
+
+ + ☆ The diameter of a stochastic matrix: A new measure for sensitivity + analysis in Bayesian networks + + +
+ Bayesian networks are one of the most widely used classes of probabilistic +models for risk management and decision support because of their +interpretability and flexibility in including heterogeneous pieces of +information. In any applied modelling, it is critical to assess how robust the +inferences on certain target variables are to changes in the model. In Bayesian +networks, these analyses fall under the umbrella of sensitivity analysis, which +is most commonly carried out by quantifying dissimilarities using +Kullback-Leibler information measures. In this paper, we argue that robustness +methods based instead on the familiar total variation distance provide simple +and more valuable bounds on robustness to misspecification, which are both +formally justifiable and transparent. We introduce a novel measure of +dependence in conditional probability tables called the diameter to derive such +bounds. This measure quantifies the strength of dependence between a variable +and its parents. We demonstrate how such formal robustness considerations can +be embedded in building a Bayesian network. + +
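One concrete way to read the proposed measure, assuming the diameter of a row-stochastic matrix is the largest total variation distance between any two of its rows (rows indexed by parent configurations of a conditional probability table), is sketched below. This is an illustrative interpretation of the abstract rather than the paper's formal definition.

```python
import numpy as np

def tv_distance(p, q):
    """Total variation distance between two discrete distributions."""
    return 0.5 * np.abs(np.asarray(p, dtype=float) - np.asarray(q, dtype=float)).sum()

def diameter(cpt):
    """Largest TV distance between any two rows of a row-stochastic matrix."""
    cpt = np.asarray(cpt, dtype=float)
    return max((tv_distance(cpt[i], cpt[j])
                for i in range(len(cpt)) for j in range(i + 1, len(cpt))),
               default=0.0)

# A child whose distribution barely changes with its parent has a small diameter,
# i.e. the child depends only weakly on that parent.
cpt = np.array([[0.70, 0.30],
                [0.65, 0.35]])
print(diameter(cpt))  # 0.05
```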
+
+
+
+
+ + ☆ Unsupervised 4D Cardiac Motion Tracking with Spatiotemporal Optical Flow + Networks + + +
+ Cardiac motion tracking from echocardiography can be used to estimate and
+quantify myocardial motion within a cardiac cycle. It is a cost-efficient and
+effective approach for assessing myocardial function. However, ultrasound
+imaging has the inherent characteristics of spatially low resolution and
+temporally random noise, which leads to difficulties in obtaining reliable
+annotations. Thus it is difficult to perform supervised learning for motion
+tracking. In addition, there is currently no end-to-end unsupervised method in
+the literature. This paper presents a motion tracking method where unsupervised
+optical flow networks are designed with a spatial reconstruction loss and a
+temporal-consistency loss. Our proposed loss functions make use of the
+pair-wise and temporal correlation to estimate cardiac motion from a noisy
+background. Experiments using a synthetic 4D echocardiography dataset have
+shown the effectiveness of our approach, and its superiority over existing
+methods on both accuracy and running speed. To the best of our knowledge, this
+is the first work that uses an unsupervised end-to-end deep learning optical
+flow network for 4D cardiac motion tracking.
+
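A generic form of the two losses mentioned above is sketched here: a spatial reconstruction (photometric) loss that warps one frame with the predicted flow and compares it with the other, and a temporal-consistency loss that penalises abrupt changes of the flow across consecutive frame pairs. The function names and the L1 form of both terms are assumptions for illustration; the exact formulation in the paper may differ.

```python
import torch
import torch.nn.functional as F

def warp(img, flow):
    """Backward-warp img (B,1,H,W) with a dense flow field (B,2,H,W) in pixels,
    where flow channel 0 is the x-displacement and channel 1 the y-displacement."""
    B, _, H, W = img.shape
    ys, xs = torch.meshgrid(torch.arange(H, device=img.device),
                            torch.arange(W, device=img.device), indexing="ij")
    base = torch.stack((xs, ys), dim=0).float()               # (2,H,W), x then y
    coords = base.unsqueeze(0) + flow                          # (B,2,H,W)
    gx = 2.0 * coords[:, 0] / (W - 1) - 1.0                    # normalise to [-1,1]
    gy = 2.0 * coords[:, 1] / (H - 1) - 1.0
    return F.grid_sample(img, torch.stack((gx, gy), dim=-1), align_corners=True)

def spatial_reconstruction_loss(frame_t, frame_t1, flow_t_to_t1):
    """Reconstruct frame_t by sampling frame_{t+1} at positions displaced by the flow."""
    return F.l1_loss(warp(frame_t1, flow_t_to_t1), frame_t)

def temporal_consistency_loss(flows):
    """Penalise abrupt changes between the flows of consecutive frame pairs."""
    return sum(F.l1_loss(flows[i + 1], flows[i])
               for i in range(len(flows) - 1)) / max(len(flows) - 1, 1)
```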
+
+
+
+
+ + ☆ Multitaper mel-spectrograms for keyword spotting + + +
+ Keyword spotting (KWS) is one of the speech recognition tasks most sensitive +to the quality of the feature representation. However, the research on KWS has +traditionally focused on new model topologies, putting little emphasis on other +aspects like feature extraction. This paper investigates the use of the +multitaper technique to create improved features for KWS. The experimental +study is carried out for different test scenarios, windows and parameters, +datasets, and neural networks commonly used in embedded KWS applications. +Experiment results confirm the advantages of using the proposed improved +features. + +
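For readers who have not used the multitaper technique, the sketch below averages spectrograms computed with several DPSS (Slepian) tapers, which lowers estimator variance compared with a single window; a mel filterbank would then be applied to the averaged power spectrogram as usual. The FFT size, hop, time-bandwidth product, and number of tapers are illustrative values, not the paper's settings.

```python
import numpy as np
from scipy.signal import spectrogram
from scipy.signal.windows import dpss

def multitaper_power_spectrogram(x, fs=16000, n_fft=512, hop=160, nw=3.0, n_tapers=5):
    """Average the power spectrograms obtained with several orthogonal DPSS tapers."""
    tapers = dpss(n_fft, NW=nw, Kmax=n_tapers)          # (n_tapers, n_fft)
    acc = None
    for taper in tapers:
        f, t, S = spectrogram(x, fs=fs, window=taper, nperseg=n_fft,
                              noverlap=n_fft - hop, detrend=False, mode="psd")
        acc = S if acc is None else acc + S
    return f, t, acc / n_tapers   # feed this into a mel filterbank + log as usual
```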
+
+
+
+
+ + ☆ Lazarus: Resilient and Elastic Training of Mixture-of-Experts Models + with Adaptive Expert Placement + + +
+ Sparsely-activated Mixture-of-Experts (MoE) architecture has increasingly +been adopted to further scale large language models (LLMs) due to its +sub-linear scaling for computation costs. However, frequent failures still pose +significant challenges as training scales. The cost of even a single failure is +significant, as all GPUs need to wait idle until the failure is resolved, +potentially losing considerable training progress as training has to restart +from checkpoints. Existing solutions for efficient fault-tolerant training +either lack elasticity or rely on building resiliency into pipeline +parallelism, which cannot be applied to MoE models due to the expert +parallelism strategy adopted by the MoE architecture. + We present Lazarus, a system for resilient and elastic training of MoE +models. Lazarus adaptively allocates expert replicas to address the inherent +imbalance in expert workload and speeds-up training, while a provably optimal +expert placement algorithm is developed to maximize the probability of recovery +upon failures. Through adaptive expert placement and a flexible token +dispatcher, Lazarus can also fully utilize all available nodes after failures, +leaving no GPU idle. Our evaluation shows that Lazarus outperforms existing MoE +training systems by up to 5.7x under frequent node failures and 3.4x on a real +spot instance trace. + +
+
+
+
+
+ + ☆ An autoencoder for compressing angle-resolved photoemission spectroscopy + data + + +
+ Angle-resolved photoemission spectroscopy (ARPES) is a powerful experimental
+technique to determine the electronic structure of solids. Advances in light
+sources for ARPES experiments are currently leading to a vast increase in data
+acquisition rates and data quantity. On the other hand, access time to the most
+advanced ARPES instruments remains strictly limited, calling for fast,
+effective, and on-the-fly data analysis tools to exploit this time. In response
+to this need, we introduce ARPESNet, a versatile autoencoder network that
+efficiently summarises and compresses ARPES datasets. We train ARPESNet on a
+large and varied dataset of 2-dimensional ARPES data extracted by cutting
+standard 3-dimensional ARPES datasets along random directions in $\mathbf{k}$.
+To test the data representation capacity of ARPESNet, we compare $k$-means
+clustering quality between data compressed by ARPESNet, data compressed by
+discrete cosine transform, and raw data, at different noise levels. ARPESNet
+data excels in clustering quality despite its high compression ratio.
+
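The discrete-cosine-transform baseline mentioned in the clustering comparison can be reproduced generically as below: keep only the largest-magnitude 2D DCT coefficients of a spectrum and invert the transform. The retained fraction and the thresholding rule are illustrative assumptions, not necessarily the compression settings used in the paper.

```python
import numpy as np
from scipy.fft import dctn, idctn

def dct_compress(spectrum, keep=0.05):
    """Zero out all but the `keep` fraction of largest-magnitude DCT coefficients."""
    coeffs = dctn(np.asarray(spectrum, dtype=float), norm="ortho")
    threshold = np.quantile(np.abs(coeffs), 1.0 - keep)
    return idctn(np.where(np.abs(coeffs) >= threshold, coeffs, 0.0), norm="ortho")
```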
+
+
+
+
+ + ☆ On scalable oversight with weak LLMs judging strong LLMs + + +
+ Scalable oversight protocols aim to enable humans to accurately supervise +superhuman AI. In this paper we study debate, where two AI's compete to +convince a judge; consultancy, where a single AI tries to convince a judge that +asks questions; and compare to a baseline of direct question-answering, where +the judge just answers outright without the AI. We use large language models +(LLMs) as both AI agents and as stand-ins for human judges, taking the judge +models to be weaker than agent models. We benchmark on a diverse range of +asymmetries between judges and agents, extending previous work on a single +extractive QA task with information asymmetry, to also include mathematics, +coding, logic and multimodal reasoning asymmetries. We find that debate +outperforms consultancy across all tasks when the consultant is randomly +assigned to argue for the correct/incorrect answer. Comparing debate to direct +question answering, the results depend on the type of task: in extractive QA +tasks with information asymmetry debate outperforms direct question answering, +but in other tasks without information asymmetry the results are mixed. +Previous work assigned debaters/consultants an answer to argue for. When we +allow them to instead choose which answer to argue for, we find judges are less +frequently convinced by the wrong answer in debate than in consultancy. +Further, we find that stronger debater models increase judge accuracy, though +more modestly than in previous studies. + +
+
+ comment: 15 pages (53 including appendices) +
+
+
+
+
+ + ☆ Learning to (Learn at Test Time): RNNs with Expressive Hidden States + + +
+ Self-attention performs well in long context but has quadratic complexity. +Existing RNN layers have linear complexity, but their performance in long +context is limited by the expressive power of their hidden state. We propose a +new class of sequence modeling layers with linear complexity and an expressive +hidden state. The key idea is to make the hidden state a machine learning model +itself, and the update rule a step of self-supervised learning. Since the +hidden state is updated by training even on test sequences, our layers are +called Test-Time Training (TTT) layers. We consider two instantiations: +TTT-Linear and TTT-MLP, whose hidden state is a linear model and a two-layer +MLP respectively. We evaluate our instantiations at the scale of 125M to 1.3B +parameters, comparing with a strong Transformer and Mamba, a modern RNN. Both +TTT-Linear and TTT-MLP match or exceed the baselines. Similar to Transformer, +they can keep reducing perplexity by conditioning on more tokens, while Mamba +cannot after 16k context. With preliminary systems optimization, TTT-Linear is +already faster than Transformer at 8k context and matches Mamba in wall-clock +time. TTT-MLP still faces challenges in memory I/O, but shows larger potential +in long context, pointing to a promising direction for future research. + +
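To make the key idea above concrete, here is a heavily simplified, single-head sketch of a TTT-Linear-style layer: the hidden state is a weight matrix W that is updated by one gradient step of a self-supervised reconstruction loss per token, and the layer output is W applied to that token. The real layers use learned input/target projections, mini-batched updates, and systems-level optimisation, none of which appear here.

```python
import torch

def ttt_linear_scan(tokens, lr=0.1):
    """tokens: (seq_len, d). The hidden state W is itself a linear model that is
    'trained' on each incoming token before producing the output for it."""
    d = tokens.shape[-1]
    W = torch.zeros(d, d)
    outputs = []
    for x in tokens:
        residual = W @ x - x                 # self-supervised target: reconstruct x
        grad = torch.outer(residual, x)      # d/dW of 0.5 * ||W x - x||^2
        W = W - lr * grad                    # the update rule is a learning step
        outputs.append(W @ x)
    return torch.stack(outputs)
```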
+
+
+
+
+ + ☆ Randomized Physics-Informed Neural Networks for Bayesian Data + Assimilation + + +
+ We propose a randomized physics-informed neural network (PINN) or rPINN
+method for uncertainty quantification in inverse partial differential equation
+(PDE) problems with noisy data. This method is used to quantify uncertainty in
+the inverse PDE PINN solutions. Recently, the Bayesian PINN (BPINN) method was
+proposed, where the posterior distribution of the PINN parameters was
+formulated using the Bayes' theorem and sampled using approximate inference
+methods such as the Hamiltonian Monte Carlo (HMC) and variational inference
+(VI) methods. In this work, we demonstrate that HMC fails to converge for
+non-linear inverse PDE problems. As an alternative to HMC, we sample the
+distribution by solving the stochastic optimization problem obtained by
+randomizing the PINN loss function. The effectiveness of the rPINN method is
+tested for linear and non-linear Poisson equations, and the diffusion equation
+with a high-dimensional space-dependent diffusion coefficient. The rPINN method
+provides informative distributions for all considered problems. For the linear
+Poisson equation, HMC and rPINN produce similar distributions, but rPINN is on
+average 27 times faster than HMC. For the non-linear Poisson and diffusion
+equations, the HMC method fails to converge because a single HMC chain cannot
+sample multiple modes of the posterior distribution of the PINN parameters in a
+reasonable amount of time.
+
+
+ comment: 38 pages, 8 figures +
+
+
+
+
+ + ☆ Isomorphic Pruning for Vision Models + + +
+ Structured pruning reduces the computational overhead of deep neural networks +by removing redundant sub-structures. However, assessing the relative +importance of different sub-structures remains a significant challenge, +particularly in advanced vision models featuring novel mechanisms and +architectures like self-attention, depth-wise convolutions, or residual +connections. These heterogeneous substructures usually exhibit diverged +parameter scales, weight distributions, and computational topology, introducing +considerable difficulty to importance comparison. To overcome this, we present +Isomorphic Pruning, a simple approach that demonstrates effectiveness across a +range of network architectures such as Vision Transformers and CNNs, and +delivers competitive performance across different model sizes. Isomorphic +Pruning originates from an observation that, when evaluated under a pre-defined +importance criterion, heterogeneous sub-structures demonstrate significant +divergence in their importance distribution, as opposed to isomorphic +structures that present similar importance patterns. This inspires us to +perform isolated ranking and comparison on different types of sub-structures +for more reliable pruning. Our empirical results on ImageNet-1K demonstrate +that Isomorphic Pruning surpasses several pruning baselines dedicatedly +designed for Transformers or CNNs. For instance, we improve the accuracy of +DeiT-Tiny from 74.52% to 77.50% by pruning an off-the-shelf DeiT-Base model. +And for ConvNext-Tiny, we enhanced performance from 82.06% to 82.18%, while +reducing the number of parameters and memory usage. Code is available at +\url{https://github.com/VainF/Isomorphic-Pruning}. + +
+
+
+
+
+ + ☆ Linear causal disentanglement via higher-order cumulants + + +
+ Linear causal disentanglement is a recent method in causal representation +learning to describe a collection of observed variables via latent variables +with causal dependencies between them. It can be viewed as a generalization of +both independent component analysis and linear structural equation models. We +study the identifiability of linear causal disentanglement, assuming access to +data under multiple contexts, each given by an intervention on a latent +variable. We show that one perfect intervention on each latent variable is +sufficient and in the worst case necessary to recover parameters under perfect +interventions, generalizing previous work to allow more latent than observed +variables. We give a constructive proof that computes parameters via a coupled +tensor decomposition. For soft interventions, we find the equivalence class of +latent graphs and parameters that are consistent with observed data, via the +study of a system of polynomial equations. Our results hold assuming the +existence of non-zero higher-order cumulants, which implies non-Gaussianity of +variables. + +
+
+
+
+
+ + ☆ Understanding the Gains from Repeated Self-Distillation + + +
+ Self-Distillation is a special type of knowledge distillation where the +student model has the same architecture as the teacher model. Despite using the +same architecture and the same training data, self-distillation has been +empirically observed to improve performance, especially when applied +repeatedly. For such a process, there is a fundamental question of interest: +How much gain is possible by applying multiple steps of self-distillation? To +investigate this relative gain, we propose studying the simple but canonical +task of linear regression. Our analysis shows that the excess risk achieved by +multi-step self-distillation can significantly improve upon a single step of +self-distillation, reducing the excess risk by a factor as large as $d$, where +$d$ is the input dimension. Empirical results on regression tasks from the UCI +repository show a reduction in the learnt model's risk (MSE) by up to 47%. + +
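+A toy illustration of the setup on ridge regression (a sketch using the standard
+closed-form estimator; this is not the paper's code): each self-distillation step
+refits the same model on the predictions of the previous one.
+
+```python
+import numpy as np
+
+def ridge_fit(X, y, lam):
+    d = X.shape[1]
+    return np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ y)
+
+def repeated_self_distillation(X, y, lam, steps=3):
+    """Each student is refit on the predictions of the previous model."""
+    targets = y
+    for _ in range(steps):
+        w = ridge_fit(X, targets, lam)
+        targets = X @ w            # next student learns from this model's outputs
+    return w
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(200, 20))
+w_true = rng.normal(size=20)
+y = X @ w_true + 0.5 * rng.normal(size=200)
+for k in (1, 3, 6):
+    w = repeated_self_distillation(X, y, lam=10.0, steps=k)
+    print(k, np.mean((X @ (w - w_true)) ** 2))   # excess-risk proxy per number of steps
+```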
+
+ comment: 31 pages, 10 figures +
+
+
+
+
+ + ☆ Proximal Point Method for Online Saddle Point Problem + + +
+ This paper focuses on the online saddle point problem, which involves a +sequence of two-player time-varying convex-concave games. Considering the +nonstationarity of the environment, we adopt the duality gap and the dynamic +Nash equilibrium regret as performance metrics for algorithm design. We present +three variants of the proximal point method: the Online Proximal Point +Method~(OPPM), the Optimistic OPPM~(OptOPPM), and the OptOPPM with multiple +predictors. Each algorithm guarantees upper bounds for both the duality gap and +dynamic Nash equilibrium regret, achieving near-optimality when measured +against the duality gap. Specifically, in certain benign environments, such as +sequences of stationary payoff functions, these algorithms maintain a nearly +constant metric bound. Experimental results further validate the effectiveness +of these algorithms. Lastly, this paper discusses potential reliability +concerns associated with using dynamic Nash equilibrium regret as a performance +metric. + +
+
+
+
+
+ + ☆ Remembering Everything Makes You Vulnerable: A Limelight on Machine + Unlearning for Personalized Healthcare Sector + + +
+ As the prevalence of data-driven technologies in healthcare continues to +rise, concerns regarding data privacy and security become increasingly +paramount. This thesis aims to address the vulnerability of personalized +healthcare models, particularly in the context of ECG monitoring, to +adversarial attacks that compromise patient privacy. We propose an approach +termed "Machine Unlearning" to mitigate the impact of exposed data points on +machine learning models, thereby enhancing model robustness against adversarial +attacks while preserving individual privacy. Specifically, we investigate the +efficacy of Machine Unlearning in the context of personalized ECG monitoring, +utilizing a dataset of clinical ECG recordings. Our methodology involves +training a deep neural classifier on ECG data and fine-tuning the model for +individual patients. We demonstrate the susceptibility of fine-tuned models to +adversarial attacks, such as the Fast Gradient Sign Method (FGSM), which can +exploit additional data points in personalized models. To address this +vulnerability, we propose a Machine Unlearning algorithm that selectively +removes sensitive data points from fine-tuned models, effectively enhancing +model resilience against adversarial manipulation. Experimental results +demonstrate the effectiveness of our approach in mitigating the impact of +adversarial attacks while maintaining the pre-trained model accuracy. + +
+
+ comment: 15 Pages, Exploring unlearning techniques on ECG Classifier +
+
+
+
+
+ + ☆ Multimodal Classification via Modal-Aware Interactive Enhancement + + +
+ Due to the notorious modality imbalance problem, multimodal learning (MML)
+leads to the phenomenon of optimization imbalance, thus struggling to achieve
+satisfactory performance. Recently, some representative methods have been
+proposed to boost the performance, mainly focusing on adaptively adjusting the
+optimization of each modality to rebalance the learning speed of dominant and
+non-dominant modalities. To better facilitate the interaction of model
+information in multimodal learning, in this paper, we propose a novel
+multimodal learning method, called modal-aware interactive enhancement (MIE).
+Specifically, we first utilize an optimization strategy based on
+sharpness-aware minimization (SAM) to smooth the learning objective during the
+forward phase. Then, with the help of the geometric properties of SAM, we
+propose a gradient modification strategy to impose the influence between
+different modalities during the backward phase. Therefore, we can improve the
+generalization ability and alleviate the modality forgetting phenomenon
+simultaneously for multimodal learning. Extensive experiments on widely used
+datasets demonstrate that our proposed method can outperform various
+state-of-the-art baselines to achieve the best performance.
+
+
+
+
+
+
+ + ☆ Leveraging Large Language Models for Integrated + Satellite-Aerial-Terrestrial Networks: Recent Advances and Future Directions + + +
+ Integrated satellite, aerial, and terrestrial networks (ISATNs) represent a +sophisticated convergence of diverse communication technologies to ensure +seamless connectivity across different altitudes and platforms. This paper +explores the transformative potential of integrating Large Language Models +(LLMs) into ISATNs, leveraging advanced Artificial Intelligence (AI) and +Machine Learning (ML) capabilities to enhance these networks. We outline the +current architecture of ISATNs and highlight the significant role LLMs can play +in optimizing data flow, signal processing, and network management to advance +5G/6G communication technologies through advanced predictive algorithms and +real-time decision-making. A comprehensive analysis of ISATN components is +conducted, assessing how LLMs can effectively address traditional data +transmission and processing bottlenecks. The paper delves into the network +management challenges within ISATNs, emphasizing the necessity for +sophisticated resource allocation strategies, traffic routing, and security +management to ensure seamless connectivity and optimal performance under +varying conditions. Furthermore, we examine the technical challenges and +limitations associated with integrating LLMs into ISATNs, such as data +integration for LLM processing, scalability issues, latency in decision-making +processes, and the design of robust, fault-tolerant systems. The study also +identifies key future research directions for fully harnessing LLM capabilities +in ISATNs, which is crucial for enhancing network reliability, optimizing +performance, and achieving a truly interconnected and intelligent global +network system. + +
+
+
+
+
+ + ☆ GOALPlace: Begin with the End in Mind + + +
+ Co-optimizing placement with congestion is integral to achieving high-quality +designs. This paper presents GOALPlace, a new learning-based general approach +to improving placement congestion by controlling cell density. Our method +efficiently learns from an EDA tool's post-route optimized results and uses an +empirical Bayes technique to adapt this goal/target to a specific placer's +solutions, effectively beginning with the end in mind. It enhances correlation +with the long-running heuristics of the tool's router and timing-opt engine -- +while solving placement globally without expensive incremental congestion +estimation and mitigation methods. A statistical analysis with a new +hierarchical netlist clustering establishes the importance of density and the +potential for an adequate cell density target across placements. Our +experiments show that our method, integrated as a demonstration inside an +academic GPU-accelerated global placer, consistently produces macro and +standard cell placements of superior or comparable quality to commercial tools. +Our empirical Bayes methodology also allows a substantial quality improvement +over state-of-the-art academic mixed-size placers, achieving up to 10x fewer +design rule check (DRC) violations, a 5% decrease in wirelength, and a 30% and +60% reduction in worst and total negative slack (WNS/TNS). + +
+
+ comment: 10 pages, 7 figures, preprint +
+
+
+
+
+ + ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ☆ Structural Constraint Integration in Generative Model for Discovery of + Quantum Material Candidates + + +
+ Billions of organic molecules are known, but only a tiny fraction of the
+functional inorganic materials have been discovered, a particularly relevant
+problem to the community searching for new quantum materials. Recent
+advancements in machine-learning-based generative models, particularly
+diffusion models, show great promise for generating new, stable materials.
+However, integrating geometric patterns into materials generation remains a
+challenge. Here, we introduce Structural Constraint Integration in the
+GENerative model (SCIGEN). Our approach can modify any trained generative
+diffusion model by strategic masking of the denoised structure with a diffused
+constrained structure prior to each diffusion step to steer the generation
+toward constrained outputs. Furthermore, we mathematically prove that SCIGEN
+effectively performs conditional sampling from the original distribution, which
+is crucial for generating stable constrained materials. We generate eight
+million compounds using Archimedean lattices as prototype constraints, with
+over 10% surviving a multi-stage stability pre-screening. High-throughput
+density functional theory (DFT) calculations on 26,000 surviving compounds show
+that over 50% passed structural optimization at the DFT level. Since the
+properties of quantum materials are closely related to geometric patterns, our
+results indicate that SCIGEN provides a general framework for generating
+quantum materials candidates.
+
+
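+The masking strategy can be sketched generically (a hypothetical skeleton;
+`sample_step`, the toy noise schedule, and the identity "denoiser" are placeholders,
+not the authors' implementation): before each denoising step, the constrained
+coordinates are overwritten with the constraint structure diffused to the current
+noise level.
+
+```python
+import torch
+
+def constrained_reverse_diffusion(model, x_T, constraint, mask, alphas_bar, sample_step):
+    """Before each denoising step, the masked part of the state is replaced by
+    the constraint structure diffused to the current noise level; `sample_step`
+    is assumed to perform one ordinary reverse-diffusion step."""
+    x = x_T
+    for t in reversed(range(len(alphas_bar))):
+        a_bar = alphas_bar[t]
+        noised_constraint = torch.sqrt(a_bar) * constraint + \
+                            torch.sqrt(1 - a_bar) * torch.randn_like(constraint)
+        x = mask * noised_constraint + (1 - mask) * x   # strategic masking
+        x = sample_step(model, x, t)                    # ordinary denoising step
+    return x
+
+# toy usage: identity "denoiser", 8-dimensional state, first two coordinates constrained
+alphas_bar = torch.linspace(0.99, 0.01, 50)
+dummy_step = lambda m, x, t: x        # placeholder for a real reverse-diffusion step
+mask = torch.tensor([1., 1., 0., 0., 0., 0., 0., 0.])
+x0 = constrained_reverse_diffusion(None, torch.randn(8), torch.ones(8),
+                                   mask, alphas_bar, dummy_step)
+print(x0.shape)
+```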
+
+ comment: 512 pages total, 4 main figures + 218 supplementary figures +
+
+
+
+
+ + ☆ An AI Architecture with the Capability to Classify and Explain Hardware + Trojans + + +
+ Hardware trojan detection methods, based on machine learning (ML) techniques,
+mainly identify suspected circuits but lack the ability to explain how the
+decision was arrived at. An explainable methodology and architecture are
+introduced based on the existing hardware trojan detection features. Results
+are provided for explaining digital hardware trojans within a netlist using
+Trust-Hub trojan benchmarks.
+
+
+
+
+
+
+ + ☆ Real-time Timbre Remapping with Differentiable DSP + + +
+ Timbre is a primary mode of expression in diverse musical contexts. However, +prevalent audio-driven synthesis methods predominantly rely on pitch and +loudness envelopes, effectively flattening timbral expression from the input. +Our approach draws on the concept of timbre analogies and investigates how +timbral expression from an input signal can be mapped onto controls for a +synthesizer. Leveraging differentiable digital signal processing, our method +facilitates direct optimization of synthesizer parameters through a novel +feature difference loss. This loss function, designed to learn relative timbral +differences between musical events, prioritizes the subtleties of graded timbre +modulations within phrases, allowing for meaningful translations in a timbre +space. Using snare drum performances as a case study, where timbral expression +is central, we demonstrate real-time timbre remapping from acoustic snare drums +to a differentiable synthesizer modeled after the Roland TR-808. + +
+
+ comment: Accepted for publication at the 24th International Conference on New + Interfaces for Musical Expression in Utrecht, Netherlands +
+
+
+
+
+ + ☆ Rethinking Image Compression on the Web with Generative AI + + +
+ The rapid growth of the Internet, driven by social media, web browsing, and +video streaming, has made images central to the Web experience, resulting in +significant data transfer and increased webpage sizes. Traditional image +compression methods, while reducing bandwidth, often degrade image quality. +This paper explores a novel approach using generative AI to reconstruct images +at the edge or client-side. We develop a framework that leverages text prompts +and provides additional conditioning inputs like Canny edges and color palettes +to a text-to-image model, achieving up to 99.8% bandwidth savings in the best +cases and 92.6% on average, while maintaining high perceptual similarity. +Empirical analysis and a user study show that our method preserves image +meaning and structure more effectively than traditional compression methods, +offering a promising solution for reducing bandwidth usage and improving +Internet affordability with minimal degradation in image quality. + +
+
+
+
+
+ + ☆ PoPreRo: A New Dataset for Popularity Prediction of Romanian Reddit + Posts ICPR 2024 + + +
+ We introduce PoPreRo, the first dataset for Popularity Prediction of Romanian +posts collected from Reddit. The PoPreRo dataset includes a varied compilation +of post samples from five distinct subreddits of Romania, totaling 28,107 data +samples. Along with our novel dataset, we introduce a set of competitive models +to be used as baselines for future research. Interestingly, the top-scoring +model achieves an accuracy of 61.35% and a macro F1 score of 60.60% on the test +set, indicating that the popularity prediction task on PoPreRo is very +challenging. Further investigations based on few-shot prompting the Falcon-7B +Large Language Model also point in the same direction. We thus believe that +PoPreRo is a valuable resource that can be used to evaluate models on +predicting the popularity of social media posts in Romanian. We release our +dataset at https://github.com/ana-rogoz/PoPreRo. + +
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ Improved algorithms for learning quantum Hamiltonians, via flat + polynomials + + +
+ We give an improved algorithm for learning a quantum Hamiltonian given copies +of its Gibbs state, that can succeed at any temperature. Specifically, we +improve over the work of Bakshi, Liu, Moitra, and Tang [BLMT24], by reducing +the sample complexity and runtime dependence to singly exponential in the +inverse-temperature parameter, as opposed to doubly exponential. Our main +technical contribution is a new flat polynomial approximation to the +exponential function, with significantly lower degree than the flat polynomial +approximation used in [BLMT24]. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ PDiscoFormer: Relaxing Part Discovery Constraints with Vision + Transformers ECCV + + +
+ Computer vision methods that explicitly detect object parts and reason on
+them are a step towards inherently interpretable models. Existing approaches
+that perform part discovery driven by a fine-grained classification task make
+very restrictive assumptions on the geometric properties of the discovered
+parts; they should be small and compact. Although this prior is useful in some
+cases, in this paper we show that pre-trained transformer-based vision models,
+such as self-supervised DINOv2 ViT, enable the relaxation of these constraints.
+In particular, we find that a total variation (TV) prior, which allows for
+multiple connected components of any size, substantially outperforms previous
+work. We test our approach on three fine-grained classification benchmarks:
+CUB, PartImageNet and Oxford Flowers, and compare our results to previously
+published methods as well as a re-implementation of the state-of-the-art method
+PDiscoNet with a transformer-based backbone. We consistently obtain substantial
+improvements across the board, both on part discovery metrics and the
+downstream classification task, showing that the strong inductive biases in
+self-supervised ViT models require rethinking the geometric priors that can be
+used for unsupervised part discovery.
+
+
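+A minimal sketch of a total-variation prior of the kind described (illustrative
+only, not the paper's code): it penalizes the length of part boundaries rather than
+part size or compactness, so discovered parts may consist of several connected
+components of any size.
+
+```python
+import torch
+
+def total_variation_prior(part_maps):
+    """Anisotropic TV on part-assignment maps of shape (batch, parts, H, W):
+    penalizes the length of part boundaries, not part size or compactness."""
+    dh = (part_maps[..., 1:, :] - part_maps[..., :-1, :]).abs().mean()
+    dw = (part_maps[..., :, 1:] - part_maps[..., :, :-1]).abs().mean()
+    return dh + dw
+
+maps = torch.softmax(torch.randn(2, 8, 14, 14), dim=1)   # soft part assignments
+loss = total_variation_prior(maps)                        # add to the classification loss
+print(loss.item())
+```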
+
+ comment: Accepted as a main conference paper at the European Conference of + Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ Introducing 'Inside' Out of Distribution + + +
+ Detecting and understanding out-of-distribution (OOD) samples is crucial in +machine learning (ML) to ensure reliable model performance. Current OOD +studies, in general, and in the context of ML, in particular, primarily focus +on extrapolatory OOD (outside), neglecting potential cases of interpolatory OOD +(inside). This study introduces a novel perspective on OOD by suggesting OOD +can be divided into inside and outside cases. In addition, following this +framework, we examine the inside-outside OOD profiles of datasets and their +impact on ML model performance. Our analysis shows that different +inside-outside OOD profiles lead to nuanced declines in ML model performance, +highlighting the importance of distinguishing between these two cases for +developing effective counter-OOD methods. + +
+
+
+
+
+ + ☆ GPT vs RETRO: Exploring the Intersection of Retrieval and + Parameter-Efficient Fine-Tuning + + +
+ Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation
+(RAG) have become popular methods for adapting large language models while
+minimizing compute requirements. In this paper, we apply PEFT methods
+(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer
+(RETRO) and a baseline GPT model across several sizes, ranging from 823 million
+to 48 billion parameters. We show that RETRO models outperform GPT models in
+zero-shot settings due to their unique pre-training process but GPT models have
+higher performance potential with PEFT. Additionally, our study indicates that
+8B parameter models strike an optimal balance between cost and performance and
+P-tuning lags behind other PEFT techniques. We further provide a comparative
+analysis between applying PEFT to an instruction-tuned RETRO model and a base
+RETRO model. This work presents the first comprehensive comparison of various
+PEFT methods integrated with RAG, applied to both GPT and RETRO models,
+highlighting their relative performance.
+
+
+
+
+
+
+ + ☆ Enhancing learning in artificial neural networks through cellular + heterogeneity and neuromodulatory signaling + + +
+ Recent progress in artificial intelligence (AI) has been driven by insights +from neuroscience, particularly with the development of artificial neural +networks (ANNs). This has significantly enhanced the replication of complex +cognitive tasks such as vision and natural language processing. Despite these +advances, ANNs struggle with continual learning, adaptable knowledge transfer, +robustness, and resource efficiency - capabilities that biological systems +handle seamlessly. Specifically, ANNs often overlook the functional and +morphological diversity of the brain, hindering their computational +capabilities. Furthermore, incorporating cell-type specific neuromodulatory +effects into ANNs with neuronal heterogeneity could enable learning at two +spatial scales: spiking behavior at the neuronal level, and synaptic plasticity +at the circuit level, thereby potentially enhancing their learning abilities. +In this article, we summarize recent bio-inspired models, learning rules and +architectures and propose a biologically-informed framework for enhancing ANNs. +Our proposed dual-framework approach highlights the potential of spiking neural +networks (SNNs) for emulating diverse spiking behaviors and dendritic +compartments to simulate morphological and functional diversity of neuronal +computations. Finally, we outline how the proposed approach integrates +brain-inspired compartmental models and task-driven SNNs, balances +bioinspiration and complexity, and provides scalable solutions for pressing AI +challenges, such as continual learning, adaptability, robustness, and +resource-efficiency. + +
+
+ comment: 34 pages, 4 figures, 3 boxes +
+
+
+
+
+ + ☆ Graph Reinforcement Learning in Power Grids: A Survey + + +
+ The challenges posed by renewable energy and distributed electricity
+generation motivate the development of deep learning approaches to overcome the
+lack of flexibility of traditional methods in power grid use cases. The
+application of graph neural networks (GNNs) is particularly promising due to
+their ability to learn from graph-structured data present in power grids.
+Combined with reinforcement learning (RL), they can serve as control approaches
+to determine remedial grid actions. This review analyses the ability of graph
+reinforcement learning (GRL) to capture the inherent graph structure of power
+grids to improve representation learning and decision making in different power
+grid use cases. It distinguishes between common problems in transmission and
+distribution grids and explores the synergy between RL and GNNs. In
+transmission grids, GRL typically addresses automated grid management and
+topology control, whereas on the distribution side, GRL concentrates more on
+voltage regulation. We analyzed the selected papers based on their graph
+structure and GNN model, the applied RL algorithm, and their overall
+contributions. Although GRL demonstrates adaptability in the face of
+unpredictable events and noisy or incomplete data, it primarily serves as a
+proof of concept at this stage. There are multiple open challenges and
+limitations that need to be addressed when considering the application of RL to
+real power grid operation.
+
+
+
+
+
+
+ + ☆ Unified continuous-time q-learning for mean-field game and mean-field + control problems + + +
+ This paper studies the continuous-time q-learning in the mean-field +jump-diffusion models from the representative agent's perspective. To overcome +the challenge when the population distribution may not be directly observable, +we introduce the integrated q-function in decoupled form (decoupled +Iq-function) and establish its martingale characterization together with the +value function, which provides a unified policy evaluation rule for both +mean-field game (MFG) and mean-field control (MFC) problems. Moreover, +depending on the task to solve the MFG or MFC problem, we can employ the +decoupled Iq-function by different means to learn the mean-field equilibrium +policy or the mean-field optimal policy respectively. As a result, we devise a +unified q-learning algorithm for both MFG and MFC problems by utilizing all +test policies stemming from the mean-field interactions. For several examples +in the jump-diffusion setting, within and beyond the LQ framework, we can +obtain the exact parameterization of the decoupled Iq-functions and the value +functions, and illustrate our algorithm from the representative agent's +perspective with satisfactory performance. + +
+
+
+
+
+ + ☆ G-Adaptive mesh refinement -- leveraging graph neural networks and + differentiable finite element solvers + + +
+ We present a novel, and effective, approach to the long-standing problem of +mesh adaptivity in finite element methods (FEM). FE solvers are powerful tools +for solving partial differential equations (PDEs), but their cost and accuracy +are critically dependent on the choice of mesh points. To keep computational +costs low, mesh relocation (r-adaptivity) seeks to optimise the position of a +fixed number of mesh points to obtain the best FE solution accuracy. Classical +approaches to this problem require the solution of a separate nonlinear +"meshing" PDE to find the mesh point locations. This incurs significant cost at +remeshing and relies on certain a-priori assumptions and guiding heuristics for +optimal mesh point location. Recent machine learning approaches to r-adaptivity +have mainly focused on the construction of fast surrogates for such classical +methods. Our new approach combines a graph neural network (GNN) powered +architecture, with training based on direct minimisation of the FE solution +error with respect to the mesh point locations. The GNN employs graph neural +diffusion (GRAND), closely aligning the mesh solution space to that of +classical meshing methodologies, thus replacing heuristics with a learnable +strategy, and providing a strong inductive bias. This allows for rapid and +robust training and results in an extremely efficient and effective GNN +approach to online r-adaptivity. This method outperforms classical and prior ML +approaches to r-adaptive meshing on the test problems we consider, in +particular achieving lower FE solution error, whilst retaining the significant +speed-up over classical methods observed in prior ML work. + +
+
+
+
+
+ + ☆ LayerShuffle: Enhancing Robustness in Vision Transformers by Randomizing + Layer Execution Order + + +
+ Due to their architecture and how they are trained, artificial neural
+networks are typically not robust toward pruning, replacing, or shuffling
+layers at test time. However, such properties would be desirable for different
+applications, such as distributed neural network architectures where the order
+of execution cannot be guaranteed or parts of the network can fail during
+inference. In this work, we address these issues through a number of proposed
+training approaches for vision transformers whose most important component is
+randomizing the execution order of attention modules at training time. We show
+that with our proposed approaches, vision transformers are indeed capable of
+adapting to arbitrary layer execution orders at test time, assuming one
+tolerates a reduction (about 20%) in accuracy at the same model size. We also
+find that our trained models can be randomly merged with each other, resulting
+in functional ("Frankenstein") models without loss of performance compared to
+the source models. Finally, we layer-prune our models at test time and find
+that their performance declines gracefully.
+
+
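+The central training trick can be sketched in a few lines (a hypothetical PyTorch
+example, not the authors' code): the same stack of encoder blocks is executed in a
+freshly shuffled order at every training step.
+
+```python
+import random
+import torch
+import torch.nn as nn
+
+class ShuffledEncoder(nn.Module):
+    """Runs the same stack of encoder blocks in a random order at every
+    training step, so no block can rely on a fixed position in the network."""
+    def __init__(self, dim=64, depth=6, heads=4):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            nn.TransformerEncoderLayer(dim, heads, batch_first=True)
+            for _ in range(depth))
+
+    def forward(self, x):
+        order = list(range(len(self.blocks)))
+        if self.training:
+            random.shuffle(order)          # randomized execution order
+        for i in order:
+            x = self.blocks[i](x)
+        return x
+
+model = ShuffledEncoder()
+print(model(torch.randn(2, 16, 64)).shape)   # (batch, tokens, dim)
+```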
+
+
+
+
+ + ☆ Few-Shot Airway-Tree Modeling using Data-Driven Sparse Priors + + +
+ The lack of large annotated datasets in medical imaging is an intrinsic
+burden for supervised Deep Learning (DL) segmentation models. Few-shot learning
+approaches are cost-effective solutions to transfer pre-trained models using
+only limited annotated data. However, such methods can be prone to overfitting
+due to limited data diversity, especially when segmenting complex, diverse, and
+sparse tubular structures like airways. Furthermore, crafting informative image
+representations has played a crucial role in medical imaging, enabling
+discriminative enhancement of anatomical details. In this paper, we initially
+train a data-driven sparsification module to enhance airways efficiently in
+lung CT scans. We then incorporate these sparse representations in a standard
+supervised segmentation pipeline as a pretraining step to enhance the
+performance of the DL models. Results presented on the ATM public challenge
+cohort show the effectiveness of using sparse priors in pre-training, leading
+to segmentation Dice score increases of 1% and 10% in full-scale and few-shot
+learning scenarios, respectively.
+
+
+
+ comment: Accepted at 21st IEEE International Symposium on Biomedical Imaging + (ISBI) +
+
+
+
+
+
+ ☆ Speed-accuracy trade-off for the diffusion models: Wisdom from
+ nonequilibrium thermodynamics and optimal transport
+
+
+
+ We discuss a connection between a generative model, called the diffusion +model, and nonequilibrium thermodynamics for the Fokker-Planck equation, called +stochastic thermodynamics. Based on the techniques of stochastic +thermodynamics, we derive the speed-accuracy trade-off for the diffusion +models, which is a trade-off relationship between the speed and accuracy of +data generation in diffusion models. Our result implies that the entropy +production rate in the forward process affects the errors in data generation. +From a stochastic thermodynamic perspective, our results provide quantitative +insight into how best to generate data in diffusion models. The optimal +learning protocol is introduced by the conservative force in stochastic +thermodynamics and the geodesic of space by the 2-Wasserstein distance in +optimal transport theory. We numerically illustrate the validity of the +speed-accuracy trade-off for the diffusion models with different noise +schedules such as the cosine schedule, the conditional optimal transport, and +the optimal transport. + +
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ PROUD: PaRetO-gUided Diffusion Model for Multi-objective Generation + + +
+ Recent advancements in the realm of deep generative models focus on +generating samples that satisfy multiple desired properties. However, prevalent +approaches optimize these property functions independently, thus omitting the +trade-offs among them. In addition, the property optimization is often +improperly integrated into the generative models, resulting in an unnecessary +compromise on generation quality (i.e., the quality of generated samples). To +address these issues, we formulate a constrained optimization problem. It seeks +to optimize generation quality while ensuring that generated samples reside at +the Pareto front of multiple property objectives. Such a formulation enables +the generation of samples that cannot be further improved simultaneously on the +conflicting property functions and preserves good quality of generated samples. +Building upon this formulation, we introduce the PaRetO-gUided Diffusion model +(PROUD), wherein the gradients in the denoising process are dynamically +adjusted to enhance generation quality while the generated samples adhere to +Pareto optimality. Experimental evaluations on image generation and protein +generation tasks demonstrate that our PROUD consistently maintains superior +generation quality while approaching Pareto optimality across multiple property +functions compared to various baselines. + +
+
+
+
+
+ + ☆ Better by Default: Strong Pre-Tuned MLPs and Boosted Trees on Tabular + Data + + +
+ For classification and regression on tabular data, the dominance of +gradient-boosted decision trees (GBDTs) has recently been challenged by often +much slower deep learning methods with extensive hyperparameter tuning. We +address this discrepancy by introducing (a) RealMLP, an improved multilayer +perceptron (MLP), and (b) improved default parameters for GBDTs and RealMLP. We +tune RealMLP and the default parameters on a meta-train benchmark with 71 +classification and 47 regression datasets and compare them to +hyperparameter-optimized versions on a disjoint meta-test benchmark with 48 +classification and 42 regression datasets, as well as the GBDT-friendly +benchmark by Grinsztajn et al. (2022). Our benchmark results show that RealMLP +offers a better time-accuracy tradeoff than other neural nets and is +competitive with GBDTs. Moreover, a combination of RealMLP and GBDTs with +improved default parameters can achieve excellent results on medium-sized +tabular datasets (1K--500K samples) without hyperparameter tuning. + +
+
+ comment: 10 pages + 44 pages appendix. Code is available at + github.com/dholzmueller/pytabkit and + github.com/LeoGrin/tabular-benchmark/tree/better_by_default +
+
+
+
+
+ + ☆ Leveraging Graph Structures to Detect Hallucinations in Large Language + Models + + +
+ Large language models are extensively applied across a wide range of tasks, +such as customer support, content creation, educational tutoring, and providing +financial guidance. However, a well-known drawback is their predisposition to +generate hallucinations. This damages the trustworthiness of the information +these models provide, impacting decision-making and user confidence. We propose +a method to detect hallucinations by looking at the structure of the latent +space and finding associations within hallucinated and non-hallucinated +generations. We create a graph structure that connects generations that lie +closely in the embedding space. Moreover, we employ a Graph Attention Network +which utilizes message passing to aggregate information from neighboring nodes +and assigns varying degrees of importance to each neighbor based on their +relevance. Our findings show that 1) there exists a structure in the latent +space that differentiates between hallucinated and non-hallucinated +generations, 2) Graph Attention Networks can learn this structure and +generalize it to unseen generations, and 3) the robustness of our method is +enhanced when incorporating contrastive learning. When evaluated against +evidence-based benchmarks, our model performs similarly without access to +search-based methods. + +
+
+
+
+
+ + ☆ Using Petri Nets as an Integrated Constraint Mechanism for Reinforcement + Learning Tasks + + +
+ The lack of trust in algorithms is usually an issue when using Reinforcement +Learning (RL) agents for control in real-world domains such as production +plants, autonomous vehicles, or traffic-related infrastructure, partly due to +the lack of verifiability of the model itself. In such scenarios, Petri nets +(PNs) are often available for flowcharts or process steps, as they are +versatile and standardized. In order to facilitate integration of RL models and +as a step towards increasing AI trustworthiness, we propose an approach that +uses PNs with three main advantages over typical RL approaches: Firstly, the +agent can now easily be modeled with a combined state including both external +environmental observations and agent-specific state information from a given +PN. Secondly, we can enforce constraints for state-dependent actions through +the inherent PN model. And lastly, we can increase trustworthiness by verifying +PN properties through techniques such as model checking. We test our approach +on a typical four-way intersection traffic light control setting and present +our results, beating cycle-based baselines. + +
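+A small, self-contained sketch of how a Petri net can act as the constraint
+mechanism (the toy marking and transitions are made up, not taken from the paper):
+the set of enabled transitions at the current marking becomes the action mask
+handed to the RL agent.
+
+```python
+def enabled_transitions(marking, pre):
+    """A transition (action) is enabled only if every input place holds enough
+    tokens; pre[t][p] is the number of tokens t consumes from place p."""
+    return [t for t, needs in pre.items()
+            if all(marking.get(p, 0) >= n for p, n in needs.items())]
+
+def fire(marking, pre, post, t):
+    """Apply transition t: consume tokens from input places, add to outputs."""
+    m = dict(marking)
+    for p, n in pre[t].items():
+        m[p] = m[p] - n
+    for p, n in post[t].items():
+        m[p] = m.get(p, 0) + n
+    return m
+
+# toy traffic-light-style net: "go_NS" and "go_EW" are mutually exclusive
+pre  = {"go_NS": {"free": 1}, "go_EW": {"free": 1}, "reset": {"busy": 1}}
+post = {"go_NS": {"busy": 1}, "go_EW": {"busy": 1}, "reset": {"free": 1}}
+marking = {"free": 1, "busy": 0}
+print(enabled_transitions(marking, pre))    # the RL agent may only pick from this mask
+marking = fire(marking, pre, post, "go_NS")
+print(enabled_transitions(marking, pre))    # only 'reset' remains enabled
+```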
+
+
+
+
+ + ☆ LoCo: Low-Bit Communication Adaptor for Large-scale Model Training + + +
+ To efficiently train large-scale models, low-bit gradient communication
+compresses full-precision gradients on local GPU nodes into low-precision ones
+for higher gradient synchronization efficiency among GPU nodes. However, it
+often degrades training quality due to compression information loss. To address
+this, we propose the Low-bit Communication Adaptor (LoCo), which compensates
+gradients on local GPU nodes before compression, ensuring efficient
+synchronization without compromising training quality. Specifically, LoCo
+maintains a moving average of historical compensation errors to stably estimate
+the concurrent compression error and then adopts it to compensate for the
+concurrent gradient compression, yielding a less lossy compression. This
+mechanism allows it to be compatible with general optimizers like Adam and
+sharding strategies like FSDP. Theoretical analysis shows that integrating LoCo
+into full-precision optimizers like Adam and SGD does not impair their
+convergence speed on nonconvex problems. Experimental results show that across
+large-scale model training frameworks like Megatron-LM and PyTorch's FSDP, LoCo
+significantly improves communication efficiency, e.g., improving Adam's
+training speed by 14% to 40% without performance degradation on large language
+models like LLaMA and MoE models.
+
+
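+A simplified sketch of the compensation mechanism (uniform quantization stands in
+for whatever low-bit codec the communication backend actually uses; this is an
+illustration, not the LoCo implementation):
+
+```python
+import torch
+
+def compensated_compress(grad, error_ema, bits=4, beta=0.9):
+    """Add a moving average of past compression errors to the fresh gradient
+    before low-bit quantization, then update the average with the new error."""
+    compensated = grad + error_ema                      # compensate before compressing
+    scale = compensated.abs().max() / (2 ** (bits - 1) - 1) + 1e-12
+    q = torch.clamp(torch.round(compensated / scale),
+                    -2 ** (bits - 1), 2 ** (bits - 1) - 1)
+    dequant = q * scale                                 # what the other nodes will see
+    new_error = compensated - dequant
+    error_ema = beta * error_ema + (1 - beta) * new_error
+    return dequant, error_ema
+
+# one running error buffer per parameter tensor, kept on each GPU node
+err = torch.zeros(1024)
+for _ in range(5):
+    g_hat, err = compensated_compress(torch.randn(1024), err)
+print(err.abs().mean().item())
+```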
+
+
+
+
+ + ☆ Rethinking Data Input for Point Cloud Upsampling + + +
+ In recent years, point cloud upsampling has been widely applied in fields
+such as 3D reconstruction and surface generation. However, existing point cloud
+upsampling methods all rely on patch-based inputs, and no prior work has
+discussed the differences and principles between feeding the full point cloud
+model and patch-based input. To enable a comparison with patch-based input,
+this article proposes a new data input method that divides the full point cloud
+model so as to preserve shape integrity while training PU-GCN. The approach is
+validated on the PU1K and ABC datasets, but the results show that patch-based
+input performs better than full-model input (i.e., Average Segment input). The
+article therefore explores the data input factors and model modules that affect
+point cloud upsampling results.
+
+
+
+ comment: 16 pages, 6 figures +
+
+
+
+
+ + ☆ EventChat: Implementation and user-centric evaluation of a large + language model-driven conversational recommender system for exploring leisure + events in an SME context + + +
+ Large language models (LLMs) present an enormous evolution in the strategic
+potential of conversational recommender systems (CRS). Yet to date, research
+has predominantly focused upon technical frameworks to implement LLM-driven
+CRS, rather than end-user evaluations or strategic implications for firms,
+particularly from the perspective of the small to medium enterprises (SMEs)
+that make up the bedrock of the global economy. In the current paper, we detail
+the design of an LLM-driven CRS in an SME setting, and its subsequent
+performance in the field using both objective system metrics and subjective
+user evaluations. While doing so, we additionally outline a short-form revised
+ResQue model for evaluating LLM-driven CRS, enabling replicability in a rapidly
+evolving field. Our results reveal good system performance from a user
+experience perspective (85.5% recommendation accuracy) but underscore latency,
+cost, and quality issues challenging business viability. Notably, with a median
+cost of $0.04 per interaction and a latency of 5.7s, cost-effectiveness and
+response time emerge as crucial areas for achieving a more user-friendly and
+economically viable LLM-driven CRS for SME settings. One major driver of these
+costs is the use of an advanced LLM as a ranker within the retrieval-augmented
+generation (RAG) technique. Our results additionally indicate that relying
+solely on approaches such as prompt-based learning with ChatGPT as the
+underlying LLM makes it challenging to achieve satisfactory quality in a
+production environment. Strategic considerations for SMEs deploying an
+LLM-driven CRS are outlined, particularly considering trade-offs in the current
+technical landscape.
+
+
+
+ comment: 27 pages, 3 tables, 5 figures, pre-print manuscript +
+
+
+
+
+ + ☆ Smart Sampling: Helping from Friendly Neighbors for Decentralized + Federated Learning + + +
+ Federated Learning (FL) is gaining widespread interest for its ability to +share knowledge while preserving privacy and reducing communication costs. +Unlike Centralized FL, Decentralized FL (DFL) employs a network architecture +that eliminates the need for a central server, allowing direct communication +among clients and leading to significant communication resource savings. +However, due to data heterogeneity, not all neighboring nodes contribute to +enhancing the local client's model performance. In this work, we introduce +\textbf{\emph{AFIND+}}, a simple yet efficient algorithm for sampling and +aggregating neighbors in DFL, with the aim of leveraging collaboration to +improve clients' model performance. AFIND+ identifies helpful neighbors, +adaptively adjusts the number of selected neighbors, and strategically +aggregates the sampled neighbors' models based on their contributions. +Numerical results on real-world datasets with diverse data partitions +demonstrate that AFIND+ outperforms other sampling algorithms in DFL and is +compatible with most existing DFL optimization algorithms. + +
+
+
+
+
+ + ☆ Hindsight Preference Learning for Offline Preference-based Reinforcement + Learning + + +
+ Offline preference-based reinforcement learning (RL), which focuses on +optimizing policies using human preferences between pairs of trajectory +segments selected from an offline dataset, has emerged as a practical avenue +for RL applications. Existing works rely on extracting step-wise reward signals +from trajectory-wise preference annotations, assuming that preferences +correlate with the cumulative Markovian rewards. However, such methods fail to +capture the holistic perspective of data annotation: Humans often assess the +desirability of a sequence of actions by considering the overall outcome rather +than the immediate rewards. To address this challenge, we propose to model +human preferences using rewards conditioned on future outcomes of the +trajectory segments, i.e. the hindsight information. For downstream RL +optimization, the reward of each step is calculated by marginalizing over +possible future outcomes, the distribution of which is approximated by a +variational auto-encoder trained using the offline dataset. Our proposed +method, Hindsight Preference Learning (HPL), can facilitate credit assignment +by taking full advantage of vast trajectory data available in massive unlabeled +datasets. Comprehensive empirical studies demonstrate the benefits of HPL in +delivering robust and advantageous rewards across various domains. Our code is +publicly released at https://github.com/typoverflow/WiseRL. + +
+
+
+
+
+ + ☆ Multi-modal Masked Siamese Network Improves Chest X-Ray Representation + Learning + + +
+ Self-supervised learning methods for medical images primarily rely on the +imaging modality during pretraining. While such approaches deliver promising +results, they do not leverage associated patient or scan information collected +within Electronic Health Records (EHR). Here, we propose to incorporate EHR +data during self-supervised pretraining with a Masked Siamese Network (MSN) to +enhance the quality of chest X-ray representations. We investigate three types +of EHR data, including demographic, scan metadata, and inpatient stay +information. We evaluate our approach on three publicly available chest X-ray +datasets, MIMIC-CXR, CheXpert, and NIH-14, using two vision transformer (ViT) +backbones, specifically ViT-Tiny and ViT-Small. In assessing the quality of the +representations via linear evaluation, our proposed method demonstrates +significant improvement compared to vanilla MSN and state-of-the-art +self-supervised learning baselines. Our work highlights the potential of +EHR-enhanced self-supervised pre-training for medical imaging. The code is +publicly available at: https://github.com/nyuad-cai/CXR-EHR-MSN + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Wavelet-based Temporal Attention Improves Traffic Forecasting + + +
+ Spatio-temporal forecasting of traffic flow data represents a typical problem +in the field of machine learning, impacting urban traffic management systems. +Traditional statistical and machine learning methods cannot adequately handle +both the temporal and spatial dependencies in these complex traffic flow +datasets. A prevalent approach in the field is to combine graph convolutional +networks and multi-head attention mechanisms for spatio-temporal processing. +This paper proposes a wavelet-based temporal attention model, namely a +wavelet-based dynamic spatio-temporal aware graph neural network (W-DSTAGNN), +for tackling the traffic forecasting problem. Benchmark experiments using +several statistical metrics confirm that our proposal efficiently captures +spatio-temporal correlations and outperforms ten state-of-the-art models on +three different real-world traffic datasets. Our proposed ensemble data-driven +method can handle dynamic temporal and spatial dependencies and make long-term +forecasts in an efficient manner. + +
+
+
+
+
+ + ☆ Enabling On-Device LLMs Personalization with Smartphone Sensing + + +
+ This demo presents a novel end-to-end framework that combines on-device large +language models (LLMs) with smartphone sensing technologies to achieve +context-aware and personalized services. The framework addresses critical +limitations of current personalization solutions via cloud-based LLMs, such as +privacy concerns, latency and cost, and limited personal sensor data. To +achieve this, we innovatively proposed deploying LLMs on smartphones with +multimodal sensor data and customized prompt engineering, ensuring privacy and +enhancing personalization performance through context-aware sensing. A case +study involving a university student demonstrated the proposed framework's +capability to provide tailored recommendations. In addition, we show that the +proposed framework achieves the best trade-off in privacy, performance, +latency, cost, battery and energy consumption between on-device and cloud LLMs. +Future work aims to integrate more diverse sensor data and conduct large-scale +user studies to further refine the personalization. We envision the proposed +framework could significantly improve user experiences in various domains such +as healthcare, productivity, and entertainment by providing secure, +context-aware, and efficient interactions directly on users' devices. + +
+
+ comment: 5 pages, 3 figures, conference demo paper +
+
+
+
+
+ + ☆ Trustworthy Classification through Rank-Based Conformal Prediction Sets + + +
+ Machine learning classification tasks often benefit from predicting a set of +possible labels with confidence scores to capture uncertainty. However, +existing methods struggle with the high-dimensional nature of the data and the +lack of well-calibrated probabilities from modern classification models. We +propose a novel conformal prediction method that employs a rank-based score +function suitable for classification models that predict the order of labels +correctly, even if not well-calibrated. Our approach constructs prediction sets +that achieve the desired coverage rate while managing their size. We provide a +theoretical analysis of the expected size of the conformal prediction sets +based on the rank distribution of the underlying classifier. Through extensive +experiments, we demonstrate that our method outperforms existing techniques on +various datasets, providing reliable uncertainty quantification. Our +contributions include a novel conformal prediction method, theoretical +analysis, and empirical evaluation. This work advances the practical deployment +of machine learning systems by enabling reliable uncertainty quantification. + +
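+A compact sketch of a rank-based conformal predictor in this spirit (illustrative
+only; the paper's exact score and quantile handling may differ): the nonconformity
+score of a label is its rank under the predicted probabilities, so only the
+ordering of the model's scores matters, not their calibration.
+
+```python
+import numpy as np
+
+def rank_conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+    """The nonconformity score of a label is its rank under the predicted
+    probabilities (0 = top-1), so only the ordering of the scores matters."""
+    order = np.argsort(-cal_probs, axis=1)
+    ranks = np.array([np.where(order[i] == cal_labels[i])[0][0]
+                      for i in range(len(cal_labels))])
+    n = len(ranks)
+    k = int(np.ceil((n + 1) * (1 - alpha))) - 1          # conformal quantile index
+    q = np.sort(ranks)[min(k, n - 1)]
+    test_order = np.argsort(-test_probs, axis=1)
+    return [set(test_order[i, :int(q) + 1]) for i in range(len(test_probs))]
+
+rng = np.random.default_rng(0)
+cal_p = rng.dirichlet(np.ones(10), size=500)             # stand-in "probabilities"
+cal_y = rng.integers(0, 10, 500)
+test_p = rng.dirichlet(np.ones(10), size=5)
+print([len(s) for s in rank_conformal_sets(cal_p, cal_y, test_p)])
+```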
+
+
+
+
+ + ☆ On Quantum Channel Learning + + +
+ The problem of an optimal mapping between Hilbert spaces $IN$ and $OUT$,
+based on a series of density matrix mapping measurements $\rho^{(l)} \to
+\varrho^{(l)}$, $l=1\dots M$, is formulated as an optimization problem
+maximizing the total fidelity $\mathcal{F}=\sum_{l=1}^{M} \omega^{(l)}
+F\left(\varrho^{(l)},\sum_s B_s \rho^{(l)} B^{\dagger}_s\right)$ subject to
+probability preservation constraints on the Kraus operators $B_s$. For
+$F(\varrho,\sigma)$ of a form such that the total fidelity can be represented
+as a quadratic form with a superoperator, $\mathcal{F}=\sum_s\left\langle
+B_s\middle|S\middle| B_s \right\rangle$ (either exactly or as an
+approximation), an iterative algorithm is developed to find the global maximum.
+The result comprises $N_s$ operators $B_s$ that collectively form an $IN$ to
+$OUT$ quantum channel $A^{OUT}=\sum_s B_s A^{IN} B_s^{\dagger}$. The work
+introduces two important generalizations of unitary learning: 1. $IN$/$OUT$
+states are represented as density matrices. 2. The mapping itself is formulated
+as a general quantum channel. This marks a crucial advancement from the
+commonly studied unitary mapping of pure states $\phi_l=\mathcal{U} \psi_l$ to
+a general quantum channel, which allows us to distinguish a probabilistic
+mixture of states from their superposition. An application of the approach is
+demonstrated on unitary learning of the density matrix mapping
+$\varrho^{(l)}=\mathcal{U} \rho^{(l)} \mathcal{U}^{\dagger}$, in which case a
+fidelity quadratic in $\mathcal{U}$ can be constructed by considering the
+$\sqrt{\rho^{(l)}} \to \sqrt{\varrho^{(l)}}$ mapping, and on a general quantum
+channel of Kraus rank $N_s$, where the fidelity quadratic in $B_s$ is an
+approximation -- a quantum channel is then built as a hierarchy of unitary
+mappings. The approach can be applied to study decoherence effects, spontaneous
+coherence, synchronization, etc.
+
+
+
+ comment: The unitary learning from arXiv:2405.10263 is generalized to density + matrices and quantum channels +
+
+
+
+
+ + ☆ Discovering symbolic expressions with parallelized tree search + + +
+ Symbolic regression plays a crucial role in modern scientific research thanks
+to its capability of discovering concise and interpretable mathematical
+expressions from data. A grand challenge lies in the arduous search for
+parsimonious and generalizable mathematical formulas, in an infinite search
+space, while intending to fit the training data. For over a decade, existing
+algorithms have faced a critical bottleneck in accuracy and efficiency when
+handling complex problems, which essentially hinders the pace of applying
+symbolic regression for scientific exploration across interdisciplinary
+domains. To this end, we introduce a parallelized tree search (PTS) model to
+efficiently distill generic mathematical expressions from limited data. Through
+a series of extensive experiments, we demonstrate the superior accuracy and
+efficiency of PTS for equation discovery, which greatly outperforms the
+state-of-the-art baseline models on over 80 synthetic and experimental datasets
+(e.g., lifting its performance by up to 99% accuracy improvement and a
+one-order-of-magnitude speed-up). PTS represents a key advance in accurate and
+efficient data-driven discovery of symbolic, interpretable models (e.g.,
+underlying physical laws) and marks a pivotal transition towards scalable
+symbolic learning.
+
+
+
+
+
+
+ + ☆ Hard-Attention Gates with Gradient Routing for Endoscopic Image + Computing + + +
+ To address overfitting and enhance model generalization in +gastroenterological polyp size assessment, our study introduces +Feature-Selection Gates (FSG) or Hard-Attention Gates (HAG) alongside Gradient +Routing (GR) for dynamic feature selection. This technique aims to boost +Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) by +promoting sparse connectivity, thereby reducing overfitting and enhancing +generalization. HAG achieves this through sparsification with learnable +weights, serving as a regularization strategy. GR further refines this process +by optimizing HAG parameters via dual forward passes, independently from the +main model, to improve feature re-weighting. Our evaluation spanned multiple +datasets, including CIFAR-100 for a broad impact assessment and specialized +endoscopic datasets (REAL-Colon, Misawa, and SUN) focusing on polyp size +estimation, covering over 200 polyps in more than 370,000 frames. The findings +indicate that our HAG-enhanced networks substantially enhance performance in +both binary and triclass classification tasks related to polyp sizing. +Specifically, CNNs experienced an F1 Score improvement to 87.8% in binary +classification, while in triclass classification, the ViT-T model reached an F1 +Score of 76.5%, outperforming traditional CNNs and ViT-T models. To facilitate +further research, we are releasing our codebase, which includes implementations +for CNNs, multistream CNNs, ViT, and HAG-augmented variants. This resource aims +to standardize the use of endoscopic datasets, providing public +training-validation-testing splits for reliable and comparable research in +gastroenterological polyp size estimation. The codebase is available at +github.com/cosmoimd/feature-selection-gates. + +
+
+ comment: Attention Gates, Hard-Attention Gates, Gradient Routing, Feature + Selection Gates, Endoscopy, Medical Image Processing, Computer Vision +
+
+
+
+
+ + ☆ Function Smoothing Regularization for Precision Factorization Machine + Annealing in Continuous Variable Optimization Problems + + +
+ Solving continuous variable optimization problems by factorization machine +quantum annealing (FMQA) demonstrates the potential of Ising machines to be +extended as a solver for integer and real optimization problems. However, the +details of the Hamiltonian function surface obtained by factorization machine +(FM) have been overlooked. This study shows that in the widely common case +where real numbers are represented by a combination of binary variables, the +function surface of the Hamiltonian obtained by FM can be very noisy. This +noise interferes with the inherent capabilities of quantum annealing and is +likely to be a substantial cause of problems previously considered unsolvable +due to the limitations of FMQA performance. The origin of the noise is +identified and a simple, general method is proposed to prevent its occurrence. +The generalization performance of the proposed method and its ability to solve +practical problems is demonstrated. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Regulating Model Reliance on Non-Robust Features by Smoothing Input + Marginal Density + + +
+ Trustworthy machine learning necessitates meticulous regulation of model +reliance on non-robust features. We propose a framework to delineate and +regulate such features by attributing model predictions to the input. Within +our approach, robust feature attributions exhibit a certain consistency, while +non-robust feature attributions are susceptible to fluctuations. This behavior +allows identification of correlation between model reliance on non-robust +features and smoothness of marginal density of the input samples. Hence, we +uniquely regularize the gradients of the marginal density w.r.t. the input +features for robustness. We also devise an efficient implementation of our +regularization to address the potential numerical instability of the underlying +optimization process. Moreover, we analytically reveal that, as opposed to our +marginal density smoothing, the prevalent input gradient regularization +smoothens conditional or joint density of the input, which can cause limited +robustness. Our experiments validate the effectiveness of the proposed method, +providing clear evidence of its capability to address the feature leakage +problem and mitigate spurious correlations. Extensive results further establish +that our technique enables the model to exhibit robustness against +perturbations in pixel values, input gradients, and density. + +
+
+
+
+
+ + ☆ An Adaptive Stochastic Gradient Method with Non-negative Gauss-Newton + Stepsizes + + +
+ We consider the problem of minimizing the average of a large number of smooth
+but possibly non-convex functions. In the context of most machine learning
+applications, each loss function is non-negative and thus can be expressed as
+the composition of a square and its real-valued square root. This reformulation
+allows us to apply the Gauss-Newton method, or the Levenberg-Marquardt method
+when adding a quadratic regularization. The resulting algorithm, while being
+computationally as efficient as the vanilla stochastic gradient method, is
+highly adaptive and can automatically warm up and decay the effective stepsize
+while tracking the non-negative loss landscape. We provide a tight convergence
+analysis, leveraging new techniques, in the stochastic convex and non-convex
+settings. In particular, in the convex case, the method does not require access
+to the gradient Lipschitz constant for convergence, and is guaranteed to never
+diverge. The convergence rates and empirical evaluations compare favorably to
+the classical (stochastic) gradient method as well as to several other adaptive
+methods.
+
+
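+A hedged sketch of the kind of stepsize this reformulation suggests (derived here
+for a scalar non-negative loss written as f = r^2 with r = sqrt(f); the paper's
+exact update may differ): a Levenberg-Marquardt step on r yields an effective
+stepsize of roughly 2f / (||grad f||^2 + 4*lam*f), which warms up when the loss is
+large and decays automatically as the loss approaches zero.
+
+```python
+import numpy as np
+
+def ngn_style_sgd(loss_grad, w0, n_steps=500, lam=1.0):
+    """Adaptive step in the Gauss-Newton spirit for a non-negative loss f:
+    step ~ 2f / (||grad f||^2 + 4*lam*f), large when the loss is large and
+    automatically decaying as the loss approaches zero (illustrative formula)."""
+    w = w0.copy()
+    for _ in range(n_steps):
+        f, g = loss_grad(w)                              # stochastic loss value and gradient
+        step = 2.0 * f / (np.dot(g, g) + 4.0 * lam * f + 1e-12)
+        w -= step * g
+    return w
+
+# toy usage: stochastic least squares with mini-batches of size 1
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 5))
+w_true = rng.normal(size=5)
+y = X @ w_true + 0.1 * rng.normal(size=100)
+def loss_grad(w):
+    i = rng.integers(0, 100)
+    r = X[i] @ w - y[i]
+    return r * r, 2.0 * r * X[i]
+print(np.linalg.norm(ngn_style_sgd(loss_grad, np.zeros(5)) - w_true))
+```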
+
+
+
+
+ + ☆ UpStory: the Uppsala Storytelling dataset + + +
+ Friendship and rapport play an important role in the formation of +constructive social interactions, and have been widely studied in educational +settings due to their impact on student outcomes. Given the growing interest in +automating the analysis of such phenomena through Machine Learning (ML), access +to annotated interaction datasets is highly valuable. However, no dataset on +dyadic child-child interactions explicitly capturing rapport currently exists. +Moreover, despite advances in the automatic analysis of human behaviour, no +previous work has addressed the prediction of rapport in child-child dyadic +interactions in educational settings. We present UpStory -- the Uppsala +Storytelling dataset: a novel dataset of naturalistic dyadic interactions +between primary school aged children, with an experimental manipulation of +rapport. Pairs of children aged 8-10 participate in a task-oriented activity: +designing a story together, while being allowed free movement within the play +area. We promote balanced collection of different levels of rapport by using a +within-subjects design: self-reported friendships are used to pair each child +twice, either minimizing or maximizing pair separation in the friendship +network. The dataset contains data for 35 pairs, totalling 3h 40m of audio and +video recordings. It includes two video sources covering the play area, as well +as separate voice recordings for each child. An anonymized version of the +dataset is made publicly available, containing per-frame head pose, body pose, +and face features; as well as per-pair information, including the level of +rapport. Finally, we provide ML baselines for the prediction of rapport. + +
+
+
+
+
+ + ☆ Enhancing Safety for Autonomous Agents in Partly Concealed Urban Traffic + Environments Through Representation-Based Shielding + + +
+ Navigating unsignalized intersections in urban environments poses a complex
+challenge for self-driving vehicles, where issues such as view obstructions,
+unpredictable pedestrian crossings, and diverse traffic participants demand a
+great focus on crash prevention. In this paper, we propose a novel state
+representation for Reinforcement Learning (RL) agents centered around the
+information perceivable by an autonomous agent, enabling the safe navigation of
+previously uncharted road maps. Our approach surpasses several baseline models
+by a significant margin in terms of safety and energy consumption metrics.
+These improvements are achieved while maintaining a competitive average travel
+speed. Our findings pave the way for more robust and reliable autonomous
+navigation strategies, promising safer and more efficient urban traffic
+environments.
+
+
+
+
+
+ + ☆ Geometrically Inspired Kernel Machines for Collaborative Learning Beyond + Gradient Descent + + +
+ This paper develops a novel mathematical framework for collaborative learning
+by means of geometrically inspired kernel machines which includes statements on
+the bounds of generalisation and approximation errors, and sample complexity.
+For classification problems, this approach allows us to learn bounded geometric
+structures around given data points and hence solve the global model learning
+problem in an efficient way by exploiting convexity properties of the related
+optimisation problem in a Reproducing Kernel Hilbert Space (RKHS). In this way,
+we can reduce classification problems to determining the closest bounded
+geometric structure from a given data point. A further advantage of our
+solution is that our approach does not require clients to perform multiple
+epochs of local optimisation using stochastic gradient descent, nor rounds of
+communication between client/server for optimising the global model. We
+highlight that numerous experiments have shown that the proposed method is a
+competitive alternative to the state-of-the-art.
+
+
+
+
+
+ + ☆ Learning Geometric Invariant Features for Classification of Vector + Polygons with Graph Message-passing Neural Network + + +
+ Geometric shape classification of vector polygons remains a non-trivial +learning task in spatial analysis. Previous studies mainly focus on devising +deep learning approaches for representation learning of rasterized vector +polygons, whereas the study of discrete representations of polygons and +subsequent deep learning approaches have not been fully investigated. In this +study, we investigate a graph representation of vector polygons and propose a +novel graph message-passing neural network (PolyMP) to learn the +geometric-invariant features for shape classification of polygons. Through +extensive experiments, we show that the graph representation of polygons +combined with a permutation-invariant graph message-passing neural network +achieves highly robust performances on benchmark datasets (i.e., synthetic +glyph and real-world building footprint datasets) as compared to baseline +methods. We demonstrate that the proposed graph-based PolyMP network enables +the learning of expressive geometric features invariant to geometric +transformations of polygons (i.e., translation, rotation, scaling and shearing) +and is robust to trivial vertex removals of polygons. We further show the +strong generalizability of PolyMP, which enables generalizing the learned +geometric features from the synthetic glyph polygons to the real-world building +footprints. + +
+
+
+
+
+ + ☆ EAGERx: Graph-Based Framework for Sim2real Robot Learning + + +
+ Sim2real, that is, the transfer of learned control policies from simulation +to real world, is an area of growing interest in robotics due to its potential +to efficiently handle complex tasks. The sim2real approach faces challenges due +to mismatches between simulation and reality. These discrepancies arise from +inaccuracies in modeling physical phenomena and asynchronous control, among +other factors. To this end, we introduce EAGERx, a framework with a unified +software pipeline for both real and simulated robot learning. It can support +various simulators and aids in integrating state, action and time-scale +abstractions to facilitate learning. EAGERx's integrated delay simulation, +domain randomization features, and proposed synchronization algorithm +contribute to narrowing the sim2real gap. We demonstrate (in the context of +robot learning and beyond) the efficacy of EAGERx in accommodating diverse +robotic systems and maintaining consistent simulation behavior. EAGERx is open +source and its code is available at https://eagerx.readthedocs.io. + +
+
+ comment: For an introductory video, see + http://www.youtube.com/watch?v=D0CQNnTT010 . The documentation, tutorials, + and our open-source code can be found at http://eagerx.readthedocs.io +
+
+
+
+
+ + ☆ Understanding the Role of Invariance in Transfer Learning + + +
+ Transfer learning is a powerful technique for knowledge-sharing between +different tasks. Recent work has found that the representations of models with +certain invariances, such as to adversarial input perturbations, achieve higher +performance on downstream tasks. These findings suggest that invariance may be +an important property in the context of transfer learning. However, the +relationship of invariance with transfer performance is not fully understood +yet and a number of questions remain. For instance, how important is invariance +compared to other factors of the pretraining task? How transferable is learned +invariance? In this work, we systematically investigate the importance of +representational invariance for transfer learning, as well as how it interacts +with other parameters during pretraining. To do so, we introduce a family of +synthetic datasets that allow us to precisely control factors of variation both +in training and test data. Using these datasets, we a) show that for learning +representations with high transfer performance, invariance to the right +transformations is as, or often more, important than most other factors such as +the number of training samples, the model architecture and the identity of the +pretraining classes, b) show conditions under which invariance can harm the +ability to transfer representations and c) explore how transferable invariance +is between tasks. The code is available at +\url{https://github.com/tillspeicher/representation-invariance-transfer}. + +
+
+ comment: Published at TMLR 2024 +
+
+
+
+
+ + ☆ SSP-GNN: Learning to Track via Bilevel Optimization + + +
+ We propose a graph-based tracking formulation for multi-object tracking (MOT) +where target detections contain kinematic information and re-identification +features (attributes). Our method applies a successive shortest paths (SSP) +algorithm to a tracking graph defined over a batch of frames. The edge costs in +this tracking graph are computed via a message-passing network, a graph neural +network (GNN) variant. The parameters of the GNN, and hence, the tracker, are +learned end-to-end on a training set of example ground-truth tracks and +detections. Specifically, learning takes the form of bilevel optimization +guided by our novel loss function. We evaluate our algorithm on simulated +scenarios to understand its sensitivity to scenario aspects and model +hyperparameters. Across varied scenario complexities, our method compares +favorably to a strong baseline. + +
+
+
+
+
+ + ☆ Crafting Large Language Models for Enhanced Interpretability ICML 2024 + + +
+ We introduce the Concept Bottleneck Large Language Model (CB-LLM), a +pioneering approach to creating inherently interpretable Large Language Models +(LLMs). Unlike traditional black-box LLMs that rely on post-hoc interpretation +methods with limited neuron function insights, CB-LLM sets a new standard with +its built-in interpretability, scalability, and ability to provide clear, +accurate explanations. This innovation not only advances transparency in +language models but also enhances their effectiveness. Our unique Automatic +Concept Correction (ACC) strategy successfully narrows the performance gap with +conventional black-box LLMs, positioning CB-LLM as a model that combines the +high accuracy of traditional LLMs with the added benefit of clear +interpretability -- a feature markedly absent in existing LLMs. + +
+
+ comment: Present at ICML 2024 Mechanistic Interpretability (MI) Workshop +
+
+
+
+
+ + ☆ Fair Federated Data Clustering through Personalization: Bridging the Gap + between Diverse Data Distributions + + +
+ The rapid growth of data from edge devices has catalyzed the performance of
+machine learning algorithms. However, the generated data resides at client
+devices, which poses two main challenges for traditional machine learning
+paradigms: first, training requires centralizing the data, and second, most of
+the generated data lacks class labels, while clients have little incentive to
+label it manually owing to the high cost and lack of expertise. To overcome
+these issues, there have been initial attempts to handle unlabelled data in a
+privacy-preserving, distributed manner using unsupervised federated data
+clustering. The goal is to partition the data available on clients into $k$
+partitions (called clusters) without any actual exchange of data. Most existing
+algorithms are highly dependent on data distribution patterns across clients or
+are computationally expensive. Furthermore, because data across clients is
+skewed in most practical scenarios, existing models may leave some clients with
+a high clustering cost, making them reluctant to participate in the federated
+process. To address this, we are the first to introduce the idea of
+personalization in federated clustering. The goal is to balance a lower overall
+clustering cost with a uniform cost across clients. We propose p-FClus, which
+addresses these goals in a single round of communication between server and
+clients. We validate the efficacy of p-FClus on a variety of federated
+datasets, showcasing its data independence, its applicability to any finite
+$\ell$-norm, and its ability to simultaneously achieve lower cost and variance.
+
+
+
+
+
+ + ☆ Jailbreak Attacks and Defenses Against Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have performed exceptionally in various +text-generative tasks, including question answering, translation, code +completion, etc. However, the over-assistance of LLMs has raised the challenge +of "jailbreaking", which induces the model to generate malicious responses +against the usage policy and society by designing adversarial prompts. With the +emergence of jailbreak attack methods exploiting different vulnerabilities in +LLMs, the corresponding safety alignment measures are also evolving. In this +paper, we propose a comprehensive and detailed taxonomy of jailbreak attack and +defense methods. For instance, the attack methods are divided into black-box +and white-box attacks based on the transparency of the target model. Meanwhile, +we classify defense methods into prompt-level and model-level defenses. +Additionally, we further subdivide these attack and defense methods into +distinct sub-classes and present a coherent diagram illustrating their +relationships. We also conduct an investigation into the current evaluation +methods and compare them from different perspectives. Our findings aim to +inspire future research and practical implementations in safeguarding LLMs +against adversarial attacks. Above all, although jailbreak remains a +significant concern within the community, we believe that our work enhances the +understanding of this domain and provides a foundation for developing more +secure LLMs. + +
+
+
+
+
+ + ☆ We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker + Embeddings + + +
+ In speech synthesis, modeling the rich emotions and prosodic variations
+present in human voice is crucial to synthesizing natural speech. Although
+speaker embeddings have been widely used in personalized speech synthesis as
+conditioning inputs, they are designed to lose variation to optimize speaker
+recognition accuracy. Thus, they are suboptimal for speech synthesis in terms
+of modeling the rich variations at the output speech distribution. In this
+work, we propose a novel speaker embedding network which utilizes multiple
+class centers in the speaker classification training rather than a single class
+center as traditional embeddings. The proposed approach introduces variations
+in the speaker embedding while retaining the speaker recognition performance,
+since the model does not have to map all of the utterances of a speaker into a
+single class center. We apply our proposed embedding in a voice conversion task
+and show that our method provides better naturalness and prosody in synthesized
+speech.
+
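+ A minimal sketch of the multi-center idea is given below: each speaker owns
+several class centers and an utterance only needs to match one of them. The
+number of sub-centers, the scaling factor, and the absence of margins are
+assumptions, not the paper's exact classification head.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SubCenterHead(nn.Module):
+    def __init__(self, emb_dim: int, num_speakers: int, num_subcenters: int = 3):
+        super().__init__()
+        self.centers = nn.Parameter(torch.randn(num_speakers, num_subcenters, emb_dim))
+
+    def forward(self, emb: torch.Tensor) -> torch.Tensor:
+        # emb: (batch, emb_dim) speaker embeddings from the encoder
+        emb = F.normalize(emb, dim=-1)
+        centers = F.normalize(self.centers, dim=-1)
+        # cosine similarity to every sub-center: (batch, speakers, subcenters)
+        cos = torch.einsum("bd,ksd->bks", emb, centers)
+        # an utterance only needs to be close to one of its speaker's centers
+        return cos.max(dim=-1).values           # (batch, speakers) logits
+
+head = SubCenterHead(emb_dim=192, num_speakers=1000)
+logits = head(torch.randn(8, 192))
+loss = F.cross_entropy(logits * 30.0, torch.randint(0, 1000, (8,)))  # scaled softmax
+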
+
+ comment: Submitted to IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Robust Decision Transformer: Tackling Data Corruption in Offline RL via + Sequence Modeling + + +
+ Learning policies from offline datasets through offline reinforcement
+learning (RL) holds promise for scaling data-driven decision-making and
+avoiding unsafe and costly online interactions. However, real-world data
+collected from sensors or humans often contains noise and errors, posing a
+significant challenge for existing offline RL methods. Our study indicates that
+traditional offline RL methods based on temporal difference learning tend to
+underperform Decision Transformer (DT) under data corruption, especially when
+the amount of data is limited. This suggests the potential of sequential
+modeling for tackling data corruption in offline RL. To further unleash the
+potential of sequence modeling methods, we propose Robust Decision Transformer
+(RDT) by incorporating several robust techniques. Specifically, we introduce
+Gaussian weighted learning and iterative data correction to reduce the effect
+of corrupted data. Additionally, we leverage embedding dropout to enhance the
+model's resistance to erroneous inputs. Extensive experiments on MuJoCo,
+Kitchen, and Adroit tasks demonstrate RDT's superior performance under diverse
+data corruption compared to previous methods. Moreover, RDT exhibits remarkable
+robustness in a challenging setting that combines training-time data corruption
+with testing-time observation perturbations. These results highlight the
+potential of robust sequence modeling for learning from noisy or corrupted
+offline datasets, thereby promoting the reliable application of offline RL in
+real-world tasks.
+
+
+
+
+
+ + ☆ BiosERC: Integrating Biography Speakers Supported by LLMs for ERC Tasks ICANN 2024 + + +
+ In the Emotion Recognition in Conversation task, recent investigations have +utilized attention mechanisms exploring relationships among utterances from +intra- and inter-speakers for modeling emotional interaction between them. +However, attributes such as speaker personality traits remain unexplored and +present challenges in terms of their applicability to other tasks or +compatibility with diverse model architectures. Therefore, this work introduces +a novel framework named BiosERC, which investigates speaker characteristics in +a conversation. By employing Large Language Models (LLMs), we extract the +"biographical information" of the speaker within a conversation as +supplementary knowledge injected into the model to classify emotional labels +for each utterance. Our proposed method achieved state-of-the-art (SOTA) +results on three famous benchmark datasets: IEMOCAP, MELD, and EmoryNLP, +demonstrating the effectiveness and generalization of our model and showcasing +its potential for adaptation to various conversation analysis tasks. Our source +code is available at https://github.com/yingjie7/BiosERC. + +
+
+ comment: Accepted in the 33rd International Conference on Artificial Neural + Networks (ICANN 2024) +
+
+
+
+
+ + ☆ Accelerating Communication in Deep Learning Recommendation Model + Training with Dual-Level Adaptive Lossy Compression SC '24 + + +
+ DLRM is a state-of-the-art recommendation system model that has gained +widespread adoption across various industry applications. The large size of +DLRM models, however, necessitates the use of multiple devices/GPUs for +efficient training. A significant bottleneck in this process is the +time-consuming all-to-all communication required to collect embedding data from +all devices. To mitigate this, we introduce a method that employs error-bounded +lossy compression to reduce the communication data size and accelerate DLRM +training. We develop a novel error-bounded lossy compression algorithm, +informed by an in-depth analysis of embedding data features, to achieve high +compression ratios. Moreover, we introduce a dual-level adaptive strategy for +error-bound adjustment, spanning both table-wise and iteration-wise aspects, to +balance the compression benefits with the potential impacts on accuracy. We +further optimize our compressor for PyTorch tensors on GPUs, minimizing +compression overhead. Evaluation shows that our method achieves a 1.38$\times$ +training speedup with a minimal accuracy impact. + +
+
+ comment: accepted by SC '24 +
+
+
+
+
+ + ☆ Variational Partial Group Convolutions for Input-Aware Partial + Equivariance of Rotations and Color-Shifts ICML2024 + + +
+ Group Equivariant CNNs (G-CNNs) have shown promising efficacy in various +tasks, owing to their ability to capture hierarchical features in an +equivariant manner. However, their equivariance is fixed to the symmetry of the +whole group, limiting adaptability to diverse partial symmetries in real-world +datasets, such as limited rotation symmetry of handwritten digit images and +limited color-shift symmetry of flower images. Recent efforts address this +limitation, one example being Partial G-CNN which restricts the output group +space of convolution layers to break full equivariance. However, such an +approach still fails to adjust equivariance levels across data. In this paper, +we propose a novel approach, Variational Partial G-CNN (VP G-CNN), to capture +varying levels of partial equivariance specific to each data instance. VP G-CNN +redesigns the distribution of the output group elements to be conditioned on +input data, leveraging variational inference to avoid overfitting. This enables +the model to adjust its equivariance levels according to the needs of +individual data points. Additionally, we address training instability inherent +in discrete group equivariance models by redesigning the reparametrizable +distribution. We demonstrate the effectiveness of VP G-CNN on both toy and +real-world datasets, including MNIST67-180, CIFAR10, ColorMNIST, and +Flowers102. Our results show robust performance, even in uncertainty metrics. + +
+
+ comment: ICML2024 +
+
+
+
+
+ + ☆ NeuFair: Neural Network Fairness Repair with Dropout ISSTA 2024 + + +
+ This paper investigates the neural dropout method as a post-processing bias +mitigation for deep neural networks (DNNs). Neural-driven software solutions +are increasingly applied in socially critical domains with significant fairness +implications. While neural networks are exceptionally good at finding +statistical patterns from data, they are notorious for overfitting to the +training datasets that may encode and amplify existing biases from the +historical data. Existing bias mitigation algorithms often require either +modifying the input dataset or modifying the learning algorithms. We posit that +the prevalent dropout methods that prevent over-fitting during training by +randomly dropping neurons may be an effective and less intrusive approach to +improve fairness of pre-trained DNNs. However, finding the ideal set of neurons +to drop is a combinatorial problem. We propose NeuFair, a family of +post-processing randomized algorithms that mitigate unfairness in pre-trained +DNNs. Our randomized search is guided by an objective to minimize +discrimination while maintaining the model utility. We show that our design of +randomized algorithms provides statistical guarantees on finding optimal +solutions, and we empirically evaluate the efficacy and efficiency of NeuFair +in improving fairness, with minimal or no performance degradation. Our results +show that NeuFair improves fairness by up to 69% and outperforms +state-of-the-art post-processing bias techniques. + +
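+ A hedged sketch of the post-processing search described above follows:
+randomly sample subsets of neurons to disable in one hidden layer of a trained
+network and keep the mask that reduces a group-fairness gap with acceptable
+accuracy. The helper predict_with_mask, the fairness metric, and the search
+budget are assumptions; NeuFair's actual randomized algorithms and statistical
+guarantees are not reproduced.
+
+import numpy as np
+
+def demographic_parity_gap(preds, groups):
+    """|P(pred=1 | g=0) - P(pred=1 | g=1)| on a validation set (assumed metric)."""
+    return abs(preds[groups == 0].mean() - preds[groups == 1].mean())
+
+def random_dropout_search(predict_with_mask, layer_width, X, y, groups,
+                          n_trials=200, max_acc_drop=0.02, drop_frac=0.1, seed=0):
+    # predict_with_mask(X, mask) is a hypothetical callable: the trained model's
+    # forward pass with the 0/1 mask applied to one hidden layer.
+    rng = np.random.default_rng(seed)
+    base_preds = predict_with_mask(X, np.ones(layer_width))
+    base_acc = (base_preds == y).mean()
+    best_mask = np.ones(layer_width)
+    best_gap = demographic_parity_gap(base_preds, groups)
+    for _ in range(n_trials):
+        mask = np.ones(layer_width)
+        mask[rng.choice(layer_width, int(drop_frac * layer_width), replace=False)] = 0.0
+        preds = predict_with_mask(X, mask)
+        acc, gap = (preds == y).mean(), demographic_parity_gap(preds, groups)
+        if gap < best_gap and acc >= base_acc - max_acc_drop:
+            best_mask, best_gap = mask, gap
+    return best_mask, best_gap
+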
+
+ comment: Paper accepted at ACM ISSTA 2024 +
+
+
+
+
+ + ☆ Langevin Dynamics: A Unified Perspective on Optimization via Lyapunov + Potentials + + +
+ We study the problem of non-convex optimization using Stochastic Gradient
+Langevin Dynamics (SGLD). SGLD is a natural and popular variation of stochastic
+gradient descent where at each step, appropriately scaled Gaussian noise is
+added. To our knowledge, the only strategy for showing global convergence of
+SGLD on the loss function is to show that SGLD can sample from a stationary
+distribution which assigns larger mass when the function is small (the Gibbs
+measure), and then to convert these guarantees to optimization results.
+ We employ a new strategy to analyze the convergence of SGLD to global minima,
+based on Lyapunov potentials and optimization. We convert the same mild
+conditions from previous works on SGLD into geometric properties based on
+Lyapunov potentials. This adapts well to the case with a stochastic gradient
+oracle, which is natural for machine learning applications where one wants to
+minimize population loss but only has access to stochastic gradients via
+minibatch training samples. Here we provide 1) improved rates in the setting of
+previous works studying SGLD for optimization, 2) the first finite gradient
+complexity guarantee for SGLD where the function is Lipschitz and the Gibbs
+measure defined by the function satisfies a Poincar\'e Inequality, and 3) a
+proof that if continuous-time Langevin Dynamics succeeds for optimization, then
+discrete-time SGLD succeeds under mild regularity assumptions.
+
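+ For reference, a minimal sketch of the SGLD iteration referred to above is
+shown below: a stochastic gradient step plus Gaussian noise scaled so that the
+chain targets the Gibbs measure proportional to exp(-beta f). The paper's
+Lyapunov-potential analysis is not reflected in this toy example.
+
+import numpy as np
+
+def sgld_step(x, stoch_grad, step_size, beta, rng):
+    noise = rng.standard_normal(x.shape)
+    return x - step_size * stoch_grad(x) + np.sqrt(2.0 * step_size / beta) * noise
+
+# Toy example: minimize f(x) = 0.5 * ||x||^2 with a noisy gradient oracle.
+rng = np.random.default_rng(0)
+x = rng.standard_normal(10)
+grad = lambda z: z + 0.1 * rng.standard_normal(z.shape)   # stochastic gradient
+for _ in range(2000):
+    x = sgld_step(x, grad, step_size=1e-2, beta=50.0, rng=rng)
+print(np.linalg.norm(x))   # concentrates near the global minimum for large beta
+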
+
+
+
+
+ + ☆ Robust Q-Learning for finite ambiguity sets + + +
+ In this paper we propose a novel $Q$-learning algorithm for solving
+distributionally robust Markov decision problems for which the ambiguity set of
+probability measures can be chosen arbitrarily as long as it comprises only a
+finite number of measures. Therefore, our approach goes beyond the well-studied
+cases involving ambiguity sets of balls around some reference measure, with the
+distance to the reference measure being measured with respect to the
+Wasserstein distance or the Kullback--Leibler divergence. Hence, our approach
+allows the practitioner to create ambiguity sets better tailored to her needs
+and to solve the associated robust Markov decision problem via a $Q$-learning
+algorithm whose convergence is guaranteed by our main result. Moreover, we
+showcase in several numerical experiments the tractability of our approach.
+
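+ A hedged sketch of the robust Bellman update underlying this setting is shown
+below: the target takes the worst case over a finite list of candidate
+transition kernels. The paper's sample-based $Q$-learning iteration and its
+convergence proof are not reproduced; the update form below is an assumption.
+
+import numpy as np
+
+def robust_q_update(Q, s, a, reward, kernels, gamma=0.99, lr=0.1):
+    """Q: (S, A) array; kernels: list of (S, A, S) candidate transition tensors."""
+    next_values = Q.max(axis=1)                                  # (S,)
+    worst = min(float(P[s, a] @ next_values) for P in kernels)   # min over ambiguity set
+    target = reward + gamma * worst
+    Q[s, a] += lr * (target - Q[s, a])
+    return Q
+
+# Tiny example: 3 states, 2 actions, two candidate models in the ambiguity set.
+rng = np.random.default_rng(0)
+kernels = [rng.dirichlet(np.ones(3), size=(3, 2)) for _ in range(2)]
+Q = np.zeros((3, 2))
+Q = robust_q_update(Q, s=0, a=1, reward=1.0, kernels=kernels)
+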
+
+
+
+
+ + ☆ Unsupervised Video Summarization via Reinforcement Learning and a + Trained Evaluator + + +
+ This paper presents a novel approach for unsupervised video summarization +using reinforcement learning. It aims to address the existing limitations of +current unsupervised methods, including unstable training of adversarial +generator-discriminator architectures and reliance on hand-crafted reward +functions for quality evaluation. The proposed method is based on the concept +that a concise and informative summary should result in a reconstructed video +that closely resembles the original. The summarizer model assigns an importance +score to each frame and generates a video summary. In the proposed scheme, +reinforcement learning, coupled with a unique reward generation pipeline, is +employed to train the summarizer model. The reward generation pipeline trains +the summarizer to create summaries that lead to improved reconstructions. It +comprises a generator model capable of reconstructing masked frames from a +partially masked video, along with a reward mechanism that compares the +reconstructed video from the summary against the original. The video generator +is trained in a self-supervised manner to reconstruct randomly masked frames, +enhancing its ability to generate accurate summaries. This training pipeline +results in a summarizer model that better mimics human-generated video +summaries compared to methods relying on hand-crafted rewards. The training +process consists of two stable and isolated training steps, unlike adversarial +architectures. Experimental results demonstrate promising performance, with +F-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively. +Additionally, the inference stage is 300 times faster than our previously +reported state-of-the-art method. + +
+
+
+
+
+ + ☆ Unified Interpretation of Smoothing Methods for Negative Sampling Loss + Functions in Knowledge Graph Embedding RepL4NLP + + +
+ Knowledge Graphs (KGs) are fundamental resources in knowledge-intensive tasks +in NLP. Due to the limitation of manually creating KGs, KG Completion (KGC) has +an important role in automatically completing KGs by scoring their links with +KG Embedding (KGE). To handle many entities in training, KGE relies on Negative +Sampling (NS) loss that can reduce the computational cost by sampling. Since +the appearance frequencies for each link are at most one in KGs, sparsity is an +essential and inevitable problem. The NS loss is no exception. As a solution, +the NS loss in KGE relies on smoothing methods like Self-Adversarial Negative +Sampling (SANS) and subsampling. However, it is uncertain what kind of +smoothing method is suitable for this purpose due to the lack of theoretical +understanding. This paper provides theoretical interpretations of the smoothing +methods for the NS loss in KGE and induces a new NS loss, Triplet Adaptive +Negative Sampling (TANS), that can cover the characteristics of the +conventional smoothing methods. Experimental results of TransE, DistMult, +ComplEx, RotatE, HAKE, and HousE on FB15k-237, WN18RR, and YAGO3-10 datasets +and their sparser subsets show the soundness of our interpretation and +performance improvement by our TANS. + +
+
+ comment: 9 pages, 4 figures, 2 tables; accepted to workshop RepL4NLP held in + conjunction with ACL 2024 +
+
+
+
+
+ + ☆ Machine Learning for Complex Systems with Abnormal Pattern by Exception + Maximization Outlier Detection Method + + +
+ This paper proposes a novel fast online methodology for outlier detection
+called the exception maximization outlier detection method (EMODM), which
+employs probabilistic models and statistical algorithms to detect abnormal
+patterns from the outputs of complex systems. The EMODM is based on a two-state
+Gaussian mixture model and demonstrates strong performance in probabilistic
+anomaly detection, working on real-time raw data rather than relying on special
+prior distribution information. We confirm this using synthetic data from two
+numerical cases. For the real-world data, we have detected the short circuit
+pattern of the circuit system using EMODM by the current and voltage output of
+a three-phase inverter. The EMODM also found an abnormal period due to COVID-19
+in the insured unemployment data of 53 regions in the United States from 2000
+to 2024. The application of EMODM to these two real-life datasets demonstrated
+the effectiveness and accuracy of our algorithm.
+
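+ A small illustration of the two-state Gaussian mixture idea: fit a
+two-component GMM and flag samples whose posterior favours the minority
+(exception) component. EMODM's online estimation and exception-maximization
+details are not reproduced here; the data and threshold below are made up.
+
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(0)
+normal = rng.normal(0.0, 1.0, size=(500, 2))
+faults = rng.normal(6.0, 0.5, size=(20, 2))         # injected abnormal pattern
+X = np.vstack([normal, faults])
+
+gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
+minority = np.argmin(gmm.weights_)                   # the rarer state = exception
+outlier_prob = gmm.predict_proba(X)[:, minority]
+print("flagged:", int((outlier_prob > 0.5).sum()), "of", len(X))
+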
+
+
+
+
+ + ☆ A Two-Step Minimax Q-learning Algorithm for Two-Player Zero-Sum Markov + Games + + +
+ An interesting iterative procedure is proposed to solve a two-player zero-sum
+Markov game. First, this problem is expressed as a min-max Markov game. Next, a
+two-step Q-learning algorithm for solving Markov decision problems (MDPs) is
+suitably modified to solve this Markov game. Under a suitable assumption, the
+boundedness of the proposed iterates is obtained theoretically. Using results
+from stochastic approximation, the almost sure convergence of the proposed
+two-step minimax Q-learning is obtained theoretically. More specifically, the
+proposed algorithm converges to the game-theoretic optimal value with
+probability one when the model information is not known. Numerical simulations
+confirm that the proposed algorithm is effective and easy to implement.
+
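+ For context, a sketch of a standard minimax-Q update for a zero-sum Markov
+game is shown below, with the stage-game value at the next state obtained by a
+small linear program; the paper's specific two-step iteration and its
+convergence analysis are not reproduced.
+
+import numpy as np
+from scipy.optimize import linprog
+
+def matrix_game_value(M):
+    """Value of the zero-sum matrix game with payoff M (row player maximizes)."""
+    n_a, n_b = M.shape
+    # variables: [pi_1, ..., pi_na, v]; minimize -v (i.e. maximize v)
+    c = np.zeros(n_a + 1)
+    c[-1] = -1.0
+    A_ub = np.hstack([-M.T, np.ones((n_b, 1))])       # v - pi^T M[:, b] <= 0
+    b_ub = np.zeros(n_b)
+    A_eq = np.hstack([np.ones((1, n_a)), np.zeros((1, 1))])   # probabilities sum to 1
+    b_eq = np.array([1.0])
+    bounds = [(0, None)] * n_a + [(None, None)]
+    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
+    return res.x[-1]
+
+def minimax_q_update(Q, s, a, b, reward, s_next, gamma=0.95, lr=0.1):
+    """Q has shape (states, actions_A, actions_B)."""
+    target = reward + gamma * matrix_game_value(Q[s_next])
+    Q[s, a, b] += lr * (target - Q[s, a, b])
+    return Q
+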
+
+
+
+
+ + ☆ Graph Pooling via Ricci Flow + + +
+ Graph Machine Learning often involves the clustering of nodes based on +similarity structure encoded in the graph's topology and the nodes' attributes. +On homophilous graphs, the integration of pooling layers has been shown to +enhance the performance of Graph Neural Networks by accounting for inherent +multi-scale structure. Here, similar nodes are grouped together to coarsen the +graph and reduce the input size in subsequent layers in deeper architectures. +In both settings, the underlying clustering approach can be implemented via +graph pooling operators, which often rely on classical tools from Graph Theory. +In this work, we introduce a graph pooling operator (ORC-Pool), which utilizes +a characterization of the graph's geometry via Ollivier's discrete Ricci +curvature and an associated geometric flow. Previous Ricci flow based +clustering approaches have shown great promise across several domains, but are +by construction unable to account for similarity structure encoded in the node +attributes. However, in many ML applications, such information is vital for +downstream tasks. ORC-Pool extends such clustering approaches to attributed +graphs, allowing for the integration of geometric coarsening into Graph Neural +Networks as a pooling layer. + +
+
+ comment: 32 pages, 7 figures +
+
+
+
+
+ + ☆ TimeLDM: Latent Diffusion Model for Unconditional Time Series Generation + + +
+ Time series generation is a crucial research topic in the area of deep
+learning, which can be used for data augmentation, imputing missing values, and
+forecasting. Currently, latent diffusion models are ascending to the forefront
+of generative modeling for many important data representations. Being the most
+pivotal in the computer vision domain, latent diffusion models have also
+recently attracted interest in other communities, including NLP, Speech, and
+Geometric Space. In this work, we propose TimeLDM, a novel latent diffusion
+model for high-quality time series generation. TimeLDM is composed of a
+variational autoencoder that encodes time series into an informative and
+smoothed latent content and a latent diffusion model operating in the latent
+space to generate latent information. We evaluate the ability of our method to
+generate synthetic time series with simulated and realistic datasets, and
+benchmark the performance against existing state-of-the-art methods.
+Qualitatively and quantitatively, we find that the proposed TimeLDM
+consistently delivers high-quality generated time series. Scores from the
+Context-FID and Discriminative metrics indicate that TimeLDM consistently and
+significantly outperforms current state-of-the-art benchmarks with an average
+improvement of 3.4$\times$ and 3.8$\times$, respectively. Further studies
+demonstrate that our method performs better across different lengths of
+generated time series. To the best of our knowledge, this is the first study to
+explore the potential of the latent diffusion model for unconditional time
+series generation and establish a new baseline for synthetic time series.
+
+
+
+
+
+ + ♻ ☆ Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis ECCV 2024 + + +
+ Accurate reconstruction of complex dynamic scenes from just a single +viewpoint continues to be a challenging task in computer vision. Current +dynamic novel view synthesis methods typically require videos from many +different camera viewpoints, necessitating careful recording setups, and +significantly restricting their utility in the wild as well as in terms of +embodied AI applications. In this paper, we propose $\textbf{GCD}$, a +controllable monocular dynamic view synthesis pipeline that leverages +large-scale diffusion priors to, given a video of any scene, generate a +synchronous video from any other chosen perspective, conditioned on a set of +relative camera pose parameters. Our model does not require depth as input, and +does not explicitly model 3D scene geometry, instead performing end-to-end +video-to-video translation in order to achieve its goal efficiently. Despite +being trained on synthetic multi-view video data only, zero-shot real-world +generalization experiments show promising results in multiple domains, +including robotics, object permanence, and driving environments. We believe our +framework can potentially unlock powerful applications in rich dynamic scene +understanding, perception for robotics, and interactive 3D video viewing +experiences for virtual reality. + +
+
+ comment: Accepted to ECCV 2024. Project webpage is available at: + https://gcd.cs.columbia.edu/ +
+
+
+
+
+ + ♻ ☆ Research on target detection method of distracted driving behavior based + on improved YOLOv8 + + +
+ With the development of deep learning technology, the detection and +classification of distracted driving behaviour requires higher accuracy. +Existing deep learning-based methods are computationally intensive and +parameter redundant, limiting the efficiency and accuracy in practical +applications. To solve this problem, this study proposes an improved YOLOv8 +detection method based on the original YOLOv8 model by integrating the BoTNet +module, GAM attention mechanism and EIoU loss function. By optimising the +feature extraction and multi-scale feature fusion strategies, the training and +inference processes are simplified, and the detection accuracy and efficiency +are significantly improved. Experimental results show that the improved model +performs well in both detection speed and accuracy, with an accuracy rate of +99.4%, and the model is smaller and easy to deploy, which is able to identify +and classify distracted driving behaviours in real time, provide timely +warnings, and enhance driving safety. + +
+
+ comment: Major revision on content, no replacement available soon +
+
+
+
+
+ + ♻ ☆ Z-Splat: Z-Axis Gaussian Splatting for Camera-Sonar Fusion + + +
+ Differentiable 3D-Gaussian splatting (GS) is emerging as a prominent +technique in computer vision and graphics for reconstructing 3D scenes. GS +represents a scene as a set of 3D Gaussians with varying opacities and employs +a computationally efficient splatting operation along with analytical +derivatives to compute the 3D Gaussian parameters given scene images captured +from various viewpoints. Unfortunately, capturing surround view ($360^{\circ}$ +viewpoint) images is impossible or impractical in many real-world imaging +scenarios, including underwater imaging, rooms inside a building, and +autonomous navigation. In these restricted baseline imaging scenarios, the GS +algorithm suffers from a well-known 'missing cone' problem, which results in +poor reconstruction along the depth axis. In this manuscript, we demonstrate +that using transient data (from sonars) allows us to address the missing cone +problem by sampling high-frequency data along the depth axis. We extend the +Gaussian splatting algorithms for two commonly used sonars and propose fusion +algorithms that simultaneously utilize RGB camera data and sonar data. Through +simulations, emulations, and hardware experiments across various imaging +scenarios, we show that the proposed fusion algorithms lead to significantly +better novel view synthesis (5 dB improvement in PSNR) and 3D geometry +reconstruction (60% lower Chamfer distance). + +
+
+
+
+
+ + ♻ ☆ OpenDebateEvidence: A Massive-Scale Argument Mining and Summarization + Dataset ACL2024 + + +
+ We introduce OpenDebateEvidence, a comprehensive dataset for argument mining +and summarization sourced from the American Competitive Debate community. This +dataset includes over 3.5 million documents with rich metadata, making it one +of the most extensive collections of debate evidence. OpenDebateEvidence +captures the complexity of arguments in high school and college debates, +providing valuable resources for training and evaluation. Our extensive +experiments demonstrate the efficacy of fine-tuning state-of-the-art large +language models for argumentative abstractive summarization across various +methods, models, and datasets. By providing this comprehensive resource, we aim +to advance computational argumentation and support practical applications for +debaters, educators, and researchers. OpenDebateEvidence is publicly available +to support further research and innovation in computational argumentation. +Access it here: https://huggingface.co/datasets/Yusuf5/OpenCaselist + +
+
+ comment: Accepted for Publication to ARGMIN 2024 at ACL2024 +
+
+
+
+
+ + ♻ ☆ Improving Low-Resource Knowledge Tracing Tasks by Supervised + Pre-training and Importance Mechanism Fine-tuning + + +
+ Knowledge tracing (KT) aims to estimate student's knowledge mastery based on +their historical interactions. Recently, the deep learning based KT (DLKT) +approaches have achieved impressive performance in the KT task. These DLKT +models heavily rely on the large number of available student interactions. +However, due to various reasons such as budget constraints and privacy +concerns, observed interactions are very limited in many real-world scenarios, +a.k.a, low-resource KT datasets. Directly training a DLKT model on a +low-resource KT dataset may lead to overfitting and it is difficult to choose +the appropriate deep neural architecture. Therefore, in this paper, we propose +a low-resource KT framework called LoReKT to address above challenges. Inspired +by the prevalent "pre-training and fine-tuning" paradigm, we aim to learn +transferable parameters and representations from rich-resource KT datasets +during the pre-training stage and subsequently facilitate effective adaptation +to low-resource KT datasets. Specifically, we simplify existing sophisticated +DLKT model architectures with purely a stack of transformer decoders. We design +an encoding mechanism to incorporate student interactions from multiple KT data +sources and develop an importance mechanism to prioritize updating parameters +with high importance while constraining less important ones during the +fine-tuning stage. We evaluate LoReKT on six public KT datasets and +experimental results demonstrate the superiority of our approach in terms of +AUC and Accuracy. To encourage reproducible research, we make our data and code +publicly available at https://anonymous.4open.science/r/LoReKT-C619. + +
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Accelerated Parameter-Free Stochastic Optimization + + +
+ We propose a method that achieves near-optimal rates for smooth stochastic +convex optimization and requires essentially no prior knowledge of problem +parameters. This improves on prior work which requires knowing at least the +initial distance to optimality d0. Our method, U-DoG, combines UniXGrad (Kavis +et al., 2019) and DoG (Ivgi et al., 2023) with novel iterate stabilization +techniques. It requires only loose bounds on d0 and the noise magnitude, +provides high probability guarantees under sub-Gaussian noise, and is also +near-optimal in the non-smooth case. Our experiments show consistent, strong +performance on convex problems and mixed results on neural network training. + +
+
+
+
+
+ + ♻ ☆ A Question-centric Multi-experts Contrastive Learning Framework for + Improving the Accuracy and Interpretability of Deep Sequential Knowledge + Tracing Models KDD + + +
+ Knowledge tracing (KT) plays a crucial role in predicting students' future +performance by analyzing their historical learning processes. Deep neural +networks (DNNs) have shown great potential in solving the KT problem. However, +there still exist some important challenges when applying deep learning +techniques to model the KT process. The first challenge lies in taking the +individual information of the question into modeling. This is crucial because, +despite questions sharing the same knowledge component (KC), students' +knowledge acquisition on homogeneous questions can vary significantly. The +second challenge lies in interpreting the prediction results from existing deep +learning-based KT models. In real-world applications, while it may not be +necessary to have complete transparency and interpretability of the model +parameters, it is crucial to present the model's prediction results in a manner +that teachers find interpretable. This makes teachers accept the rationale +behind the prediction results and utilize them to design teaching activities +and tailored learning strategies for students. However, the inherent black-box +nature of deep learning techniques often poses a hurdle for teachers to fully +embrace the model's prediction results. To address these challenges, we propose +a Question-centric Multi-experts Contrastive Learning framework for KT called +Q-MCKT. We have provided all the datasets and code on our website at +https://github.com/rattlesnakey/Q-MCKT. + +
+
+ comment: 25 pages, 9 figures, Accepted by TKDD +
+
+
+
+
+ + ♻ ☆ Probabilistic Rank and Reward: A Scalable Model for Slate Recommendation + + +
+ We introduce Probabilistic Rank and Reward (PRR), a scalable probabilistic +model for personalized slate recommendation. Our approach allows off-policy +estimation of the reward in the scenario where the user interacts with at most +one item from a slate of K items. We show that the probability of a slate being +successful can be learned efficiently by combining the reward, whether the user +successfully interacted with the slate, and the rank, the item that was +selected within the slate. PRR outperforms existing off-policy reward +optimizing methods and is far more scalable to large action spaces. Moreover, +PRR allows fast delivery of recommendations powered by maximum inner product +search (MIPS), making it suitable in low latency domains such as computational +advertising. + +
+
+
+
+
+ + ♻ ☆ DexDiffuser: Generating Dexterous Grasps with Diffusion Models + + +
+ We introduce DexDiffuser, a novel dexterous grasping method that generates,
+evaluates, and refines grasps on partial object point clouds. DexDiffuser
+includes the conditional diffusion-based grasp sampler DexSampler and the
+dexterous grasp evaluator DexEvaluator. DexSampler generates high-quality
+grasps conditioned on object point clouds by iterative denoising of randomly
+sampled grasps. We also introduce two grasp refinement strategies:
+Evaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR).
+The experiment results demonstrate that DexDiffuser consistently outperforms
+the state-of-the-art multi-finger grasp generation method FFHNet with, on
+average, a 9.12% and 19.44% higher grasp success rate in simulation and real
+robot experiments, respectively. Supplementary materials are available at
+https://yulihn.github.io/DexDiffuser_page/
+
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Deep-Learning-Based Channel Estimation for Distributed MIMO with 1-bit + Radio-Over-Fiber Fronthaul + + +
+ We consider the problem of pilot-aided, uplink channel estimation in a +distributed massive multiple-input multiple-output (MIMO) architecture, in +which the access points are connected to a central processing unit via +fiber-optical fronthaul links, carrying a two-level-quantized version of the +received analog radio-frequency signal. We adapt to this architecture the +deep-learning-based channel-estimation algorithm recently proposed by Nguyen et +al. (2023), and explore its robustness to the additional signal distortions +(beyond 1-bit quantization) introduced in the considered architecture by the +automatic gain controllers (AGCs) and by the comparators. These components are +used at the access points to generate the two-level analog waveform from the +received signal. Via simulation results, we illustrate that the proposed +channel-estimation method outperforms significantly the Bussgang linear minimum +mean-square error channel estimator, and it is robust against the additional +impairments introduced by the AGCs and the comparators. + +
+
+
+
+
+ + ♻ ☆ FakET: Simulating Cryo-Electron Tomograms with Neural Style Transfer + + +
+ In cryo-electron microscopy, accurate particle localization and +classification are imperative. Recent deep learning solutions, though +successful, require extensive training data sets. The protracted generation +time of physics-based models, often employed to produce these data sets, limits +their broad applicability. We introduce FakET, a method based on Neural Style +Transfer, capable of simulating the forward operator of any cryo transmission +electron microscope. It can be used to adapt a synthetic training data set +according to reference data producing high-quality simulated micrographs or +tilt-series. To assess the quality of our generated data, we used it to train a +state-of-the-art localization and classification architecture and compared its +performance with a counterpart trained on benchmark data. Remarkably, our +technique matches the performance, boosts data generation speed 750 times, uses +33 times less memory, and scales well to typical transmission electron +microscope detector sizes. It leverages GPU acceleration and parallel +processing. The source code is available at https://github.com/paloha/faket. + +
+
+ comment: 25 pages, 3 tables, 19 figures including supplement. Added Key + findings section, CPU-profiling appendix, and Supplementary information +
+
+
+
+
+ + ♻ ☆ Multimodal Variational Autoencoder for Low-cost Cardiac Hemodynamics + Instability Detection + + +
+ Recent advancements in non-invasive detection of cardiac hemodynamic +instability (CHDI) primarily focus on applying machine learning techniques to a +single data modality, e.g. cardiac magnetic resonance imaging (MRI). Despite +their potential, these approaches often fall short especially when the size of +labeled patient data is limited, a common challenge in the medical domain. +Furthermore, only a few studies have explored multimodal methods to study CHDI, +which mostly rely on costly modalities such as cardiac MRI and echocardiogram. +In response to these limitations, we propose a novel multimodal variational +autoencoder ($\text{CardioVAE}_\text{X,G}$) to integrate low-cost chest X-ray +(CXR) and electrocardiogram (ECG) modalities with pre-training on a large +unlabeled dataset. Specifically, $\text{CardioVAE}_\text{X,G}$ introduces a +novel tri-stream pre-training strategy to learn both shared and +modality-specific features, thus enabling fine-tuning with both unimodal and +multimodal datasets. We pre-train $\text{CardioVAE}_\text{X,G}$ on a large, +unlabeled dataset of $50,982$ subjects from a subset of MIMIC database and then +fine-tune the pre-trained model on a labeled dataset of $795$ subjects from the +ASPIRE registry. Comprehensive evaluations against existing methods show that +$\text{CardioVAE}_\text{X,G}$ offers promising performance (AUROC $=0.79$ and +Accuracy $=0.77$), representing a significant step forward in non-invasive +prediction of CHDI. Our model also excels in producing fine interpretations of +predictions directly associated with clinical features, thereby supporting +clinical decision-making. + +
+
+
+
+
+ + ♻ ☆ From Representational Harms to Quality-of-Service Harms: A Case Study on + Llama 2 Safety Safeguards ACL 2024 + + +
+ Recent progress in large language models (LLMs) has led to their widespread +adoption in various domains. However, these advancements have also introduced +additional safety risks and raised concerns regarding their detrimental impact +on already marginalized populations. Despite growing mitigation efforts to +develop safety safeguards, such as supervised safety-oriented fine-tuning and +leveraging safe reinforcement learning from human feedback, multiple concerns +regarding the safety and ingrained biases in these models remain. Furthermore, +previous work has demonstrated that models optimized for safety often display +exaggerated safety behaviors, such as a tendency to refrain from responding to +certain requests as a precautionary measure. As such, a clear trade-off between +the helpfulness and safety of these models has been documented in the +literature. In this paper, we further investigate the effectiveness of safety +measures by evaluating models on already mitigated biases. Using the case of +Llama 2 as an example, we illustrate how LLMs' safety responses can still +encode harmful assumptions. To do so, we create a set of non-toxic prompts, +which we then use to evaluate Llama models. Through our new taxonomy of LLMs +responses to users, we observe that the safety/helpfulness trade-offs are more +pronounced for certain demographic groups which can lead to quality-of-service +harms for marginalized populations. + +
+
+ comment: 9 pages, 4 figures. Accepted to Findings of the Association for + Computational Linguistics: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Planetary Causal Inference: Implications for the Geography of Poverty + + +
+ Earth observation data such as satellite imagery can, when combined with
+machine learning, have far-reaching impacts on our understanding of the
+geography of poverty through the prediction of living conditions, especially
+where government-derived economic indicators are either unavailable or
+potentially untrustworthy. Recent work has progressed in using Earth
+Observation (EO) data not only to predict spatial economic outcomes but also to
+explore cause and effect, an understanding which is critical for downstream
+policy analysis. In this review, we first document the growth of interest in
+using satellite images together with EO data in causal analysis. We then trace
+the relationship between spatial statistics and machine learning methods before
+discussing four ways in which EO data has been used in causal machine learning
+pipelines -- (1.) poverty outcome imputation for downstream causal analysis,
+(2.) EO image deconfounding, (3.) EO-based treatment effect heterogeneity, and
+(4.) EO-based transportability analysis. We conclude by providing a
+step-by-step workflow for how researchers can incorporate EO data in causal ML
+analysis going forward, outlining major choices of data, models, and evaluation
+metrics.
+
+
+ comment: For a full list of the papers found in the quantitative literature + search, see https://github.com/AIandGlobalDevelopmentLab/eo-poverty-review +
+
+
+
+
+ + ♻ ☆ Steering Llama 2 via Contrastive Activation Addition + + +
+ We introduce Contrastive Activation Addition (CAA), an innovative method for +steering language models by modifying their activations during forward passes. +CAA computes "steering vectors" by averaging the difference in residual stream +activations between pairs of positive and negative examples of a particular +behavior, such as factual versus hallucinatory responses. During inference, +these steering vectors are added at all token positions after the user's prompt +with either a positive or negative coefficient, allowing precise control over +the degree of the targeted behavior. We evaluate CAA's effectiveness on Llama 2 +Chat using multiple-choice behavioral question datasets and open-ended +generation tasks. We demonstrate that CAA significantly alters model behavior, +is effective over and on top of traditional methods like finetuning and system +prompt design, and minimally reduces capabilities. Moreover, we gain deeper +insights into CAA's mechanisms by employing various activation space +interpretation methods. CAA accurately steers model outputs and sheds light on +how high-level concepts are represented in Large Language Models (LLMs). + +
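+ A minimal, framework-agnostic sketch of the steering mechanism is shown
+below: average the activation difference between positive and negative example
+pairs at one layer, then add the scaled vector to that layer's output during
+inference via a forward hook. The module names, capture point, single-example
+batches, and adding at every token position are simplifying assumptions; the
+paper applies the vectors to Llama 2 Chat's residual stream after the prompt.
+
+import torch
+
+def compute_steering_vector(layer_module, run_model, positive_inputs, negative_inputs):
+    # Assumes the hooked module returns a (batch, seq, hidden) tensor and that
+    # each input is a single-example batch.
+    captured = []
+    hook = layer_module.register_forward_hook(
+        lambda mod, inp, out: captured.append(out[:, -1, :].detach()))
+    try:
+        for pos, neg in zip(positive_inputs, negative_inputs):
+            run_model(pos)
+            run_model(neg)
+    finally:
+        hook.remove()
+    acts = torch.stack(captured).squeeze(1)            # (2 * n_pairs, hidden)
+    return (acts[0::2] - acts[1::2]).mean(dim=0)       # mean(positive - negative)
+
+def add_steering(layer_module, vector, coeff=1.0):
+    """Add the steering vector to every token position of the layer output."""
+    return layer_module.register_forward_hook(
+        lambda mod, inp, out: out + coeff * vector)
+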
+
+
+
+
+ + ♻ ☆ Certifiable Black-Box Attacks with Randomized Adversarial Examples: + Breaking Defenses with Provable Confidence + + +
+ Black-box adversarial attacks have shown strong potential to subvert machine +learning models. Existing black-box attacks craft adversarial examples by +iteratively querying the target model and/or leveraging the transferability of +a local surrogate model. Recently, such attacks can be effectively mitigated by +state-of-the-art (SOTA) defenses, e.g., detection via the pattern of sequential +queries, or injecting noise into the model. To our best knowledge, we take the +first step to study a new paradigm of black-box attacks with provable +guarantees -- certifiable black-box attacks that can guarantee the attack +success probability (ASP) of adversarial examples before querying over the +target model. This new black-box attack unveils significant vulnerabilities of +machine learning models, compared to traditional empirical black-box attacks, +e.g., breaking strong SOTA defenses with provable confidence, constructing a +space of (infinite) adversarial examples with high ASP, and the ASP of the +generated adversarial examples is theoretically guaranteed without +verification/queries over the target model. Specifically, we establish a novel +theoretical foundation for ensuring the ASP of the black-box attack with +randomized adversarial examples (AEs). Then, we propose several novel +techniques to craft the randomized AEs while reducing the perturbation size for +better imperceptibility. Finally, we have comprehensively evaluated the +certifiable black-box attacks on the CIFAR10/100, ImageNet, and LibriSpeech +datasets, while benchmarking with 16 SOTA empirical black-box attacks, against +various SOTA defenses in the domains of computer vision and speech recognition. +Both theoretical and experimental results have validated the significance of +the proposed attack. + +
+
+
+
+
+ + ♻ ☆ Low-Resource Crop Classification from Multi-Spectral Time Series Using + Lossless Compressors + + +
+ Deep learning has significantly improved the accuracy of crop classification
+using multispectral temporal data. However, these models have complex
+structures with numerous parameters, requiring large amounts of data and costly
+training. In low-resource situations with fewer labeled samples, deep learning
+models perform poorly due to insufficient data. Conversely, compressors are
+data-type agnostic, and non-parametric methods do not bring underlying
+assumptions. Inspired by this insight, we propose a non-training alternative to
+deep learning models, aiming to address these situations. Specifically, the
+Symbolic Representation Module is proposed to convert the reflectivity into
+symbolic representations. The symbolic representations are then
+cross-transformed in both the channel and time dimensions to generate symbolic
+embeddings. Next, the Multi-scale Normalised Compression Distance (MNCD) is
+designed to measure the correlation between any two symbolic embeddings.
+Finally, based on the MNCDs, high-quality crop classification can be achieved
+using only a k-nearest-neighbor (kNN) classifier. The entire framework is
+ready-to-use and lightweight. Without any training, it outperformed, on
+average, 7 advanced deep learning models trained at scale on three benchmark
+datasets. It also outperforms more than half of these models in the few-shot
+setting with sparse crop labels. Therefore, the high performance and robustness
+of our non-training framework make it truly applicable to real-world crop
+mapping. Codes are available at:
+https://github.com/qinfengsama/Compressor-Based-Crop-Mapping.
+
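+ A hedged sketch of the compressor-based pipeline follows: classify a sequence
+by its normalized compression distance (NCD) to labelled examples with a
+k-nearest-neighbour vote. The paper's Symbolic Representation Module and
+multi-scale NCD are not reproduced; plain gzip over toy byte strings stands in
+for the symbolic embeddings.
+
+import gzip
+from collections import Counter
+
+def ncd(x: bytes, y: bytes) -> float:
+    cx, cy = len(gzip.compress(x)), len(gzip.compress(y))
+    cxy = len(gzip.compress(x + y))
+    return (cxy - min(cx, cy)) / max(cx, cy)
+
+def knn_predict(query: bytes, train: list, k: int = 3) -> str:
+    dists = sorted((ncd(query, x), label) for x, label in train)
+    return Counter(label for _, label in dists[:k]).most_common(1)[0][0]
+
+# Toy usage with made-up "reflectance strings"; real inputs would be the
+# symbolic encodings of multispectral time series.
+train = [(b"low low low high high", "maize"), (b"high high low low low", "wheat"),
+         (b"low low high high high", "maize"), (b"high low low low low", "wheat")]
+print(knn_predict(b"low low high high low", train))
+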
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Sampling from the Mean-Field Stationary Distribution + + +
+ We study the complexity of sampling from the stationary distribution of a
+mean-field SDE, or equivalently, the complexity of minimizing a functional over
+the space of probability measures which includes an interaction term. Our main
+insight is to decouple the two key aspects of this problem: (1) approximation
+of the mean-field SDE via a finite-particle system, via uniform-in-time
+propagation of chaos, and (2) sampling from the finite-particle stationary
+distribution, via standard log-concave samplers. Our approach is conceptually
+simpler, and its flexibility allows us to incorporate state-of-the-art
+algorithms and theory. This leads to improved guarantees in numerous settings,
+including better guarantees for optimizing certain two-layer neural networks in
+the mean-field regime. A key technical contribution is to establish a new
+uniform-in-$N$ log-Sobolev inequality for the stationary distribution of the
+mean-field Langevin dynamics.
+

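As a minimal illustration of the finite-particle route, the sketch below runs unadjusted Langevin dynamics on an N-particle approximation of a mean-field SDE with a confinement potential V and a pairwise interaction W. The quadratic example potentials, step size, and particle count are assumptions for illustration; this is not the specific sampler or the guarantees analyzed in the paper.

```python
import numpy as np

def particle_langevin(grad_V, grad_W, n_particles=200, dim=2, step=1e-2, n_steps=2000, seed=0):
    """Unadjusted Langevin dynamics for the finite-particle system
    dX_i = -(grad V(X_i) + (1/N) * sum_j grad W(X_i - X_j)) dt + sqrt(2) dB_i."""
    rng = np.random.default_rng(seed)
    X = rng.standard_normal((n_particles, dim))
    for _ in range(n_steps):
        drift = -grad_V(X)                                  # confinement term
        diffs = X[:, None, :] - X[None, :, :]               # (N, N, d) pairwise differences
        drift -= grad_W(diffs).mean(axis=1)                 # mean-field interaction term
        X = X + step * drift + np.sqrt(2.0 * step) * rng.standard_normal(X.shape)
    return X

# Example: quadratic confinement and a weak attractive quadratic interaction.
samples = particle_langevin(grad_V=lambda x: x, grad_W=lambda d: 0.1 * d)
print(samples.mean(axis=0), samples.std(axis=0))
```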
+
+
+
+
+ + ♻ ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) generates new images by combining the style of
+one image with the content of another. However, unauthorized NST can exploit
+artwork, raising concerns about artists' rights and motivating the development
+of proactive protection methods. We propose Locally Adaptive Adversarial Color
+Attack (LAACA), empowering artists to protect their artwork from unauthorized
+style transfer by processing it before public release. By delving into the
+intricacies of human visual perception and the role of different frequency
+components, our method strategically introduces frequency-adaptive
+perturbations in the image. These perturbations significantly degrade the
+generation quality of NST while maintaining an acceptable level of visual
+change in the original image, ensuring that potential infringers are
+discouraged from using the protected artworks because of the degraded quality
+of the resulting NST outputs. Additionally, existing metrics often overlook the
+importance of color fidelity in evaluating color-mattered tasks, such as the
+quality of NST-generated images, which is crucial in the context of artistic
+works. To comprehensively assess such color-mattered tasks, we propose the
+Adversarial Color Distance Metric (ACDM), designed to quantify the color
+difference of images pre- and post-manipulation. Experimental results confirm
+that attacking NST using LAACA results in visually inferior style transfer, and
+that the ACDM can effectively measure color differences in color-mattered
+tasks. By providing artists with a tool to safeguard their intellectual
+property, our work alleviates the socio-technical challenges posed by the
+misuse of NST in the art community.
+

+
+ comment: 9 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Turbulence Scaling from Deep Learning Diffusion Generative Models + + +
+ Complex spatial and temporal structures are inherent characteristics of
+turbulent fluid flows and comprehending them poses a major challenge. This
+comprehension necessitates an understanding of the space of turbulent fluid
+flow configurations. We employ a diffusion-based generative model to learn the
+distribution of turbulent vorticity profiles and generate snapshots of
+turbulent solutions to the incompressible Navier-Stokes equations. We consider
+the inverse cascade in two spatial dimensions and generate diverse turbulent
+solutions that differ from those in the training dataset. We analyze the
+statistical scaling properties of the new turbulent profiles, calculate their
+structure functions, energy power spectrum, velocity probability distribution
+function and moments of local energy dissipation. All the learnt scaling
+exponents are consistent with the expected Kolmogorov scaling. This agreement
+with established turbulence characteristics provides strong evidence of the
+model's capability to capture essential features of real-world turbulence.
+

+
+
+
+
+ + ♻ ☆ Query-Policy Misalignment in Preference-Based Reinforcement Learning ICLR 2024 + + +
+ Preference-based reinforcement learning (PbRL) provides a natural way to +align RL agents' behavior with human desired outcomes, but is often restrained +by costly human feedback. To improve feedback efficiency, most existing PbRL +methods focus on selecting queries to maximally improve the overall quality of +the reward model, but counter-intuitively, we find that this may not +necessarily lead to improved performance. To unravel this mystery, we identify +a long-neglected issue in the query selection schemes of existing PbRL studies: +Query-Policy Misalignment. We show that the seemingly informative queries +selected to improve the overall quality of reward model actually may not align +with RL agents' interests, thus offering little help on policy learning and +eventually resulting in poor feedback efficiency. We show that this issue can +be effectively addressed via near on-policy query and a specially designed +hybrid experience replay, which together enforce the bidirectional query-policy +alignment. Simple yet elegant, our method can be easily incorporated into +existing approaches by changing only a few lines of code. We showcase in +comprehensive experiments that our method achieves substantial gains in both +human feedback and RL sample efficiency, demonstrating the importance of +addressing query-policy misalignment in PbRL tasks. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Distributed Black-box Attack: Do Not Overestimate Black-box Attacks + + +
+ Black-box adversarial attacks can fool image classifiers into misclassifying
+images without requiring access to model structure and weights. Recent studies
+have reported attack success rates of over 95% with less than 1,000 queries.
+The question then arises of whether black-box attacks have become a real threat
+against IoT devices that rely on cloud APIs to achieve image classification. To
+shed some light on this, we note that prior research has primarily focused on
+increasing the success rate and reducing the number of queries. However,
+another crucial factor for black-box attacks against cloud APIs is the time
+required to perform the attack. This paper applies black-box attacks directly
+to cloud APIs rather than to local models, thereby avoiding mistakes made in
+prior research that applied the perturbation before image encoding and
+pre-processing. Further, we exploit load balancing to enable distributed
+black-box attacks that can reduce the attack time by a factor of about five for
+both local search and gradient estimation methods.
+

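The load-balancing idea amounts to fanning candidate queries out over several endpoints in parallel. The snippet below is only a generic skeleton: `query_fn` and `endpoints` are hypothetical placeholders for a real cloud classification API and its replicas, and the round-robin dispatch is an assumption rather than the paper's exact scheduler.

```python
from concurrent.futures import ThreadPoolExecutor

def distributed_queries(endpoints, images, query_fn):
    """Send each candidate image to one of several API endpoints in parallel and
    return the responses in the original submission order."""
    results = [None] * len(images)
    with ThreadPoolExecutor(max_workers=len(endpoints)) as pool:
        futures = {
            pool.submit(query_fn, endpoints[i % len(endpoints)], img): i
            for i, img in enumerate(images)
        }
        for fut, idx in futures.items():
            results[idx] = fut.result()
    return results
```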
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Age Aware Scheduling for Differentially-Private Federated Learning + + +
+ This paper explores differentially-private federated learning (FL) across +time-varying databases, delving into a nuanced three-way tradeoff involving +age, accuracy, and differential privacy (DP). Emphasizing the potential +advantages of scheduling, we propose an optimization problem aimed at meeting +DP requirements while minimizing the loss difference between the aggregated +model and the model obtained without DP constraints. To harness the benefits of +scheduling, we introduce an age-dependent upper bound on the loss, leading to +the development of an age-aware scheduling design. Simulation results +underscore the superior performance of our proposed scheme compared to FL with +classic DP, which does not consider scheduling as a design factor. This +research contributes insights into the interplay of age, accuracy, and DP in +federated learning, with practical implications for scheduling strategies. + +
+
+ comment: Simulation parameters updated. Paper accepted for presentation at the + 2024 IEEE International Symposium on Information Theory (ISIT 2024) +
+
+
+
+
+ + ♻ ☆ Learning to Generate All Feasible Actions + + +
+ Modern cyber-physical systems are becoming increasingly complex to model,
+thus motivating data-driven techniques such as reinforcement learning (RL) to
+find appropriate control agents. However, most systems are subject to hard
+constraints such as safety or operational bounds. Typically, to learn to
+satisfy these constraints, the agent must violate them systematically, which is
+computationally prohibitive in most systems. Recent efforts aim to utilize
+feasibility models that assess whether a proposed action is feasible to avoid
+applying the agent's infeasible action proposals to the system. However, these
+efforts focus on guaranteeing constraint satisfaction rather than the agent's
+learning efficiency. To improve the learning process, we introduce action
+mapping, a novel approach that divides the learning process into two steps:
+first learning feasibility and subsequently learning the objective, by mapping
+actions into the sets of feasible actions. This paper focuses on the
+feasibility part by learning to generate all feasible actions through
+self-supervised querying of the feasibility model. We train the agent by
+formulating the problem as a distribution matching problem and deriving
+gradient estimators for different divergences. Through an illustrative example,
+a robotic path planning scenario, and a robotic grasping simulation, we
+demonstrate the agent's proficiency in generating actions across disconnected
+feasible action sets. By addressing the feasibility step, this paper makes it
+possible to focus future work on the objective part of action mapping, paving
+the way for an RL framework that is both safe and efficient.
+

+
+
+
+
+ + ♻ ☆ Which algorithm to select in sports timetabling? + + +
+ Any sports competition needs a timetable, specifying when and where teams
+meet each other. The recent International Timetabling Competition (ITC2021) on
+sports timetabling showed that, although it is possible to develop general
+algorithms, the performance of each algorithm varies considerably over the
+problem instances. This paper provides an instance space analysis for sports
+timetabling, resulting in powerful insights into the strengths and weaknesses
+of eight state-of-the-art algorithms. Based on machine learning techniques, we
+propose an algorithm selection system that predicts which algorithm is likely
+to perform best when given the characteristics of a sports timetabling problem
+instance. Furthermore, we identify which characteristics are important in
+making that prediction, providing insights into the performance of the
+algorithms and suggestions to further improve them. Finally, we assess the
+empirical hardness of the instances. Our results are based on large
+computational experiments involving about 50 years of CPU time on more than 500
+newly generated problem instances.
+

+
+ comment: This is the peer-reviewed author-version of
+ https://doi.org/10.1016/j.ejor.2024.06.005, published in the European Journal
+ of Operational Research. Copyright 2024. This manuscript version is made
+ available under the CC-BY-NC-ND 4.0 license
+ (https://creativecommons.org/licenses/by-nc-nd/4.0/)

+
+
+
+
+ + ♻ ☆ Estimating Treatment Effects under Recommender Interference: A + Structured Neural Networks Approach + + +
+ Recommender systems are essential for content-sharing platforms by curating +personalized content. To evaluate updates to recommender systems targeting +content creators, platforms frequently rely on creator-side randomized +experiments. The treatment effect measures the change in outcomes when a new +algorithm is implemented compared to the status quo. We show that the standard +difference-in-means estimator can lead to biased estimates due to recommender +interference that arises when treated and control creators compete for +exposure. We propose a "recommender choice model" that describes which item +gets exposed from a pool containing both treated and control items. By +combining a structural choice model with neural networks, this framework +directly models the interference pathway while accounting for rich +viewer-content heterogeneity. We construct a debiased estimator of the +treatment effect and prove it is $\sqrt n$-consistent and asymptotically normal +with potentially correlated samples. We validate our estimator's empirical +performance with a field experiment on Weixin short-video platform. In addition +to the standard creator-side experiment, we conduct a costly double-sided +randomization design to obtain a benchmark estimate free from interference +bias. We show that the proposed estimator yields results comparable to the +benchmark, whereas the standard difference-in-means estimator can exhibit +significant bias and even produce reversed signs. + +
+
+
+
+
+ + ♻ ☆ The False Dawn: Reevaluating Google's Reinforcement Learning for Chip + Macro Placement + + +
+ Reinforcement learning (RL) for physical design of silicon chips in a Google
+2021 Nature paper stirred controversy due to poorly documented claims that
+raised eyebrows and drew critical media coverage. The paper withheld critical
+methodology steps and most inputs needed to reproduce results. Our
+meta-analysis shows how two separate evaluations filled in the gaps and
+demonstrated that Google RL lags behind (i) human designers, (ii) a well-known
+algorithm (Simulated Annealing), and (iii) generally-available commercial
+software, while being slower; and in a 2023 open research contest, RL methods
+did not place in the top 5. Crosschecked data indicate that the integrity of
+the Nature paper is substantially undermined owing to errors in conduct,
+analysis and reporting. Before publishing, Google rebuffed internal allegations
+of fraud, which still stand. We note policy implications and conclusions for
+chip design.
+

+
+ comment: 15 pages, 1 figure, 4 tables, 83 references +
+
+
+
+
+ + ♻ ☆ Mixed Noise and Posterior Estimation with Conditional DeepGEM + + +
+ Motivated by indirect measurements and applications from nanometrology with a +mixed noise model, we develop a novel algorithm for jointly estimating the +posterior and the noise parameters in Bayesian inverse problems. We propose to +solve the problem by an expectation maximization (EM) algorithm. Based on the +current noise parameters, we learn in the E-step a conditional normalizing flow +that approximates the posterior. In the M-step, we propose to find the noise +parameter updates again by an EM algorithm, which has analytical formulas. We +compare the training of the conditional normalizing flow with the forward and +reverse KL, and show that our model is able to incorporate information from +many measurements, unlike previous approaches. + +
+
+ comment: Published in Machine Learning: Science and Technology +
+
+
+
+
+ + ♻ ☆ Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast + Cancer Segmentation and Identification + + +
+ Breast cancer poses a profound threat to lives globally, claiming numerous +lives each year. Therefore, timely detection is crucial for early intervention +and improved chances of survival. Accurately diagnosing and classifying breast +tumors using ultrasound images is a persistent challenge in medicine, demanding +cutting-edge solutions for improved treatment strategies. This research +introduces multiattention-enhanced deep learning (DL) frameworks designed for +the classification and segmentation of breast cancer tumors from ultrasound +images. A spatial channel attention mechanism is proposed for segmenting tumors +from ultrasound images, utilizing a novel LinkNet DL framework with an +InceptionResNet backbone. Following this, the paper proposes a deep +convolutional neural network with an integrated multi-attention framework +(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal. +From experimental results, it is observed that the segmentation model has +recorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also +achieved high Intersection over Union (IoU) and Dice Coefficient scores of +96.9% and 97.2%, respectively. Similarly, the classification model has attained +an accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification +framework has achieved outstanding F1-Score, precision, and recall values of +99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early +detection and accurate classification of breast cancer, this proposed work +significantly advances the field of medical image analysis, potentially +improving diagnostic precision and patient outcomes. + +
+
+ comment: 29 pages, 15 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ The impact of data set similarity and diversity on transfer learning + success in time series forecasting + + +
+ Pre-trained models have become pivotal in enhancing the efficiency and
+accuracy of time series forecasting on target data sets by leveraging transfer
+learning. While benchmarks validate the performance of model generalization on
+various target data sets, there is no structured research providing similarity
+and diversity measures to explain which characteristics of source and target
+data lead to transfer learning success. Our study is the first to
+systematically evaluate the impact of source-target similarity and source
+diversity on zero-shot and fine-tuned forecasting outcomes in terms of
+accuracy, bias, and uncertainty estimation. We investigate these dynamics using
+pre-trained neural networks across five public source datasets, applied to
+forecasting five target data sets, including real-world wholesale data. We
+identify two feature-based similarity and diversity measures, finding that
+source-target similarity reduces forecasting bias, while source diversity
+improves forecasting accuracy and uncertainty estimation, but increases the
+bias.
+

+
+
+
+
+ + ♻ ☆ Extended Flow Matching: a Method of Conditional Generation with + Generalized Continuity Equation + + +
+ The task of conditional generation is one of the most important applications
+of generative models, and numerous methods have been developed to date based on
+the celebrated flow-based models. However, many flow-based models in use today
+do not allow one to introduce an explicit inductive bias on how the generated
+conditional distribution changes with respect to the conditions. This can
+result in unexpected behavior in the task of style transfer, for example. In
+this research, we introduce extended flow matching (EFM), a direct extension of
+flow matching that learns a "matrix field" corresponding to the continuous map
+from the space of conditions to the space of distributions. We show that we can
+introduce inductive bias to the conditional generation through the matrix field
+and demonstrate this fact with MMOT-EFM, a version of EFM that aims to minimize
+the Dirichlet energy or the sensitivity of the distribution with respect to
+conditions. We present our theory along with experimental results that support
+the competitiveness of EFM in conditional generation.
+

+
+ comment: 27 pages, 10 figures, We have corrected an error in our experiment on + COT-FM +
+
+
+
+
+ + ♻ ☆ Autoencoders for Real-Time SUEP Detection + + +
+ Confining dark sectors with pseudo-conformal dynamics can produce Soft
+Unclustered Energy Patterns (SUEP) at the Large Hadron Collider: the production
+of dark quarks in proton-proton collisions leads to a dark shower and the
+high-multiplicity production of dark hadrons. The final experimental signature
+is spherically-symmetric energy deposits by an anomalously large number of soft
+Standard Model particles with a transverse energy of O(100) MeV. Assuming
+Yukawa-like couplings of the scalar portal state, the dominant production mode
+is gluon fusion, and the dominant background comes from multi-jet QCD events.
+We have developed a deep learning-based Anomaly Detection technique to reject
+QCD jets and identify any anomalous signature, including SUEP, in real-time in
+the High-Level Trigger system of the Compact Muon Solenoid experiment at the
+Large Hadron Collider. A deep convolutional neural autoencoder network has been
+trained using QCD events by taking transverse energy deposits in the inner
+tracker, electromagnetic calorimeter, and hadron calorimeter sub-detectors as
+3-channel image data. Due to the sparse nature of the data, only ~0.5% of the
+total ~300 k image pixels have non-zero values. To tackle this challenge, a
+non-standard loss function, the inverse of the so-called Dice Loss, is
+exploited. The trained autoencoder with learned spatial features of QCD jets
+can detect 40% of the SUEP events, with a QCD event mistagging rate as low as
+2%. The model inference time has been measured using the Intel Core i5-9600KF
+processor and found to be ~20 ms, which satisfies the High-Level Trigger
+system's latency budget of O(100) ms. Owing to the unsupervised nature of
+autoencoder training, the trained model can be applied to any new physics model
+that predicts an experimental signature anomalous to QCD jets.
+

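For sparse image data, overlap-based losses are a common workaround. The abstract does not spell out the exact formula it uses, so the sketch below simply implements a standard soft Dice coefficient and, as one plausible reading of "the inverse of the Dice Loss", returns its reciprocal; treat the exact functional form as an assumption, not the experiment's loss.

```python
import torch

def soft_dice(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Soft Dice coefficient between a reconstruction and a sparse target image."""
    num = 2.0 * (pred * target).sum() + eps
    den = pred.pow(2).sum() + target.pow(2).sum() + eps
    return num / den

def inverse_dice_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """One plausible reading of an 'inverse Dice' objective: 1 / Dice, which grows
    rapidly when the overlap with the few non-zero pixels vanishes (assumption)."""
    return 1.0 / soft_dice(pred, target)
```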
+
+ comment: 9 pages, 9 figures, 1 table, 1 equation +
+
+
+
+
+ + ♻ ☆ Remember This Event That Year? Assessing Temporal Information and + Reasoning in Large Language Models + + +
+ Large Language Models (LLMs) are increasingly ubiquitous, yet their ability +to retain and reason about temporal information remains limited, hindering +their application in real-world scenarios where understanding the sequential +nature of events is crucial. Our study experiments with 12 state-of-the-art +models (ranging from 2B to 70B+ parameters) on a novel numerical-temporal +dataset, \textbf{TempUN}, spanning from 10,000 BCE to 2100 CE, to uncover +significant temporal retention and comprehension limitations. We propose six +metrics to assess three learning paradigms to enhance temporal knowledge +acquisition. Our findings reveal that open-source models exhibit knowledge gaps +more frequently, suggesting a trade-off between limited knowledge and incorrect +responses. Additionally, various fine-tuning approaches significantly improved +performance, reducing incorrect outputs and impacting the identification of +'information not available' in the generations. The associated dataset and code +are available at (https://github.com/lingoiitgn/TempUN). + +
+
+
+
+
+ + ♻ ☆ Mixture-of-Subspaces in Low-Rank Adaptation + + +
+ In this paper, we introduce a subspace-inspired Low-Rank Adaptation (LoRA) +method, which is computationally efficient, easy to implement, and readily +applicable to large language, multimodal, and diffusion models. Initially, we +equivalently decompose the weights of LoRA into two subspaces, and find that +simply mixing them can enhance performance. To study such a phenomenon, we +revisit it through a fine-grained subspace lens, showing that such modification +is equivalent to employing a fixed mixer to fuse the subspaces. To be more +flexible, we jointly learn the mixer with the original LoRA weights, and term +the method Mixture-of-Subspaces LoRA (MoSLoRA). MoSLoRA consistently +outperforms LoRA on tasks in different modalities, including commonsense +reasoning, visual instruction tuning, and subject-driven text-to-image +generation, demonstrating its effectiveness and robustness. Codes are available +at https://github.com/wutaiqiang/MoSLoRA. + +
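The described modification can be sketched as a standard LoRA update with an extra trainable r x r mixer between the down- and up-projections. The PyTorch layer below is a minimal sketch under that reading; the initialization, scaling, and wrapping of an `nn.Linear` are illustrative assumptions rather than the released implementation (see the linked repository for the authors' code).

```python
import torch
import torch.nn as nn

class MoSLoRALinear(nn.Module):
    """Frozen base linear layer plus a low-rank update B @ M @ A, where the r x r
    mixer M fuses the LoRA subspaces and is trained jointly with A and B."""
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)
        self.A = nn.Parameter(0.01 * torch.randn(rank, base.in_features))   # down-projection
        self.M = nn.Parameter(torch.eye(rank))                              # learnable subspace mixer
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))         # up-projection
        self.scale = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * (x @ self.A.t() @ self.M.t() @ self.B.t())

layer = MoSLoRALinear(nn.Linear(768, 768), rank=8)
out = layer(torch.randn(4, 768))
```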
+
+ comment: work in progress

+
+
+
+
+ + ♻ ☆ Semantically Rich Local Dataset Generation for Explainable AI in + Genomics + + +
+ Black box deep learning models trained on genomic sequences excel at
+predicting the outcomes of different gene regulatory mechanisms. Therefore,
+interpreting these models may provide novel insights into the underlying
+biology, supporting downstream biomedical applications. Due to their
+complexity, interpretable surrogate models can only be built for local
+explanations (e.g., a single instance). However, accomplishing this requires
+generating a dataset in the neighborhood of the input, which must maintain
+syntactic similarity to the original data while introducing semantic
+variability in the model's predictions. This task is challenging due to the
+complex sequence-to-function relationship of DNA.
+ We propose using Genetic Programming to generate datasets by evolving
+perturbations in sequences that contribute to their semantic diversity. Our
+custom, domain-guided individual representation effectively constrains
+syntactic similarity, and we provide two alternative fitness functions that
+promote diversity with no additional computational effort. Applied to the RNA
+splicing domain, our approach quickly achieves good diversity and significantly
+outperforms a random baseline in exploring the search space, as shown in our
+proof-of-concept on a short RNA sequence. Furthermore, we assess its
+generalizability and demonstrate scalability to larger sequences, resulting in
+a ~30% improvement over the baseline.
+

+
+
+
+
+ + ♻ ☆ Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and + unfairness in dyadic regression models + + +
+ Dyadic regression models, which predict real-valued outcomes for pairs of
+entities, are fundamental in many domains (e.g. predicting the rating a user
+gives to a product in Recommender Systems) and are promising, though still
+under-explored, in many others (e.g. approximating the adequate dosage of a
+drug for a patient in personalized pharmacology). In this work, we demonstrate
+that non-uniformity in the observed value distributions of individual entities
+leads to severely biased predictions in state-of-the-art models, skewing
+predictions towards the average of observed past values for the entity and
+providing worse-than-random predictive power in eccentric yet equally important
+cases. We show that the usage of global error metrics like Root Mean Squared
+Error (RMSE) and Mean Absolute Error (MAE) is insufficient to capture this
+phenomenon, which we name eccentricity bias, and we introduce Eccentricity-Area
+Under the Curve (EAUC) as a new complementary metric that can quantify it in
+all studied models and datasets. We also prove the adequacy of EAUC by using
+naive de-biasing corrections to demonstrate that a lower model bias correlates
+with a lower EAUC and vice-versa. This work contributes a bias-aware evaluation
+of dyadic regression models to avoid potential unfairness and risks in critical
+real-world applications of such systems.
+

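The abstract does not give the exact definition of EAUC, so the function below is only an illustrative diagnostic in the same spirit: it defines an observation's eccentricity as the distance of its true value from its entity's mean observed value, bins samples by eccentricity, and integrates the error-versus-eccentricity curve. Treat every detail (binning, normalization, error choice) as an assumption, not the authors' metric.

```python
import numpy as np

def eccentricity_error_area(y_true, y_pred, entity_ids, n_bins=10):
    """Area under the mean-absolute-error curve as a function of eccentricity
    (distance of the true value from the entity's average observed value)."""
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    entity_ids = np.asarray(entity_ids)
    means = {e: y_true[entity_ids == e].mean() for e in np.unique(entity_ids)}
    ecc = np.abs(y_true - np.array([means[e] for e in entity_ids]))
    err = np.abs(y_true - y_pred)
    order = np.argsort(ecc)
    bins = [b for b in np.array_split(order, n_bins) if len(b)]
    x = np.array([ecc[b].mean() for b in bins])
    y = np.array([err[b].mean() for b in bins])
    x = (x - x.min()) / (x.max() - x.min() + 1e-12)               # normalize eccentricity axis
    return float(0.5 * np.sum((y[1:] + y[:-1]) * np.diff(x)))     # trapezoidal rule
```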
+
+
+
+
+ + ♻ ☆ Doubly Robust Causal Effect Estimation under Networked Interference via + Targeted Learning ICML 2024 + + +
+ Causal effect estimation under networked interference is an important but +challenging problem. Available parametric methods are limited in their model +space, while previous semiparametric methods, e.g., leveraging neural networks +to fit only one single nuisance function, may still encounter misspecification +problems under networked interference without appropriate assumptions on the +data generation process. To mitigate bias stemming from misspecification, we +propose a novel doubly robust causal effect estimator under networked +interference, by adapting the targeted learning technique to the training of +neural networks. Specifically, we generalize the targeted learning technique +into the networked interference setting and establish the condition under which +an estimator achieves double robustness. Based on the condition, we devise an +end-to-end causal effect estimator by transforming the identified theoretical +condition into a targeted loss. Moreover, we provide a theoretical analysis of +our designed estimator, revealing a faster convergence rate compared to a +single nuisance model. Extensive experimental results on two real-world +networks with semisynthetic data demonstrate the effectiveness of our proposed +estimators. + +
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive proximal gradient methods are universal without approximation + + +
+ We show that adaptive proximal gradient methods for convex problems are not +restricted to traditional Lipschitzian assumptions. Our analysis reveals that a +class of linesearch-free methods is still convergent under mere local H\"older +gradient continuity, covering in particular continuously differentiable +semi-algebraic functions. To mitigate the lack of local Lipschitz continuity, +popular approaches revolve around $\varepsilon$-oracles and/or linesearch +procedures. In contrast, we exploit plain H\"older inequalities not entailing +any approximation, all while retaining the linesearch-free nature of adaptive +schemes. Furthermore, we prove full sequence convergence without prior +knowledge of local H\"older constants nor of the order of H\"older continuity. +Numerical experiments make comparisons with baseline methods on diverse tasks +from machine learning covering both the locally and the globally H\"older +setting. + +
+
+
+
+
+ + ♻ ☆ Towards Audio Codec-based Speech Separation + + +
+ Recent improvements in neural audio codec (NAC) models have generated
+interest in adopting pre-trained codecs for a variety of speech processing
+applications to take advantage of the efficiencies gained from high
+compression, but these have not yet been applied to the speech separation (SS)
+task. SS can benefit from high compression because the compute required for
+traditional SS models makes them impractical for many edge computing use cases.
+However, SS is a waveform-masking task where compression tends to introduce
+distortions that severely impact performance. Here we propose a novel task of
+Audio Codec-based SS, where SS is performed within the embedding space of a
+NAC, and propose a new model, Codecformer, to address this task. At inference,
+Codecformer achieves a 52x reduction in MACs while producing separation
+performance comparable to a cloud deployment of Sepformer. This method charts a
+new direction for performing efficient SS in practical scenarios.
+

+
+ comment: This paper was accepted by Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Read Between the Layers: Leveraging Multi-Layer Representations for + Rehearsal-Free Continual Learning with Pre-Trained Models + + +
+ We address the Continual Learning (CL) problem, wherein a model must learn a +sequence of tasks from non-stationary distributions while preserving prior +knowledge upon encountering new experiences. With the advancement of foundation +models, CL research has pivoted from the initial learning-from-scratch paradigm +towards utilizing generic features from large-scale pre-training. However, +existing approaches to CL with pre-trained models primarily focus on separating +class-specific features from the final representation layer and neglect the +potential of intermediate representations to capture low- and mid-level +features, which are more invariant to domain shifts. In this work, we propose +LayUP, a new prototype-based approach to CL that leverages second-order feature +statistics from multiple intermediate layers of a pre-trained network. Our +method is conceptually simple, does not require access to prior data, and works +out of the box with any foundation model. LayUP surpasses the state of the art +in four of the seven class-incremental learning benchmarks, all three +domain-incremental learning benchmarks and in six of the seven online continual +learning benchmarks, while significantly reducing memory and computational +requirements compared to existing baselines. Our results demonstrate that fully +exhausting the representational capacities of pre-trained models in CL goes +well beyond their final embeddings. + +
+
+ comment: Accepted for publication in Transactions of Machine Learning Research + (TMLR) journal +
+
+
+
+
+ + ♻ ☆ When Representations Align: Universality in Representation Learning + Dynamics + + +
+ Deep neural networks come in many sizes and architectures. The choice of +architecture, in conjunction with the dataset and learning algorithm, is +commonly understood to affect the learned neural representations. Yet, recent +results have shown that different architectures learn representations with +striking qualitative similarities. Here we derive an effective theory of +representation learning under the assumption that the encoding map from input +to hidden representation and the decoding map from representation to output are +arbitrary smooth functions. This theory schematizes representation learning +dynamics in the regime of complex, large architectures, where hidden +representations are not strongly constrained by the parametrization. We show +through experiments that the effective theory describes aspects of +representation learning dynamics across a range of deep networks with different +activation functions and architectures, and exhibits phenomena similar to the +"rich" and "lazy" regime. While many network behaviors depend quantitatively on +architecture, our findings point to certain behaviors that are widely conserved +once models are sufficiently flexible. + +
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Blockchain-empowered Federated Learning: Benefits, Challenges, and + Solutions + + +
+ Federated learning (FL) is a distributed machine learning approach that +protects user data privacy by training models locally on clients and +aggregating them on a parameter server. While effective at preserving privacy, +FL systems face limitations such as single points of failure, lack of +incentives, and inadequate security. To address these challenges, blockchain +technology is integrated into FL systems to provide stronger security, +fairness, and scalability. However, blockchain-empowered FL (BC-FL) systems +introduce additional demands on network, computing, and storage resources. This +survey provides a comprehensive review of recent research on BC-FL systems, +analyzing the benefits and challenges associated with blockchain integration. +We explore why blockchain is applicable to FL, how it can be implemented, and +the challenges and existing solutions for its integration. Additionally, we +offer insights on future research directions for the BC-FL system. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ BenchMARL: Benchmarking Multi-Agent Reinforcement Learning + + +
+ The field of Multi-Agent Reinforcement Learning (MARL) is currently facing a +reproducibility crisis. While solutions for standardized reporting have been +proposed to address the issue, we still lack a benchmarking tool that enables +standardization and reproducibility, while leveraging cutting-edge +Reinforcement Learning (RL) implementations. In this paper, we introduce +BenchMARL, the first MARL training library created to enable standardized +benchmarking across different algorithms, models, and environments. BenchMARL +uses TorchRL as its backend, granting it high performance and maintained +state-of-the-art implementations while addressing the broad community of MARL +PyTorch users. Its design enables systematic configuration and reporting, thus +allowing users to create and run complex benchmarks from simple one-line +inputs. BenchMARL is open-sourced on GitHub: +https://github.com/facebookresearch/BenchMARL + +
+
+
+
+
+ + ♻ ☆ Learning Rate Curriculum + + +
+ Most curriculum learning methods require an approach to sort the data samples +by difficulty, which is often cumbersome to perform. In this work, we propose a +novel curriculum learning approach termed Learning Rate Curriculum (LeRaC), +which leverages the use of a different learning rate for each layer of a neural +network to create a data-agnostic curriculum during the initial training +epochs. More specifically, LeRaC assigns higher learning rates to neural layers +closer to the input, gradually decreasing the learning rates as the layers are +placed farther away from the input. The learning rates increase at various +paces during the first training iterations, until they all reach the same +value. From this point on, the neural model is trained as usual. This creates a +model-level curriculum learning strategy that does not require sorting the +examples by difficulty and is compatible with any neural network, generating +higher performance levels regardless of the architecture. We conduct +comprehensive experiments on 12 data sets from the computer vision (CIFAR-10, +CIFAR-100, Tiny ImageNet, ImageNet-200, Food-101, UTKFace, PASCAL VOC), +language (BoolQ, QNLI, RTE) and audio (ESC-50, CREMA-D) domains, considering +various convolutional (ResNet-18, Wide-ResNet-50, DenseNet-121, YOLOv5), +recurrent (LSTM) and transformer (CvT, BERT, SepTr) architectures. We compare +our approach with the conventional training regime, as well as with Curriculum +by Smoothing (CBS), a state-of-the-art data-agnostic curriculum learning +approach. Unlike CBS, our performance improvements over the standard training +regime are consistent across all data sets and models. Furthermore, we +significantly surpass CBS in terms of training time (there is no additional +cost over the standard training regime for LeRaC). Our code is freely available +at: https://github.com/CroitoruAlin/LeRaC. + +
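The data-free curriculum can be sketched as per-layer learning rates that start at the target value near the input, start lower for deeper layers, and are annealed to a shared value over the first iterations. The helpers below are a minimal sketch of that idea; the depth-dependent factors, the geometric annealing pace, and the warm-up length are illustrative assumptions, not the paper's exact schedule.

```python
import torch

def lerac_param_groups(model: torch.nn.Module, target_lr: float = 1e-3, min_factor: float = 0.1):
    """One optimizer group per top-level layer; layers closer to the input start
    at target_lr, deeper layers start at a smaller fraction of it."""
    layers = list(model.children())
    n = max(len(layers) - 1, 1)
    groups = []
    for i, layer in enumerate(layers):
        factor = min_factor + (1.0 - min_factor) * (len(layers) - 1 - i) / n
        groups.append({"params": layer.parameters(), "lr": target_lr * factor,
                       "start_lr": target_lr * factor, "target_lr": target_lr})
    return groups

def lerac_update(optimizer, iteration: int, warmup_iters: int = 1000):
    """Geometrically raise every group's lr from start_lr to the shared target_lr
    during warm-up; afterwards all groups use the same learning rate."""
    t = min(iteration / warmup_iters, 1.0)
    for g in optimizer.param_groups:
        g["lr"] = g["start_lr"] * (g["target_lr"] / g["start_lr"]) ** t

model = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
opt = torch.optim.SGD(lerac_param_groups(model), lr=1e-3, momentum=0.9)
lerac_update(opt, iteration=100)
```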
+
+ comment: Accepted at the International Journal of Computer Vision +
+
+
+
+
+ + ♻ ☆ Deep Learning-Based Spatiotemporal Multi-Event Reconstruction for Delay + Line Detectors + + +
+ Accurate observation of two or more particles within a very narrow time +window has always been a challenge in modern physics. It creates the +possibility of correlation experiments, such as the ground-breaking Hanbury +Brown-Twiss experiment, leading to new physical insights. For low-energy +electrons, one possibility is to use a microchannel plate with subsequent delay +lines for the readout of the incident particle hits, a setup called a Delay +Line Detector. The spatial and temporal coordinates of more than one particle +can be fully reconstructed outside a region called the dead radius. For +interesting events, where two electrons are close in space and time, the +determination of the individual positions of the electrons requires elaborate +peak finding algorithms. While classical methods work well with single particle +hits, they fail to identify and reconstruct events caused by multiple nearby +particles. To address this challenge, we present a new spatiotemporal machine +learning model to identify and reconstruct the position and time of such +multi-hit particle signals. This model achieves a much better resolution for +nearby particle hits compared to the classical approach, removing some of the +artifacts and reducing the dead radius by half. We show that machine learning +models can be effective in improving the spatiotemporal performance of delay +line detectors. + +
+
+
+
+
+ + ♻ ☆ FUTURE-AI: International consensus guideline for trustworthy and + deployable artificial intelligence in healthcare + + +
+ Despite major advances in artificial intelligence (AI) for medicine and +healthcare, the deployment and adoption of AI technologies remain limited in +real-world clinical practice. In recent years, concerns have been raised about +the technical, clinical, ethical and legal risks associated with medical AI. To +increase real world adoption, it is essential that medical AI tools are trusted +and accepted by patients, clinicians, health organisations and authorities. +This work describes the FUTURE-AI guideline as the first international +consensus framework for guiding the development and deployment of trustworthy +AI tools in healthcare. The FUTURE-AI consortium was founded in 2021 and +currently comprises 118 inter-disciplinary experts from 51 countries +representing all continents, including AI scientists, clinicians, ethicists, +and social scientists. Over a two-year period, the consortium defined guiding +principles and best practices for trustworthy AI through an iterative process +comprising an in-depth literature review, a modified Delphi survey, and online +consensus meetings. The FUTURE-AI framework was established based on 6 guiding +principles for trustworthy AI in healthcare, i.e. Fairness, Universality, +Traceability, Usability, Robustness and Explainability. Through consensus, a +set of 28 best practices were defined, addressing technical, clinical, legal +and socio-ethical dimensions. The recommendations cover the entire lifecycle of +medical AI, from design, development and validation to regulation, deployment, +and monitoring. FUTURE-AI is a risk-informed, assumption-free guideline which +provides a structured approach for constructing medical AI tools that will be +trusted, deployed and adopted in real-world practice. Researchers are +encouraged to take the recommendations into account in proof-of-concept stages +to facilitate future translation towards clinical practice of medical AI. + +
+
+
+
+
+ + ♻ ☆ FuXi-S2S: A machine learning model that outperforms conventional global + subseasonal forecast models + + +
+ Skillful subseasonal forecasts are crucial for various sectors of society but +pose a grand scientific challenge. Recently, machine learning based weather +forecasting models outperform the most successful numerical weather predictions +generated by the European Centre for Medium-Range Weather Forecasts (ECMWF), +but have not yet surpassed conventional models at subseasonal timescales. This +paper introduces FuXi Subseasonal-to-Seasonal (FuXi-S2S), a machine learning +model that provides global daily mean forecasts up to 42 days, encompassing +five upper-air atmospheric variables at 13 pressure levels and 11 surface +variables. FuXi-S2S, trained on 72 years of daily statistics from ECMWF ERA5 +reanalysis data, outperforms the ECMWF's state-of-the-art +Subseasonal-to-Seasonal model in ensemble mean and ensemble forecasts for total +precipitation and outgoing longwave radiation, notably enhancing global +precipitation forecast. The improved performance of FuXi-S2S can be primarily +attributed to its superior capability to capture forecast uncertainty and +accurately predict the Madden-Julian Oscillation (MJO), extending the skillful +MJO prediction from 30 days to 36 days. Moreover, FuXi-S2S not only captures +realistic teleconnections associated with the MJO, but also emerges as a +valuable tool for discovering precursor signals, offering researchers insights +and potentially establishing a new paradigm in Earth system science research. + +
+
+
+
+
+ + ♻ ☆ Representation Surgery: Theory and Practice of Affine Steering ICML 2024 + + +
+ Language models often exhibit undesirable behavior, e.g., generating toxic or
+gender-biased text. In the case of neural language models, an encoding of the
+undesirable behavior is often present in the model's representations. Thus, one
+natural (and common) approach to prevent the model from exhibiting undesirable
+behavior is to steer the model's representations in a manner that reduces the
+probability of it generating undesirable text. This paper investigates the
+formal and empirical properties of steering functions, i.e., transformations of
+the neural language model's representations that alter its behavior. First, we
+derive two optimal, in the least-squares sense, affine steering functions under
+different constraints. Our theory provides justification for existing
+approaches and offers a novel, improved steering approach. Second, we offer a
+series of experiments that demonstrate the empirical effectiveness of the
+methods in mitigating bias and reducing toxic generation.
+

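Applying an affine steering function at inference time can be as simple as registering a forward hook that maps a chosen layer's hidden states h to Wh + b. The sketch below shows only that mechanical step under generic assumptions; how W and b are estimated (e.g., the least-squares constructions studied in the paper) is not shown, and the handling of tuple outputs is an assumption about the wrapped module.

```python
import torch

class AffineSteer:
    """Steer a module's output activations with a fixed affine map h -> h @ W.T + b."""
    def __init__(self, module: torch.nn.Module, W: torch.Tensor, b: torch.Tensor):
        self.W, self.b = W, b
        self.handle = module.register_forward_hook(self._hook)

    def _hook(self, module, inputs, output):
        hidden = output[0] if isinstance(output, tuple) else output
        steered = hidden @ self.W.T + self.b
        return (steered,) + output[1:] if isinstance(output, tuple) else steered

    def remove(self):
        self.handle.remove()

# Toy usage: identity weights plus a constant shift on a linear layer's output.
layer = torch.nn.Linear(16, 16)
steer = AffineSteer(layer, W=torch.eye(16), b=0.5 * torch.ones(16))
_ = layer(torch.randn(2, 16))
steer.remove()
```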
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence in Industry 4.0: A Review of Integration + Challenges for Industrial Systems + + +
+ In Industry 4.0, Cyber-Physical Systems (CPS) generate vast data sets that +can be leveraged by Artificial Intelligence (AI) for applications including +predictive maintenance and production planning. However, despite the +demonstrated potential of AI, its widespread adoption in sectors like +manufacturing remains limited. Our comprehensive review of recent literature, +including standards and reports, pinpoints key challenges: system integration, +data-related issues, managing workforce-related concerns and ensuring +trustworthy AI. A quantitative analysis highlights particular challenges and +topics that are important for practitioners but still need to be sufficiently +investigated by academics. The paper briefly discusses existing solutions to +these challenges and proposes avenues for future research. We hope that this +survey serves as a resource for practitioners evaluating the cost-benefit +implications of AI in CPS and for researchers aiming to address these urgent +challenges. + +
+
+ comment: 17 pages, 4 figures, 1 table, accepted for the 22nd IEEE + International Conference on Industrial Informatics (INDIN) +
+
+
+
+
+ + ♻ ☆ Second Maximum of a Gaussian Random Field and Exact (t-)Spacing test + + +
+ In this article, we introduce the novel concept of the second maximum of a +Gaussian random field on a Riemannian submanifold. This second maximum serves +as a powerful tool for characterizing the distribution of the maximum. By +utilizing an ad-hoc Kac Rice formula, we derive the explicit form of the +maximum's distribution, conditioned on the second maximum and some regressed +component of the Riemannian Hessian. This approach results in an exact test, +based on the evaluation of spacing between these maxima, which we refer to as +the spacing test. + We investigate the applicability of this test in detecting sparse +alternatives within Gaussian symmetric tensors, continuous sparse +deconvolution, and two-layered neural networks with smooth rectifiers. Our +theoretical results are supported by numerical experiments, which illustrate +the calibration and power of the proposed tests. More generally, this test can +be applied to any Gaussian random field on a Riemannian manifold, and we +provide a general framework for the application of the spacing test in +continuous sparse kernel regression. + Furthermore, when the variance-covariance function of the Gaussian random +field is known up to a scaling factor, we derive an exact Studentized version +of our test, coined the $t$-spacing test. This test is perfectly calibrated +under the null hypothesis and has high power for detecting sparse alternatives. + +
+
+ comment: 5 figures, 22 pages main document, 2 pages supplements +
+
+
+
+
+ + ♻ ☆ Implicit regularization of deep residual networks towards neural ODEs ICLR 2024 + + +
+ Residual neural networks are state-of-the-art deep learning models. Their +continuous-depth analog, neural ordinary differential equations (ODEs), are +also widely used. Despite their success, the link between the discrete and +continuous models still lacks a solid mathematical foundation. In this article, +we take a step in this direction by establishing an implicit regularization of +deep residual networks towards neural ODEs, for nonlinear networks trained with +gradient flow. We prove that if the network is initialized as a discretization +of a neural ODE, then such a discretization holds throughout training. Our +results are valid for a finite training time, and also as the training time +tends to infinity provided that the network satisfies a Polyak-Lojasiewicz +condition. Importantly, this condition holds for a family of residual networks +where the residuals are two-layer perceptrons with an overparameterization in +width that is only linear, and implies the convergence of gradient flow to a +global minimum. Numerical experiments illustrate our results. + +
+
+ comment: ICLR 2024 (spotlight). 40 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ MRPD: Undersampled MRI reconstruction by prompting a large latent + diffusion model + + +
+ Implicit visual knowledge in a large latent diffusion model (LLDM) +pre-trained on natural images is rich and hypothetically universal to natural +and medical images. To test this hypothesis from a practical perspective, we +propose a novel framework for undersampled MRI Reconstruction by Prompting a +large latent Diffusion model (MRPD). While the existing methods trained on MRI +datasets are typically of limited generalizability toward diverse data +acquisition scenarios, MRPD supports unsupervised and universally adaptive MRI +reconstruction. For unsupervised reconstruction, MRSampler guides LLDM with a +random-phase-modulated hard-to-soft control. With any single- or +multiple-source MRI dataset, MRPD's performance is boosted universally by a +lightweight MRAdapter that only finetunes the LLDM's autoencoder. Experiments +on FastMRI and IXI show that MRPD is the only model that supports both MRI +database-free and database-available scenarios and attains the best +generalizability towards out-of-domain (OOD) samplings, contrasts, and organs +among compared unsupervised, supervised, and MRI diffusion methods. To our +knowledge, MRPD is the first method that empirically shows the universal +prowess of an LLDM pre-trained on vast natural images for MRI. Our official +implementation is at https://github.com/Z7Gao/MRPD. + +
+
+ comment: 10 pages, 5 figures, 7 tables, 1 pseudocode +
+
+
+
+
+ + ♻ ☆ Semiring Activation in Neural Networks + + +
+ We introduce a class of trainable nonlinear operators based on semirings that
+are suitable for use in neural networks. These operators generalize the
+traditional alternation of linear operators with activation functions in neural
+networks. Semirings are algebraic structures that describe a generalised notion
+of linearity, greatly expanding the range of trainable operators that can be
+included in neural networks. In fact, max- or min-pooling operations are
+convolutions in the tropical semiring with a fixed kernel.
+ We perform experiments in which we replace the activation functions with
+trainable semiring-based operators to show that these are viable operations to
+include in fully connected as well as convolutional neural networks (ConvNeXt).
+We discuss some of the challenges of replacing traditional activation functions
+with trainable semiring activations and the trade-offs of doing so.
+

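The remark that max-pooling is a tropical-semiring convolution with a fixed kernel suggests a direct way to make such operators trainable: replace the fixed zero kernel with learnable additive weights. The layer below is a minimal (max, +) 1D example written as an illustration; the initialization and the restriction to a per-channel kernel are assumptions, not the operators evaluated in the paper.

```python
import torch
import torch.nn as nn

class MaxPlusConv1d(nn.Module):
    """Trainable 1D convolution in the (max, +) tropical semiring:
    out[b, c, t] = max_k ( x[b, c, t + k] + w[c, k] ).  With w = 0 this reduces
    to ordinary max-pooling with stride 1."""
    def __init__(self, channels: int, kernel_size: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(channels, kernel_size))
        self.kernel_size = kernel_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:    # x: (batch, channels, length)
        patches = x.unfold(-1, self.kernel_size, 1)         # (B, C, L - K + 1, K)
        return (patches + self.weight[None, :, None, :]).amax(dim=-1)

layer = MaxPlusConv1d(channels=4, kernel_size=3)
out = layer(torch.randn(2, 4, 32))                          # -> (2, 4, 30)
```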
+
+
+
+
+ + ♻ ☆ Graph Neural Networks as an Enabler of Terahertz-based Flow-guided + Nanoscale Localization over Highly Erroneous Raw Data + + +
+ Contemporary research advances in nanotechnology and material science are +rooted in the emergence of nanodevices as a versatile tool that harmonizes +sensing, computing, wireless communication, data storage, and energy +harvesting. These devices offer novel pathways for disease diagnostics, +treatment, and monitoring within the bloodstreams. Ensuring precise +localization of events of diagnostic interest, which underpins the concept of +flow-guided in-body nanoscale localization, would provide an added diagnostic +value to the detected events. Raw data generated by the nanodevices is pivotal +for this localization and consist of an event detection indicator and the time +elapsed since the last passage of a nanodevice through the heart. The energy +constraints of the nanodevices lead to intermittent operation and unreliable +communication, intrinsically affecting this data. This posits a need for +comprehensively modelling the features of this data. These imperfections also +have profound implications for the viability of existing flow-guided +localization approaches, which are ill-prepared to address the intricacies of +the environment. Our first contribution lies in an analytical model of raw data +for flow-guided localization, dissecting how communication and energy +capabilities influence the nanodevices' data output. This model acts as a vital +bridge, reconciling idealized assumptions with practical challenges of +flow-guided localization. Toward addressing these practical challenges, we also +present an integration of Graph Neural Networks (GNNs) into the flow-guided +localization paradigm. GNNs excel in capturing complex dynamic interactions +inherent to the localization of events sensed by the nanodevices. Our results +highlight the potential of GNNs not only to enhance localization accuracy but +also extend coverage to encompass the entire bloodstream. + +
+
+ comment: 16 pages, 16 figures, 6 tables, 45 references +
+
+
+
+
+ + ♻ ☆ Temporal Knowledge Graph Question Answering: A Survey + + +
+ Knowledge Base Question Answering (KBQA) has been a long-standing field to +answer questions based on knowledge bases. Recently, the evolving dynamics of +knowledge have attracted a growing interest in Temporal Knowledge Graph +Question Answering (TKGQA), an emerging task to answer temporal questions. +However, this field grapples with ambiguities in defining temporal questions +and lacks a systematic categorization of existing methods for TKGQA. In +response, this paper provides a thorough survey from two perspectives: the +taxonomy of temporal questions and the methodological categorization for TKGQA. +Specifically, we first establish a detailed taxonomy of temporal questions +engaged in prior studies. Subsequently, we provide a comprehensive review of +TKGQA techniques of two categories: semantic parsing-based and TKG +embedding-based. Building on this review, the paper outlines potential research +directions aimed at advancing the field of TKGQA. This work aims to serve as a +comprehensive reference for TKGQA and to stimulate further research. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ MT-HCCAR: Multi-Task Deep Learning with Hierarchical Classification and + Attention-based Regression for Cloud Property Retrieval ECML + + +
+ In the realm of Earth science, effective cloud property retrieval, +encompassing cloud masking, cloud phase classification, and cloud optical +thickness (COT) prediction, remains pivotal. Traditional methodologies +necessitate distinct models for each sensor instrument due to their unique +spectral characteristics. Recent strides in Earth Science research have +embraced machine learning and deep learning techniques to extract features from +satellite datasets' spectral observations. However, prevailing approaches lack +novel architectures accounting for hierarchical relationships among retrieval +tasks. Moreover, considering the spectral diversity among existing sensors, the +development of models with robust generalization capabilities over different +sensor datasets is imperative. Surprisingly, there is a dearth of methodologies +addressing the selection of an optimal model for diverse datasets. In response, +this paper introduces MT-HCCAR, an end-to-end deep learning model employing +multi-task learning to simultaneously tackle cloud masking, cloud phase +retrieval (classification tasks), and COT prediction (a regression task). The +MT-HCCAR integrates a hierarchical classification network (HC) and a +classification-assisted attention-based regression network (CAR), enhancing +precision and robustness in cloud labeling and COT prediction. Additionally, a +comprehensive model selection method rooted in K-fold cross-validation, one +standard error rule, and two introduced performance scores is proposed to +select the optimal model over three simulated satellite datasets OCI, VIIRS, +and ABI. The experiments comparing MT-HCCAR with baseline methods, the ablation +studies, and the model selection affirm the superiority and the generalization +capabilities of MT-HCCAR. + +
+
+ comment: 14 pages, 3 figures, accepted by ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Revitalizing Multivariate Time Series Forecasting: Learnable + Decomposition with Inter-Series Dependencies and Intra-Series Variations + Modeling + + +
+ Predicting multivariate time series is crucial, demanding precise modeling of
+intricate patterns, including inter-series dependencies and intra-series
+variations. Distinctive trend characteristics in each time series pose
+challenges, and existing methods, relying on basic moving average kernels, may
+struggle with the non-linear structure and complex trends in real-world data.
+Given that, we introduce a learnable decomposition strategy to capture dynamic
+trend information more reasonably. Additionally, we propose a dual attention
+module tailored to capture inter-series dependencies and intra-series
+variations simultaneously for better time series forecasting, which is
+implemented by channel-wise self-attention and autoregressive self-attention.
+To evaluate the effectiveness of our method, we conducted experiments across
+eight open-source datasets and compared it with the state-of-the-art methods.
+The comparison shows that our Leddam (LEarnable Decomposition and Dual
+Attention Module) not only demonstrates significant advances in predictive
+performance, but its decomposition strategy can also be plugged into other
+methods, yielding a large performance boost with MSE error reductions ranging
+from 11.87% to 48.56%.
+

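The learnable-decomposition idea can be illustrated by replacing a fixed moving-average kernel with a trainable, softmax-normalized smoothing kernel that splits each series into trend and remainder. The module below is a sketch of that idea only; kernel size, padding mode, and normalization are assumptions, and the paper's Leddam module (and its dual attention) is not reproduced here.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class LearnableDecomposition(nn.Module):
    """Split (batch, length, channels) series into trend + remainder with a
    learnable, softmax-normalized smoothing kernel shared across channels."""
    def __init__(self, kernel_size: int = 25):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(kernel_size))
        self.kernel_size = kernel_size

    def forward(self, x: torch.Tensor):
        B, L, C = x.shape
        w = torch.softmax(self.logits, dim=0).view(1, 1, -1)              # (1, 1, K)
        xt = x.transpose(1, 2).reshape(B * C, 1, L)                       # (B*C, 1, L)
        pad = self.kernel_size // 2
        xt = F.pad(xt, (pad, self.kernel_size - 1 - pad), mode="replicate")
        trend = F.conv1d(xt, w).reshape(B, C, L).transpose(1, 2)          # (B, L, C)
        return trend, x - trend

decomp = LearnableDecomposition(kernel_size=25)
trend, remainder = decomp(torch.randn(8, 96, 7))
```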
+
+
+
+
+ + ♻ ☆ FuXi-ENS: A machine learning model for medium-range ensemble weather + forecasting + + +
+ Ensemble forecasting is crucial for improving weather predictions, especially
+for forecasts of extreme events. Constructing an ensemble prediction system
+(EPS) based on conventional NWP models is highly computationally expensive. ML
+models have emerged as valuable tools for deterministic weather forecasts,
+providing forecasts with significantly reduced computational requirements and
+even surpassing the forecast performance of traditional NWP models. However,
+challenges arise when applying ML models to ensemble forecasting. Recent ML
+models, such as GenCast and SEEDS, rely on the ERA5 EDA or operational NWP
+ensemble members for forecast generation, and their spatial resolution is also
+considered too coarse for many applications. To overcome these limitations, we
+introduce FuXi-ENS, an advanced ML model designed to deliver 6-hourly global
+ensemble weather forecasts up to 15 days. This model runs at a significantly
+increased spatial resolution of 0.25 degrees, incorporating 5 atmospheric
+variables at 13 pressure levels, along with 13 surface variables. By leveraging
+the inherent probabilistic nature of the Variational AutoEncoder (VAE),
+FuXi-ENS optimizes a loss function that combines the CRPS and the KL divergence
+between the predicted and target distributions, facilitating the incorporation
+of flow-dependent perturbations in both initial conditions and forecasts. This
+innovative approach makes FuXi-ENS an advance over traditional approaches that
+use an L1 loss combined with the KL loss in standard VAE models for ensemble
+weather forecasting. Results demonstrate that FuXi-ENS outperforms ensemble
+forecasts from ECMWF, a world-leading NWP system, in terms of CRPS for 98.1% of
+the 360 combinations of variable and forecast lead time. This achievement
+underscores the potential of the FuXi-ENS model to enhance ensemble weather
+forecasts, offering a promising direction for further development in this
+field.
+
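+ The loss described above pairs an ensemble CRPS term with a KL term. A hedged
+sketch of one common way to write such a combination is below: a sample-based
+CRPS estimator plus the usual Gaussian KL of a VAE posterior. The tensor
+shapes, the Gaussian form of the KL, and the weight beta are assumptions made
+for illustration rather than details taken from FuXi-ENS.
+
+import torch
+
+def crps_ensemble(forecasts, obs):
+    # forecasts: (members, ...) ensemble samples; obs: (...) verifying field
+    skill = (forecasts - obs.unsqueeze(0)).abs().mean(dim=0)
+    spread = (forecasts.unsqueeze(0) - forecasts.unsqueeze(1)).abs().mean(dim=(0, 1))
+    return (skill - 0.5 * spread).mean()
+
+def gaussian_kl(mu, logvar):
+    return -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
+
+def combined_loss(forecasts, obs, mu, logvar, beta=0.01):
+    return crps_ensemble(forecasts, obs) + beta * gaussian_kl(mu, logvar)
+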
+
+
+
+
+ + ♻ ☆ Causality for Tabular Data Synthesis: A High-Order Structure Causal + Benchmark Framework + + +
+ Tabular synthesis models remain ineffective at capturing complex
+dependencies, and the quality of synthetic data is still insufficient for
+comprehensive downstream tasks, such as prediction under distribution shifts,
+automated decision-making, and cross-table understanding. A major challenge is
+the lack of prior knowledge about underlying structures and high-order
+relationships in tabular data. We argue that a systematic evaluation of
+high-order structural information for tabular data synthesis is the first step
+towards solving the problem. In this paper, we introduce high-order structural
+causal information as natural prior knowledge and provide a benchmark framework
+for the evaluation of tabular synthesis models. The framework allows us to
+generate benchmark datasets with a flexible range of data generation processes
+and to train tabular synthesis models using these datasets for further
+evaluation. We propose multiple benchmark tasks, high-order metrics, and causal
+inference tasks as downstream tasks for evaluating the quality of synthetic
+data generated by the trained models. Our experiments demonstrate how the
+benchmark framework can be leveraged to evaluate a model's ability to capture
+high-order structural causal information. Furthermore, our benchmarking results
+provide an initial assessment of state-of-the-art tabular synthesis models,
+clearly revealing significant gaps between ideal and actual performance and how
+baseline methods differ. Our benchmark framework is available at
+https://github.com/TURuibo/CauTabBench.
+
+
+
+
+
+ + ♻ ☆ Shedding the Bits: Pushing the Boundaries of Quantization with + Minifloats on FPGAs + + +
+ Post-training quantization (PTQ) is a powerful technique for model
+compression, reducing the numerical precision in neural networks without
+additional training overhead. Recent works have investigated adopting 8-bit
+floating-point formats (FP8) in the context of PTQ for model inference.
+However, floating-point formats smaller than 8 bits, and their comparison with
+integer formats in terms of accuracy and hardware cost, remain unexplored on
+FPGAs. In this work, we present minifloats, which are reduced-precision
+floating-point formats capable of further reducing the memory footprint,
+latency, and energy cost of a model while approaching full-precision model
+accuracy. We implement a custom FPGA-based multiply-accumulate operator library
+and explore the vast design space, comparing minifloat and integer
+representations across 3 to 8 bits for both weights and activations. We also
+examine the applicability of various integer-based quantization techniques to
+minifloats. Our experiments show that minifloats offer a promising alternative
+for emerging workloads such as vision transformers.
+
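+ To give a feel for what a minifloat format does, the toy NumPy routine below
+rounds values to the nearest representable number of a small format with a
+sign bit, a few exponent bits, and a few mantissa bits (ignoring subnormals,
+infinities, and NaNs). The bit widths and the flush-to-zero and clamping
+behaviour are simplifying assumptions made for this sketch; the paper's
+formats and FPGA operators are more involved.
+
+import numpy as np
+
+def quantize_minifloat(x, exp_bits=3, man_bits=2):
+    bias = 2 ** (exp_bits - 1) - 1
+    e_min, e_max = 1 - bias, (2 ** exp_bits - 2) - bias   # normal exponent range
+    sign, mag = np.sign(x), np.abs(x)
+    e = np.clip(np.floor(np.log2(np.maximum(mag, 1e-38))), e_min, e_max)
+    step = 2.0 ** (e - man_bits)                           # value spacing at that exponent
+    q = np.round(mag / step) * step                        # round mantissa to man_bits
+    max_val = (2 - 2.0 ** -man_bits) * 2.0 ** e_max        # largest finite value
+    return sign * np.minimum(q, max_val)
+
+print(quantize_minifloat(np.array([0.07, 0.3, 1.234, 9.7])))
+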
+
+ comment: Accepted in FPL (International Conference on Field-Programmable Logic + and Applications) 2024 conference. Revised with updated results +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning with Dynamic Graphs for Adaptive Informative + Path Planning + + +
+ Autonomous robots are often employed for data collection due to their
+efficiency and low labour costs. A key task in robotic data acquisition is
+planning paths through an initially unknown environment to collect observations
+given platform-specific resource constraints, such as limited battery life.
+Adaptive online path planning in 3D environments is challenging due to the
+large set of valid actions and the presence of unknown occlusions. To address
+these issues, we propose a novel deep reinforcement learning approach for
+adaptively replanning robot paths to map targets of interest in unknown 3D
+environments. A key aspect of our approach is a dynamically constructed graph
+that restricts planning to actions local to the robot, allowing us to react to
+newly discovered static obstacles and targets of interest. For replanning, we
+propose a new reward function that balances exploring the unknown environment
+with exploiting online-discovered targets of interest. Our experiments show
+that our method enables more efficient target discovery compared to
+state-of-the-art learning and non-learning baselines. We also showcase our
+approach for orchard monitoring using an unmanned aerial vehicle in a
+photorealistic simulator. We open-source our code and model at:
+https://github.com/dmar-bonn/ipp-rl-3d.
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Value-Incentivized Preference Optimization: A Unified Approach to Online + and Offline RLHF + + +
+ Reinforcement learning from human feedback (RLHF) has demonstrated great +promise in aligning large language models (LLMs) with human preference. +Depending on the availability of preference data, both online and offline RLHF +are active areas of investigation. A key bottleneck is understanding how to +incorporate uncertainty estimation in the reward function learned from the +preference data for RLHF, regardless of how the preference data is collected. +While the principles of optimism or pessimism under uncertainty are +well-established in standard reinforcement learning (RL), a +practically-implementable and theoretically-grounded form amenable to large +language models is not yet available, as standard techniques for constructing +confidence intervals become intractable under arbitrary policy +parameterizations. + In this paper, we introduce a unified approach to online and offline RLHF -- +value-incentivized preference optimization (VPO) -- which regularizes the +maximum-likelihood estimate of the reward function with the corresponding value +function, modulated by a $\textit{sign}$ to indicate whether the optimism or +pessimism is chosen. VPO also directly optimizes the policy with implicit +reward modeling, and therefore shares a simpler RLHF pipeline similar to direct +preference optimization. Theoretical guarantees of VPO are provided for both +online and offline settings, matching the rates of their standard RL +counterparts. Moreover, experiments on text summarization and dialog verify the +practicality and effectiveness of VPO. + +
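+ Schematically, the objective described above couples a maximum-likelihood
+(Bradley-Terry) reward-modeling loss with a value term whose sign selects
+optimism or pessimism. The snippet below is a guess at the general shape of
+such a loss with placeholder tensors and an assumed weight alpha; it is not
+the authors' implementation, and the sign convention is ours.
+
+import torch
+import torch.nn.functional as F
+
+def vpo_style_loss(r_chosen, r_rejected, value_estimate, sign=-1.0, alpha=0.1):
+    # r_chosen / r_rejected: rewards of preferred / dispreferred responses
+    mle_loss = -F.logsigmoid(r_chosen - r_rejected).mean()
+    # sign = -1 rewards large value estimates (optimism), +1 penalises them (pessimism)
+    return mle_loss + sign * alpha * value_estimate.mean()
+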
+
+
+
+
+ + ♻ ☆ Engression: Extrapolation through the Lens of Distributional Regression + + +
+ Distributional regression aims to estimate the full conditional distribution +of a target variable, given covariates. Popular methods include linear and +tree-ensemble based quantile regression. We propose a neural network-based +distributional regression methodology called `engression'. An engression model +is generative in the sense that we can sample from the fitted conditional +distribution and is also suitable for high-dimensional outcomes. Furthermore, +we find that modelling the conditional distribution on training data can +constrain the fitted function outside of the training support, which offers a +new perspective to the challenging extrapolation problem in nonlinear +regression. In particular, for `pre-additive noise' models, where noise is +added to the covariates before applying a nonlinear transformation, we show +that engression can successfully perform extrapolation under some assumptions +such as monotonicity, whereas traditional regression approaches such as +least-squares or quantile regression fall short under the same assumptions. Our +empirical results, from both simulated and real data, validate the +effectiveness of the engression method and indicate that the pre-additive noise +model is typically suitable for many real-world scenarios. The software +implementations of engression are available in both R and Python. + +
+
+
+
+
+ + ♻ ☆ Gotta match 'em all: Solution diversification in graph matching matched + filters + + +
+ We present a novel approach for finding multiple noisily embedded template
+graphs in a very large background graph. Our method builds upon the
+graph-matching-matched-filter technique proposed in Sussman et al., with the
+discovery of multiple diverse matchings being achieved by iteratively
+penalizing a suitable node-pair similarity matrix in the matched filter
+algorithm. In addition, we propose algorithmic speed-ups that greatly enhance
+the scalability of our matched-filter approach. We present theoretical
+justification of our methodology in the setting of correlated Erdos-Renyi
+graphs, showing its ability to sequentially discover multiple templates under
+mild model conditions. We additionally demonstrate our method's utility via
+extensive experiments using both simulated models and real-world datasets,
+including human brain connectomes and a large transactional knowledge base.
+
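+ The diversification mechanism sketched above (iteratively penalising a
+node-pair similarity matrix) can be illustrated with a toy loop: run a matching
+step, record the matched pairs, then subtract a penalty from their similarity
+entries so that subsequent runs are pushed toward different embeddings. The
+linear-assignment matcher and the penalty value here are stand-ins chosen for
+the example, not the paper's matched-filter algorithm.
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def diverse_matchings(similarity, n_matchings=3, penalty=1e3):
+    sim = np.array(similarity, dtype=float)
+    found = []
+    for _ in range(n_matchings):
+        rows, cols = linear_sum_assignment(-sim)   # maximise total similarity
+        found.append(list(zip(rows.tolist(), cols.tolist())))
+        sim[rows, cols] -= penalty                 # discourage reusing these node pairs
+    return found
+
+matchings = diverse_matchings(np.random.rand(6, 10))
+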
+
+ comment: 27 pages, 12 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for + Sparse Architectural Large Language Models + + +
+ Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large
+Language Models (LLMs) with constrained resources. Although there have been
+various PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture
+LLMs is still underexplored. In this work, we study PEFT methods for LLMs with
+the Mixture-of-Experts (MoE) architecture, and our contributions are mainly
+threefold: (1) We investigate the dispersion degree of the activated experts in
+customized tasks, and find that the routing distribution for a specific task
+tends to be highly concentrated, while the distribution of activated experts
+varies significantly across different tasks. (2) We propose Expert-Specialized
+Fine-Tuning, or ESFT, which tunes the experts most relevant to downstream tasks
+while freezing the other experts and modules; experimental results demonstrate
+that our method not only improves the tuning efficiency, but also matches or
+even surpasses the performance of full-parameter fine-tuning. (3) We further
+analyze the impact of the MoE architecture on expert-specialized fine-tuning.
+We find that MoE models with finer-grained experts are more advantageous in
+selecting the combination of experts that are most relevant to downstream
+tasks, thereby enhancing both the training efficiency and effectiveness. Our
+code is available at https://github.com/deepseek-ai/ESFT.
+
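+ A minimal PyTorch-style sketch of the recipe in point (2) above: count how
+often the router activates each expert on the task's data, then unfreeze only
+the top-k experts and keep everything else frozen. The parameter-naming
+convention and the top-k choice below are assumptions for illustration, not
+the released ESFT code.
+
+def esft_prepare(model, expert_activation_counts, top_k=2):
+    # expert_activation_counts: {expert_param_prefix: tokens routed to that expert}
+    relevant = sorted(expert_activation_counts,
+                      key=expert_activation_counts.get, reverse=True)[:top_k]
+    for name, param in model.named_parameters():
+        # freeze all parameters, then re-enable those belonging to the relevant experts
+        param.requires_grad = any(prefix in name for prefix in relevant)
+    return relevant
+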
+
+
+
+
+ + ♻ ☆ Identification of Novel Modes in Generative Models via Fourier-based + Differential Clustering + + +
+ An interpretable comparison of generative models requires the identification +of sample types produced more frequently by each of the involved models. While +several quantitative scores have been proposed in the literature to rank +different generative models, such score-based evaluations do not reveal the +nuanced differences between the generative models in capturing various sample +types. In this work, we attempt to solve a differential clustering problem to +detect sample types expressed differently by two generative models. To solve +the differential clustering problem, we propose a method called Fourier-based +Identification of Novel Clusters (FINC) to identify modes produced by a +generative model with a higher frequency in comparison to a reference +distribution. FINC provides a scalable stochastic algorithm based on random +Fourier features to estimate the eigenspace of kernel covariance matrices of +two generative models and utilize the principal eigendirections to detect the +sample types present more dominantly in each model. We demonstrate the +application of the FINC method to large-scale computer vision datasets and +generative model frameworks. Our numerical results suggest the scalability of +the developed Fourier-based method in highlighting the sample types produced +with different frequencies by widely-used generative models. Code is available +at \url{https://github.com/buyeah1109/FINC} + +
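+ The random-Fourier-feature idea above can be illustrated in a few lines:
+approximate a Gaussian-kernel feature map for samples from each model, form the
+difference of the two empirical feature covariances, and inspect its leading
+eigendirections to see which modes one model emphasises over the other. The
+feature dimension, bandwidth, and Gaussian kernel choice are arbitrary
+assumptions for this sketch.
+
+import numpy as np
+
+def rff(x, n_features=512, bandwidth=1.0, seed=0):
+    rng = np.random.default_rng(seed)          # same seed -> shared features for both models
+    w = rng.normal(scale=1.0 / bandwidth, size=(x.shape[1], n_features))
+    b = rng.uniform(0, 2 * np.pi, size=n_features)
+    return np.sqrt(2.0 / n_features) * np.cos(x @ w + b)
+
+def differential_directions(samples_a, samples_b, top=5):
+    phi_a, phi_b = rff(samples_a), rff(samples_b)
+    diff = phi_a.T @ phi_a / len(phi_a) - phi_b.T @ phi_b / len(phi_b)
+    eigvals, eigvecs = np.linalg.eigh(diff)
+    return eigvals[-top:], eigvecs[:, -top:]    # directions where model A has more mass
+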
+
+
+
+
+ + ♻ ☆ Predicting the duration of traffic incidents for Sydney greater + metropolitan area using machine learning methods + + +
+ This research presents a comprehensive approach to predicting the duration of
+traffic incidents and classifying them as short-term or long-term across the
+Sydney Metropolitan Area. Leveraging a dataset that encompasses detailed
+records of traffic incidents, road network characteristics, and socio-economic
+indicators, we train and evaluate a variety of advanced machine learning models
+including Gradient Boosted Decision Trees (GBDT), Random Forest, LightGBM, and
+XGBoost. The models are assessed using Root Mean Square Error (RMSE) for
+regression tasks and F1 score for classification tasks.
+  Our experimental results demonstrate that XGBoost and LightGBM outperform
+conventional models, with XGBoost achieving the lowest RMSE of 33.7 for
+predicting incident duration and the highest classification F1 score of 0.62
+for a 30-minute duration threshold. For classification, the 30-minute threshold
+balances performance with 70.84% short-term duration classification accuracy
+and 62.72% long-term duration classification accuracy. Feature importance
+analysis, employing both tree split counts and SHAP values, identifies the
+number of affected lanes, traffic volume, and types of primary and secondary
+vehicles as the most influential features.
+  The proposed methodology not only achieves high predictive accuracy but also
+provides stakeholders with vital insights into factors contributing to incident
+durations. These insights enable more informed decision-making for traffic
+management and response strategies. The code is available at:
+https://github.com/Future-Mobility-Lab/SydneyIncidents
+
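+ For readers who want a concrete picture of the setup, the fragment below
+shows the general pattern with the public xgboost scikit-learn API: one
+regressor for duration and one classifier for the 30-minute threshold, scored
+with RMSE and F1. The data, features, and hyper-parameters are fabricated
+placeholders, not the study's configuration.
+
+import numpy as np
+from xgboost import XGBRegressor, XGBClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, f1_score
+
+X, y = np.random.rand(1000, 12), np.random.rand(1000) * 120      # fake incident data
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+reg = XGBRegressor(n_estimators=300).fit(X_tr, y_tr)             # duration in minutes
+rmse = mean_squared_error(y_te, reg.predict(X_te)) ** 0.5
+
+clf = XGBClassifier(n_estimators=300).fit(X_tr, (y_tr > 30).astype(int))
+f1 = f1_score((y_te > 30).astype(int), clf.predict(X_te))        # long- vs short-term
+print(f"RMSE={rmse:.1f}  F1={f1:.2f}")
+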
+
+
+
+
+ + ♻ ☆ Language-Guided World Models: A Model-Based Approach to AI Control ACL 2024 + + +
+ This paper introduces the concept of Language-Guided World Models (LWMs) -- +probabilistic models that can simulate environments by reading texts. Agents +equipped with these models provide humans with more extensive and efficient +control, allowing them to simultaneously alter agent behaviors in multiple +tasks via natural verbal communication. In this work, we take initial steps in +developing robust LWMs that can generalize to compositionally novel language +descriptions. We design a challenging world modeling benchmark based on the +game of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that +require varying degrees of compositional generalization. Our experiments reveal +the lack of generalizability of the state-of-the-art Transformer model, as it +offers marginal improvements in simulation quality over a no-text baseline. We +devise a more robust model by fusing the Transformer with the EMMA attention +mechanism (Hanjie et al., 2021). Our model substantially outperforms the +Transformer and approaches the performance of a model with an oracle semantic +parsing and grounding capability. To demonstrate the practicality of this model +in improving AI safety and transparency, we simulate a scenario in which the +model enables an agent to present plans to a human before execution, and to +revise plans based on their language feedback. + +
+
+ comment: SpLU-RoboNLP workshop at ACL 2024 +
+
+
+
+
+ + ♻ ☆ HOPE: A Reinforcement Learning-based Hybrid Policy Path Planner for + Diverse Parking Scenarios + + +
+ Automated parking stands as a highly anticipated application of autonomous
+driving technology. However, existing path planning methodologies fall short of
+addressing this need due to their incapability to handle the diverse and
+complex parking scenarios in reality. While non-learning methods provide
+reliable planning results, they struggle in intricate scenarios, whereas
+learning-based ones are good at exploration but unstable in converging to
+feasible solutions. To leverage the strengths of both approaches, we introduce
+Hybrid pOlicy Path plannEr (HOPE). This novel solution integrates a
+reinforcement learning agent with Reeds-Shepp curves, enabling effective
+planning across diverse scenarios. HOPE guides the exploration of the
+reinforcement learning agent by applying an action mask mechanism and employs a
+transformer to integrate the perceived environmental information with the mask.
+To facilitate the training and evaluation of the proposed planner, we propose a
+criterion for categorizing the difficulty level of parking scenarios based on
+space and obstacle distribution. Experimental results demonstrate that our
+approach outperforms typical rule-based algorithms and traditional
+reinforcement learning methods, showing higher planning success rates and
+generalization across various scenarios. We also conduct real-world experiments
+to verify the practicability of HOPE. The code for our solution will be openly
+available on GitHub at https://github.com/jiamiya/HOPE.
+
+
+ comment: 10 pages, 6 tables, 5 figures, 4 page appendix +
+
+
+
+
+ + ♻ ☆ What can we learn from quantum convolutional neural networks? + + +
+ We can learn from analyzing quantum convolutional neural networks (QCNNs) +that: 1) working with quantum data can be perceived as embedding physical +system parameters through a hidden feature map; 2) their high performance for +quantum phase recognition can be attributed to generation of a very suitable +basis set during the ground state embedding, where quantum criticality of spin +models leads to basis functions with rapidly changing features; 3) pooling +layers of QCNNs are responsible for picking those basis functions that can +contribute to forming a high-performing decision boundary, and the learning +process corresponds to adapting the measurement such that few-qubit operators +are mapped to full-register observables; 4) generalization of QCNN models +strongly depends on the embedding type, and that rotation-based feature maps +with the Fourier basis require careful feature engineering; 5) accuracy and +generalization of QCNNs with readout based on a limited number of shots favor +the ground state embeddings and associated physics-informed models. We +demonstrate these points in simulation, where our results shed light on +classification for physical processes, relevant for applications in sensing. +Finally, we show that QCNNs with properly chosen ground state embeddings can be +used for fluid dynamics problems, expressing shock wave solutions with good +generalization and proven trainability. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Deep Copula-Based Survival Analysis for Dependent Censoring with + Identifiability Guarantees AAAI 2024 + + +
+ Censoring is the central problem in survival analysis where either the
+time-to-event (for instance, death) or the time-to-censoring (such as loss of
+follow-up) is observed for each sample. The majority of existing machine
+learning-based survival analysis methods assume that survival is conditionally
+independent of censoring given a set of covariates; an assumption that cannot
+be verified since only marginal distributions are available from the data. The
+existence of dependent censoring, along with the inherent bias in current
+estimators, has been demonstrated in a variety of applications, accentuating
+the need for a more nuanced approach. However, existing methods that adjust for
+dependent censoring require practitioners to specify the ground truth copula.
+This requirement poses a significant challenge for practical applications, as
+model misspecification can lead to substantial bias. In this work, we propose a
+flexible deep learning-based survival analysis method that simultaneously
+accommodates dependent censoring and eliminates the requirement for specifying
+the ground truth copula. We theoretically prove the identifiability of our
+model under a broad family of copulas and survival distributions. Experimental
+results from a wide range of datasets demonstrate that our approach
+successfully discerns the underlying dependency structure and significantly
+reduces survival estimation bias when compared to existing methods.
+
+
+ comment: To appear in AAAI 2024 +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ VCoME: Verbal Video Composition with Multimodal Editing Effects + + +
+ Verbal videos, featuring voice-overs or text overlays, provide valuable +content but present significant challenges in composition, especially when +incorporating editing effects to enhance clarity and visual appeal. In this +paper, we introduce the novel task of verbal video composition with editing +effects. This task aims to generate coherent and visually appealing verbal +videos by integrating multimodal editing effects across textual, visual, and +audio categories. To achieve this, we curate a large-scale dataset of video +effects compositions from publicly available sources. We then formulate this +task as a generative problem, involving the identification of appropriate +positions in the verbal content and the recommendation of editing effects for +these positions. To address this task, we propose VCoME, a general framework +that employs a large multimodal model to generate editing effects for video +composition. Specifically, VCoME takes in the multimodal video context and +autoregressively outputs where to apply effects within the verbal content and +which effects are most appropriate for each position. VCoME also supports +prompt-based control of composition density and style, providing substantial +flexibility for diverse applications. Through extensive quantitative and +qualitative evaluations, we clearly demonstrate the effectiveness of VCoME. A +comprehensive user study shows that our method produces videos of professional +quality while being 85$\times$ more efficient than professional editors. + +
+
+
+
+
+ + ☆ Improving Audio Generation with Visual Enhanced Caption + + +
+ Generative models have shown significant achievements in audio generation
+tasks. However, existing models struggle with complex and detailed prompts,
+leading to potential performance degradation. We hypothesize that this problem
+stems from the low quality and relatively small quantity of training data. In
+this work, we aim to create a large-scale audio dataset with rich captions for
+improving audio generation models. We develop an automated pipeline to generate
+detailed captions for audio-visual datasets by transforming predicted visual
+captions, audio captions, and tagging labels into comprehensive descriptions
+using a Large Language Model (LLM). We introduce Sound-VECaps, a dataset
+comprising 1.66M high-quality audio-caption pairs with enriched details,
+including the order of audio events, the places where they occur, and
+environment information. We demonstrate that training with Sound-VECaps
+significantly enhances the capability of text-to-audio generation models to
+comprehend and generate audio from complex input prompts, improving overall
+system performance. Furthermore, we conduct ablation studies of Sound-VECaps
+across several audio-language tasks, suggesting its potential in advancing
+audio-text representation learning. Our dataset and models are available
+online.
+
+
+ comment: 5 pages with 1 appendix +
+
+
+
+
+ + ☆ TSC-PCAC: Voxel Transformer and Sparse Convolution Based Point Cloud + Attribute Compression for 3D Broadcasting + + +
+ Point cloud has been the mainstream representation for advanced 3D
+applications, such as virtual reality and augmented reality. However, the
+massive data volume of point clouds is one of the most challenging issues for
+transmission and storage. In this paper, we propose an end-to-end voxel
+Transformer and Sparse Convolution based Point Cloud Attribute Compression
+(TSC-PCAC) for 3D broadcasting. Firstly, we present the framework of TSC-PCAC,
+which includes a Transformer and Sparse Convolutional Module (TSCM) based
+variational autoencoder and a channel context module. Secondly, we propose a
+two-stage TSCM, where the first stage focuses on modeling local dependencies
+and feature representations of the point clouds, and the second stage captures
+global features through spatial and channel pooling encompassing larger
+receptive fields. This module effectively extracts global and local inter-point
+relevance to reduce informational redundancy. Thirdly, we design a TSCM-based
+channel context module to exploit inter-channel correlations, which improves
+the predicted probability distribution of quantized latent representations and
+thus reduces the bitrate. Experimental results indicate that the proposed
+TSC-PCAC method achieves an average of 38.53%, 21.30%, and 11.19% Bjontegaard
+Delta bitrate reductions compared to the Sparse-PCAC, NF-PCAC, and G-PCC v23
+methods, respectively. The encoding/decoding time costs are reduced by up to
+97.68%/98.78% on average compared to Sparse-PCAC. The source code and the
+trained models of TSC-PCAC are available at https://github.com/igizuxo/TSC-PCAC.
+
+
+
+
+
+ + ☆ Unsupervised Video Summarization via Reinforcement Learning and a + Trained Evaluator + + +
+ This paper presents a novel approach for unsupervised video summarization +using reinforcement learning. It aims to address the existing limitations of +current unsupervised methods, including unstable training of adversarial +generator-discriminator architectures and reliance on hand-crafted reward +functions for quality evaluation. The proposed method is based on the concept +that a concise and informative summary should result in a reconstructed video +that closely resembles the original. The summarizer model assigns an importance +score to each frame and generates a video summary. In the proposed scheme, +reinforcement learning, coupled with a unique reward generation pipeline, is +employed to train the summarizer model. The reward generation pipeline trains +the summarizer to create summaries that lead to improved reconstructions. It +comprises a generator model capable of reconstructing masked frames from a +partially masked video, along with a reward mechanism that compares the +reconstructed video from the summary against the original. The video generator +is trained in a self-supervised manner to reconstruct randomly masked frames, +enhancing its ability to generate accurate summaries. This training pipeline +results in a summarizer model that better mimics human-generated video +summaries compared to methods relying on hand-crafted rewards. The training +process consists of two stable and isolated training steps, unlike adversarial +architectures. Experimental results demonstrate promising performance, with +F-scores of 62.3 and 54.5 on TVSum and SumMe datasets, respectively. +Additionally, the inference stage is 300 times faster than our previously +reported state-of-the-art method. + +
+
+
+
+
+ + ♻ ☆ EasyAnimate: A High-Performance Long Video Generation Method based on + Transformer Architecture + + +
+ This paper presents EasyAnimate, an advanced method for video generation that +leverages the power of transformer architecture for high-performance outcomes. +We have expanded the DiT framework originally designed for 2D image synthesis +to accommodate the complexities of 3D video generation by incorporating a +motion module block. It is used to capture temporal dynamics, thereby ensuring +the production of consistent frames and seamless motion transitions. The motion +module can be adapted to various DiT baseline methods to generate video with +different styles. It can also generate videos with different frame rates and +resolutions during both training and inference phases, suitable for both images +and videos. Moreover, we introduce slice VAE, a novel approach to condense the +temporal axis, facilitating the generation of long duration videos. Currently, +EasyAnimate exhibits the proficiency to generate videos with 144 frames. We +provide a holistic ecosystem for video production based on DiT, encompassing +aspects such as data pre-processing, VAE training, DiT models training (both +the baseline model and LoRA model), and end-to-end video inference. Code is +available at: https://github.com/aigc-apps/EasyAnimate. We are continuously +working to enhance the performance of our method. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 86 + +
+
+
+ + ☆ HAF-RM: A Hybrid Alignment Framework for Reward Model Training + + +
+ The reward model has become increasingly important in alignment, assessment,
+and data construction for large language models (LLMs). Most existing research
+focuses on enhancing reward models through data improvements, following the
+conventional training framework that directly optimizes the predicted rewards.
+In this paper, we propose a hybrid alignment framework HaF-RM for reward model
+training by introducing an additional constraint on token-level policy
+probabilities in addition to the reward score. It can simultaneously supervise
+the internal preference model at the token level and optimize the mapping layer
+of the reward model at the sequence level. Theoretical justifications and
+experiment results on five datasets show the validity and effectiveness of our
+proposed hybrid framework for training a high-quality reward model. By
+decoupling the reward modeling procedure and incorporating hybrid supervision,
+our HaF-RM framework offers a principled and effective approach to enhancing
+the performance and alignment of reward models, a critical component in the
+responsible development of powerful language models. We release our code at
+https://haf-rm.github.io.
+
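+ As a rough sketch of what a hybrid objective of this kind can look like, the
+snippet below combines a sequence-level Bradley-Terry reward loss with a
+token-level term on policy log-probabilities (written here as a DPO-style
+surrogate). The exact form, the weights lam and beta, and the tensor layout
+are guesses made for illustration; they are not taken from the released
+HaF-RM code.
+
+import torch
+import torch.nn.functional as F
+
+def hybrid_rm_loss(reward_chosen, reward_rejected,
+                   logp_chosen_tokens, logp_rejected_tokens, lam=0.5, beta=0.1):
+    # sequence-level reward loss on the preferred vs. dispreferred response
+    reward_loss = -F.logsigmoid(reward_chosen - reward_rejected).mean()
+    # token-level policy term: preference margin on summed per-token log-probs
+    margin = beta * (logp_chosen_tokens.sum(-1) - logp_rejected_tokens.sum(-1))
+    policy_loss = -F.logsigmoid(margin).mean()
+    return reward_loss + lam * policy_loss
+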
+
+
+
+
+ + ☆ Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality + Norms + + +
+ Large language models (LLMs) are trained on broad corpora and then used in +communities with specialized norms. Is providing LLMs with community rules +enough for models to follow these norms? We evaluate LLMs' capacity to detect +(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's +Neutral Point of View (NPOV) policy. LLMs struggled with bias detection, +achieving only 64% accuracy on a balanced dataset. Models exhibited contrasting +biases (some under- and others over-predicted bias), suggesting distinct priors +about neutrality. LLMs performed better at generation, removing 79% of words +removed by Wikipedia editors. However, LLMs made additional changes beyond +Wikipedia editors' simpler neutralizations, resulting in high-recall but +low-precision editing. Interestingly, crowdworkers rated AI rewrites as more +neutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative +analysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia +editors but often made extraneous non-NPOV-related changes (such as grammar). +LLMs may apply rules in ways that resonate with the public but diverge from +community experts. While potentially effective for generation, LLMs may reduce +editor agency and increase moderation workload (e.g., verifying additions). +Even when rules are easy to articulate, having LLMs apply them like community +members may still be difficult. + +
+
+
+
+
+ + ☆ Orchestrating LLMs with Different Personalizations + + +
+ This paper presents a novel approach to aligning large language models (LLMs) +with individual human preferences, sometimes referred to as Reinforcement +Learning from \textit{Personalized} Human Feedback (RLPHF). Given stated +preferences along multiple dimensions, such as helpfulness, conciseness, or +humor, the goal is to create an LLM without re-training that best adheres to +this specification. Starting from specialized expert LLMs, each trained for one +such particular preference dimension, we propose a black-box method that merges +their outputs on a per-token level. We train a lightweight Preference Control +Model (PCM) that dynamically translates the preference description and current +context into next-token prediction weights. By combining the expert models' +outputs at the token level, our approach dynamically generates text that +optimizes the given preference. Empirical tests show that our method matches or +surpasses existing preference merging techniques, providing a scalable, +efficient alternative to fine-tuning LLMs for individual personalization. + +
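+ The per-token merging step can be pictured with the small helper below: each
+expert proposes a next-token distribution, and a weight vector (which, in the
+setup described above, would come from the preference control model) mixes
+them before decoding. The greedy decoding and the pcm(...) call are placeholder
+choices for this sketch.
+
+import torch
+
+def merged_next_token(expert_logits, weights):
+    # expert_logits: list of (vocab,) logit tensors, one per expert; weights sum to 1
+    probs = torch.stack([torch.softmax(l, dim=-1) for l in expert_logits])
+    mixed = (weights.unsqueeze(-1) * probs).sum(dim=0)    # weighted mixture over experts
+    return torch.argmax(mixed)                            # greedy pick for simplicity
+
+# weights = pcm(preference_description, context)          # hypothetical PCM interface
+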
+
+
+
+
+ + ☆ Defense Against Syntactic Textual Backdoor Attacks with Token + Substitution + + +
+ Textual backdoor attacks present a substantial security risk to Large
+Language Models (LLMs). Such an attack embeds carefully chosen triggers into a
+victim model at the training stage and makes the model erroneously predict
+inputs containing the same triggers as a certain class. Prior backdoor defense
+methods primarily target special token-based triggers, leaving syntax-based
+triggers insufficiently addressed. To fill this gap, this paper proposes a
+novel online defense algorithm that effectively counters syntax-based as well
+as special token-based backdoor attacks. The algorithm replaces semantically
+meaningful words in sentences with entirely different ones but preserves the
+syntactic templates or special tokens, and then compares the predicted labels
+before and after the substitution to determine whether a sentence contains
+triggers. Experimental results confirm the algorithm's performance against
+these two types of triggers, offering a comprehensive defense strategy for
+model integrity.
+
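+ A toy version of the substitution test described above is easy to write down:
+replace content words with unrelated ones while keeping the sentence skeleton,
+and flag the input if the classifier's label survives the substitutions (a hint
+that a syntactic template or special token, rather than the meaning, drives the
+prediction). The classify callable, POS tags, and vocabulary are placeholders
+for this sketch, not the paper's implementation.
+
+import random
+
+CONTENT_POS = {"NOUN", "VERB", "ADJ", "ADV"}
+
+def substitution_check(tokens, pos_tags, classify, vocabulary, n_rounds=3):
+    original_label = classify(" ".join(tokens))
+    agree = 0
+    for _ in range(n_rounds):
+        swapped = [random.choice(vocabulary) if pos in CONTENT_POS else tok
+                   for tok, pos in zip(tokens, pos_tags)]
+        agree += classify(" ".join(swapped)) == original_label
+    # a benign input should change label once its content words are destroyed
+    return agree == n_rounds          # True -> likely contains a trigger
+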
+
+
+
+
+ + ☆ ELCC: the Emergent Language Corpus Collection + + +
+ We introduce the Emergent Language Corpus Collection (ELCC): a collection of +corpora collected from open source implementations of emergent communication +systems across the literature. These systems include a variety of signalling +game environments as well as more complex tasks like a social deduction game +and embodied navigation. Each corpus is annotated with metadata describing the +characteristics of the source system as well as a suite of analyses of the +corpus (e.g., size, entropy, average message length). Currently, research +studying emergent languages requires directly running different systems which +takes time away from actual analyses of such languages, limits the variety of +languages that are studied, and presents a barrier to entry for researchers +without a background in deep learning. The availability of a substantial +collection of well-documented emergent language corpora, then, will enable new +directions of research which focus their purview on the properties of emergent +languages themselves rather than on experimental apparatus. + +
+
+ comment: 18 pages, 3 figures +
+
+
+
+
+ + ☆ Securing Multi-turn Conversational Language Models Against Distributed + Backdoor Triggers EMNLP 2024 + + +
+ The security of multi-turn conversational large language models (LLMs) is
+understudied despite multi-turn dialogue being one of the most popular ways
+LLMs are used. Specifically, LLMs are vulnerable to data poisoning backdoor
+attacks, where an adversary manipulates the training data to cause the model to
+output malicious responses to predefined triggers. Specific to the multi-turn
+dialogue setting, LLMs are at risk of even more harmful and stealthy backdoor
+attacks where the backdoor triggers may span across multiple utterances, giving
+leeway to context-driven attacks. In this paper, we explore a novel distributed
+backdoor trigger attack that serves as an extra tool in an adversary's toolbox
+and can interface with other single-turn attack strategies in a plug-and-play
+manner. Results on two representative defense mechanisms indicate that
+distributed backdoor triggers are robust against existing defense strategies
+designed for single-turn user-model interactions, motivating us to propose a
+new defense strategy for the more challenging multi-turn dialogue setting. To
+this end, we also explore a novel contrastive decoding based defense that is
+able to mitigate the backdoor at a low computational cost.
+
+
+ comment: Submitted to EMNLP 2024 +
+
+
+
+
+ + ☆ Towards Automating Text Annotation: A Case Study on Semantic Proximity + Annotation using GPT-4 + + +
+ This paper explores using GPT-3.5 and GPT-4 to automate the data annotation +process with automatic prompting techniques. The main aim of this paper is to +reuse human annotation guidelines along with some annotated data to design +automatic prompts for LLMs, focusing on the semantic proximity annotation task. +Automatic prompts are compared to customized prompts. We further implement the +prompting strategies into an open-source text annotation tool, enabling easy +online use via the OpenAI API. Our study reveals the crucial role of accurate +prompt design and suggests that prompting GPT-4 with human-like instructions is +not straightforwardly possible for the semantic proximity task. We show that +small modifications to the human guidelines already improve the performance, +suggesting possible ways for future research. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Query-Guided Self-Supervised Summarization of Nursing Notes + + +
+ Nursing notes, an important component of Electronic Health Records (EHRs), +keep track of the progression of a patient's health status during a care +episode. Distilling the key information in nursing notes through text +summarization techniques can improve clinicians' efficiency in understanding +patients' conditions when reviewing nursing notes. However, existing +abstractive summarization methods in the clinical setting have often overlooked +nursing notes and require the creation of reference summaries for supervision +signals, which is time-consuming. In this work, we introduce QGSumm, a +query-guided self-supervised domain adaptation framework for nursing note +summarization. Using patient-related clinical queries as guidance, our approach +generates high-quality, patient-centered summaries without relying on reference +summaries for training. Through automatic and manual evaluation by an expert +clinician, we demonstrate the strengths of our approach compared to the +state-of-the-art Large Language Models (LLMs) in both zero-shot and few-shot +settings. Ultimately, our approach provides a new perspective on conditional +text summarization, tailored to the specific interests of clinical personnel. + +
+
+
+
+
+ + ☆ Hallucination Detection: Robustly Discerning Reliable Answers in Large + Language Models CIKM 2023 + + +
+ Large Language Models (LLMs) have gained widespread adoption in various +natural language processing tasks, including question answering and dialogue +systems. However, a major drawback of LLMs is the issue of hallucination, where +they generate unfaithful or inconsistent content that deviates from the input +source, leading to severe consequences. In this paper, we propose a robust +discriminator named RelD to effectively detect hallucination in LLMs' generated +answers. RelD is trained on the constructed RelQA, a bilingual +question-answering dialogue dataset along with answers generated by LLMs and a +comprehensive set of metrics. Our experimental results demonstrate that the +proposed RelD successfully detects hallucination in the answers generated by +diverse LLMs. Moreover, it performs well in distinguishing hallucination in +LLMs' generated answers from both in-distribution and out-of-distribution +datasets. Additionally, we also conduct a thorough analysis of the types of +hallucinations that occur and present valuable insights. This research +significantly contributes to the detection of reliable answers generated by +LLMs and holds noteworthy implications for mitigating hallucination in the +future work. + +
+
+ comment: Accepted to CIKM 2023 (Long Paper) +
+
+
+
+
+ + ☆ MAPO: Boosting Large Language Model Performance with Model-Adaptive + Prompt Optimization EMNLP 2023 + + +
+ Prompt engineering, as an efficient and effective way to leverage Large
+Language Models (LLMs), has drawn a lot of attention from the research
+community. The existing research primarily emphasizes the importance of
+adapting prompts to specific tasks, rather than specific LLMs. However, a good
+prompt is not solely defined by its wording, but also binds to the nature of
+the LLM in question. In this work, we first quantitatively demonstrate that
+different prompts should be adapted to different LLMs to enhance their
+capabilities across various downstream tasks in NLP. We then propose a novel
+model-adaptive prompt optimizer (MAPO) method that optimizes the original
+prompts for each specific LLM in downstream tasks. Extensive experiments
+indicate that the proposed method can effectively refine prompts for an LLM,
+leading to significant improvements across various downstream tasks.
+
+
+ comment: Accepted to EMNLP 2023 (Findings) +
+
+
+
+
+ + ☆ MiniGPT-Med: Large Language Model as a General Interface for Radiology + Diagnosis + + +
+ Recent advancements in artificial intelligence (AI) have precipitated +significant breakthroughs in healthcare, particularly in refining diagnostic +procedures. However, previous studies have often been constrained to limited +functionalities. This study introduces MiniGPT-Med, a vision-language model +derived from large-scale language models and tailored for medical applications. +MiniGPT-Med demonstrates remarkable versatility across various imaging +modalities, including X-rays, CT scans, and MRIs, enhancing its utility. The +model is capable of performing tasks such as medical report generation, visual +question answering (VQA), and disease identification within medical imagery. +Its integrated processing of both image and textual clinical data markedly +improves diagnostic accuracy. Our empirical assessments confirm MiniGPT-Med's +superior performance in disease grounding, medical report generation, and VQA +benchmarks, representing a significant step towards reducing the gap in +assisting radiology practice. Furthermore, it achieves state-of-the-art +performance on medical report generation, higher than the previous best model +by 19\% accuracy. MiniGPT-Med promises to become a general interface for +radiology diagnoses, enhancing diagnostic efficiency across a wide range of +medical imaging applications. + +
+
+
+
+
+ + ☆ Can Pre-trained Language Models Understand Chinese Humor? WSDM 2022 + + +
+ Humor understanding is an important and challenging research problem in
+natural language processing. With the rise of pre-trained language models
+(PLMs), some recent work has made preliminary attempts to adopt PLMs for humor
+recognition and generation. However, these simple attempts do not substantially
+answer the question: whether PLMs are capable of humor understanding? This
+paper is the first work that systematically investigates the humor
+understanding ability of PLMs. For this purpose, a comprehensive framework with
+three evaluation steps and four evaluation tasks is designed. We also construct
+a comprehensive Chinese humor dataset, which can fully meet all the data
+requirements of the proposed evaluation framework. Our empirical study on the
+Chinese humor dataset yields some valuable observations, which are of great
+guiding value for future optimization of PLMs in humor understanding and
+generation.
+
+
+ comment: Accepted to WSDM 2022 +
+
+
+
+
+ + ☆ Stephanie: Step-by-Step Dialogues for Mimicking Human Interactions in + Social Conversations + + +
+ In the rapidly evolving field of natural language processing, dialogue
+systems primarily employ a single-step dialogue paradigm. Although this
+paradigm is efficient, it lacks the depth and fluidity of human interactions
+and does not appear natural. We introduce a novel Step-by-Step Dialogue
+Paradigm (Stephanie), designed to mimic the ongoing dynamic nature of human
+conversations. By employing a dual learning strategy and a further-split
+post-editing method, we generated and utilized a high-quality step-by-step
+dialogue dataset to fine-tune existing large language models, enabling them to
+perform step-by-step dialogues. We present Stephanie in detail and conduct
+tailored automatic and human evaluations to assess its effectiveness compared
+to the traditional single-step dialogue paradigm. We will release the code,
+Stephanie datasets, and Stephanie LLMs to facilitate future research in the
+era of chatbots.
+
+
+
+
+
+ + ☆ AXOLOTL'24 Shared Task on Multilingual Explainable Semantic Change + Modeling ACL'24 + + +
+ This paper describes the organization and findings of AXOLOTL'24, the first +multilingual explainable semantic change modeling shared task. We present new +sense-annotated diachronic semantic change datasets for Finnish and Russian +which were employed in the shared task, along with a surprise test-only German +dataset borrowed from an existing source. The setup of AXOLOTL'24 is new to the +semantic change modeling field, and involves subtasks of identifying unknown +(novel) senses and providing dictionary-like definitions to these senses. The +methods of the winning teams are described and compared, thus paving a path +towards explainability in computational approaches to historical change of +meaning. + +
+
+ comment: Proceedings of the 5th Workshop on Computational Approaches to + Historical Language Change (ACL'24) +
+
+
+
+
+ + ☆ DotaMath: Decomposition of Thought with Code Assistance and + Self-correction for Mathematical Reasoning + + +
+ Large language models (LLMs) have made impressive progress in handling simple +math problems, yet they still struggle with more challenging and complex +mathematical tasks. In this paper, we introduce a series of LLMs that employs +the Decomposition of thought with code assistance and self-correction for +mathematical reasoning, dubbed as DotaMath. DotaMath models tackle complex +mathematical tasks by decomposing them into simpler logical subtasks, +leveraging code to solve these subtasks, obtaining fine-grained feedback from +the code interpreter, and engaging in self-reflection and correction. By +annotating diverse interactive tool-use trajectories and employing query +evolution on GSM8K and MATH datasets, we generate an instruction fine-tuning +dataset called DotaMathQA with 574K query-response pairs. We train a series of +base LLMs using imitation learning on DotaMathQA, resulting in DotaMath models +that achieve remarkable performance compared to open-source LLMs across various +in-domain and out-of-domain benchmarks. Notably, DotaMath-deepseek-7B showcases +an outstanding performance of 64.8% on the competitive MATH dataset and 86.7% +on GSM8K. Besides, DotaMath-deepseek-7B maintains strong competitiveness on a +series of in-domain and out-of-domain benchmarks (Avg. 80.1%). Looking forward, +we anticipate that the DotaMath paradigm will open new pathways for addressing +intricate mathematical problems. Our code is publicly available at +https://github.com/ChengpengLi1003/DotaMath. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Systematic Survey and Critical Review on Evaluating Large Language + Models: Challenges, Limitations, and Recommendations + + +
+ Large Language Models (LLMs) have recently gained significant attention due +to their remarkable capabilities in performing diverse tasks across various +domains. However, a thorough evaluation of these models is crucial before +deploying them in real-world applications to ensure they produce reliable +performance. Despite the well-established importance of evaluating LLMs in the +community, the complexity of the evaluation process has led to varied +evaluation setups, causing inconsistencies in findings and interpretations. To +address this, we systematically review the primary challenges and limitations +causing these inconsistencies and unreliable evaluations in various steps of +LLM evaluation. Based on our critical review, we present our perspectives and +recommendations to ensure LLM evaluations are reproducible, reliable, and +robust. + +
+
+
+
+
+ + ☆ Semantic Graphs for Syntactic Simplification: A Revisit from the Age of + LLM ACL 2024 + + +
+ Symbolic sentence meaning representations, such as AMR (Abstract Meaning +Representation) provide expressive and structured semantic graphs that act as +intermediates that simplify downstream NLP tasks. However, the +instruction-following capability of large language models (LLMs) offers a +shortcut to effectively solve NLP tasks, questioning the utility of semantic +graphs. Meanwhile, recent work has also shown the difficulty of using meaning +representations merely as a helpful auxiliary for LLMs. We revisit the position +of semantic graphs in syntactic simplification, the task of simplifying +sentence structures while preserving their meaning, which requires semantic +understanding, and evaluate it on a new complex and natural dataset. The +AMR-based method that we propose, AMRS$^3$, demonstrates that state-of-the-art +meaning representations can lead to easy-to-implement simplification methods +with competitive performance and unique advantages in cost, interpretability, +and generalization. With AMRS$^3$ as an anchor, we discover that syntactic +simplification is a task where semantic graphs are helpful in LLM prompting. We +propose AMRCoC prompting that guides LLMs to emulate graph algorithms for +explicit symbolic reasoning on AMR graphs, and show its potential for improving +LLM on semantic-centered tasks like syntactic simplification. + +
+
+ comment: Accepted at TextGraphs-17 @ ACL 2024 +
+
+
+
+
+ + ☆ Deep Content Understanding Toward Entity and Aspect Target Sentiment + Analysis on Foundation Models + + +
+ We introduce Entity-Aspect Sentiment Triplet Extraction (EASTE), a novel
+Aspect-Based Sentiment Analysis (ABSA) task which extends
+Target-Aspect-Sentiment Detection (TASD) by separating aspect categories (e.g.,
+food#quality) into pre-defined entities (e.g., meal, drink) and aspects (e.g.,
+taste, freshness). This adds a finer-grained level of complexity, yet helps
+expose the true sentiment of a chained aspect with respect to its entity. We
+explore the EASTE-solving capabilities of transformer-based language models,
+ranging from our proposed unified-loss token-classification approach using the
+BERT architecture to text-generative models such as Flan-T5, Flan-UL2, Llama2,
+Llama3, and Mixtral, employing different alignment techniques such as
+zero-/few-shot learning and Parameter-Efficient Fine-Tuning (PEFT) such as
+Low-Rank Adaptation (LoRA). Model performance is evaluated on the SemEval-2016
+benchmark dataset, enabling a fair comparison with existing work. Our research
+not only aims to achieve high performance on the EASTE task but also
+investigates the impact of model size, type, and adaptation techniques on task
+performance. Ultimately, we provide detailed insights and achieve
+state-of-the-art results in complex sentiment analysis.
+
+
+ comment: Proceedings of the 41 st International Conference on Machine + Learning, Vienna, Austria. PMLR 235, 2024. Copyright 2024 by the author(s) +
+
+
+
+
+ + ☆ Improving Accented Speech Recognition using Data Augmentation based on + Unsupervised Text-to-Speech Synthesis + + +
+ This paper investigates the use of unsupervised text-to-speech synthesis +(TTS) as a data augmentation method to improve accented speech recognition. TTS +systems are trained with a small amount of accented speech training data and +their pseudo-labels rather than manual transcriptions, and hence unsupervised. +This approach enables the use of accented speech data without manual +transcriptions to perform data augmentation for accented speech recognition. +Synthetic accented speech data, generated from text prompts by using the TTS +systems, are then combined with available non-accented speech data to train +automatic speech recognition (ASR) systems. ASR experiments are performed in a +self-supervised learning framework using a Wav2vec2.0 model which was +pre-trained on large amount of unsupervised accented speech data. The accented +speech data for training the unsupervised TTS are read speech, selected from +L2-ARCTIC and British Isles corpora, while spontaneous conversational speech +from the Edinburgh international accents of English corpus are used as the +evaluation data. Experimental results show that Wav2vec2.0 models which are +fine-tuned to downstream ASR task with synthetic accented speech data, +generated by the unsupervised TTS, yield up to 6.1% relative word error rate +reductions compared to a Wav2vec2.0 baseline which is fine-tuned with the +non-accented speech data from Librispeech corpus. + +
+
+ comment: Accepted to EUSIPCO 2024 +
+
+
+
+
+ + ☆ Systematic Task Exploration with LLMs: A Study in Citation Text + Generation ACL 2024 + + +
+ Large language models (LLMs) bring unprecedented flexibility in defining and +executing complex, creative natural language generation (NLG) tasks. Yet, this +flexibility brings new challenges, as it introduces new degrees of freedom in +formulating the task inputs and instructions and in evaluating model +performance. To facilitate the exploration of creative NLG tasks, we propose a +three-component research framework that consists of systematic input +manipulation, reference data, and output measurement. We use this framework to +explore citation text generation -- a popular scholarly NLP task that lacks +consensus on the task definition and evaluation metric and has not yet been +tackled within the LLM paradigm. Our results highlight the importance of +systematically investigating both task instruction and input configuration when +prompting LLMs, and reveal non-trivial relationships between different +evaluation metrics used for citation text generation. Additional human +generation and human evaluation experiments provide new qualitative insights +into the task to guide future research in citation text generation. We make our +code and data publicly available. + +
+
+ comment: Accepted to ACL 2024 (Main) +
+
+
+
+
+ + ☆ LLMAEL: Large Language Models are Good Context Augmenters for Entity + Linking + + +
+ Entity Linking (EL) models are well-trained at mapping mentions to their
+corresponding entities according to a given context. However, EL models
+struggle to disambiguate long-tail entities due to their limited training data.
+Meanwhile, large language models (LLMs) are more robust at interpreting
+uncommon mentions. Yet, due to a lack of specialized training, LLMs struggle to
+generate correct entity IDs. Furthermore, training an LLM to perform EL is
+cost-intensive. Building upon these insights, we introduce LLM-Augmented Entity
+Linking (LLMAEL), a plug-and-play approach to enhance entity linking through
+LLM data augmentation. We leverage LLMs as knowledgeable context augmenters,
+generating mention-centered descriptions as additional input, while preserving
+traditional EL models for task-specific processing. Experiments on 6 standard
+datasets show that the vanilla LLMAEL outperforms baseline EL models in most
+cases, while the fine-tuned LLMAEL sets new state-of-the-art results across all
+6 benchmarks.
+
+
+
+
+
+ + ☆ Exploring Diachronic and Diatopic Changes in Dialect Continua: Tasks, + Datasets and Challenges + + +
+ Everlasting contact between language communities leads to constant changes in +languages over time, and gives rise to language varieties and dialects. +However, the communities speaking non-standard language are often overlooked by +non-inclusive NLP technologies. Recently, there has been a surge of interest in +studying diatopic and diachronic changes in dialect NLP, but there is currently +no research exploring the intersection of both. Our work aims to fill this gap +by systematically reviewing diachronic and diatopic papers from a unified +perspective. In this work, we critically assess nine tasks and datasets across +five dialects from three language families (Slavic, Romance, and Germanic) in +both spoken and written modalities. The tasks covered are diverse, including +corpus construction, dialect distance estimation, and dialect geolocation +prediction, among others. Moreover, we outline five open challenges regarding +changes in dialect use over time, the reliability of dialect datasets, the +importance of speaker characteristics, limited coverage of dialects, and +ethical considerations in data collection. We hope that our work sheds light on +future research towards inclusive computational methods and datasets for +language varieties and dialects. + +
+
+ comment: LChange24 Camera Ready +
+
+
+
+
+ + ☆ Unlocking the Potential of Model Merging for Low-Resource Languages + + +
+ Adapting large language models (LLMs) to new languages typically involves +continual pre-training (CT) followed by supervised fine-tuning (SFT). However, +this CT-then-SFT approach struggles with limited data in the context of +low-resource languages, failing to balance language modeling and task-solving +capabilities. We thus propose model merging as an alternative for low-resource +languages, combining models with distinct capabilities into a single model +without additional training. We use model merging to develop task-solving LLMs +for low-resource languages without SFT data in the target languages. Our +experiments based on Llama-2-7B demonstrate that model merging effectively +endows LLMs for low-resource languages with task-solving abilities, +outperforming CT-then-SFT in scenarios with extremely scarce data. Observing +performance saturation in model merging with more training tokens, we further +analyze the merging process and introduce a slack variable to the model merging +algorithm to mitigate the loss of important parameters, thereby enhancing +performance. We hope that model merging can benefit more human languages +suffering from data scarcity with its higher data efficiency. + +
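+ For intuition, plain weight-space merging of two checkpoints can be written in
+one line; the paper's slack-variable variant is more involved, so the snippet
+below is only a baseline sketch under the assumption that both models share an
+architecture:
+```python
+import torch
+
+def merge_state_dicts(sd_a, sd_b, alpha=0.5):
+    # Linearly interpolate matching parameters from two fine-tuned checkpoints.
+    return {k: alpha * sd_a[k] + (1.0 - alpha) * sd_b[k] for k in sd_a}
+
+# e.g. merged = merge_state_dicts(language_model.state_dict(), task_model.state_dict())
+```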
+
+
+
+
+ + ☆ A Survey on Natural Language Counterfactual Generation + + +
+ Natural language counterfactual generation aims to minimally modify a given
+text such that the modified text will be classified into a different class. The
+generated counterfactuals provide insight into the reasoning behind a model's
+predictions by highlighting which words significantly influence the outcomes.
+Additionally, they can be used to detect model fairness issues or to augment the
+training data to enhance the model's robustness. A substantial amount of research
+has been conducted on generating counterfactuals for various NLP tasks, employing
+different models and methodologies. With the rapid growth of studies in this
+field, a systematic review is crucial to guide future researchers and developers.
+To bridge this gap, this survey comprehensively overviews textual counterfactual
+generation methods, particularly including those based on Large Language Models.
+We propose a new taxonomy that categorizes the generation methods into four
+groups and systematically summarize the metrics for evaluating generation
+quality. Finally, we discuss ongoing research challenges and outline promising
+directions for future work.
+
+
+ comment: A survey paper +
+
+
+
+
+ + ☆ Benchmarking Complex Instruction-Following with Multiple Constraints + Composition + + +
+ Instruction following is one of the fundamental capabilities of large +language models (LLMs). As the ability of LLMs is constantly improving, they +have been increasingly applied to deal with complex human instructions in +real-world scenarios. Therefore, how to evaluate the ability of complex +instruction-following of LLMs has become a critical research problem. Existing +benchmarks mainly focus on modeling different types of constraints in human +instructions while neglecting the composition of different constraints, which +is an indispensable constituent in complex instructions. To this end, we +propose ComplexBench, a benchmark for comprehensively evaluating the ability of +LLMs to follow complex instructions composed of multiple constraints. We +propose a hierarchical taxonomy for complex instructions, including 4 +constraint types, 19 constraint dimensions, and 4 composition types, and +manually collect a high-quality dataset accordingly. To make the evaluation +reliable, we augment LLM-based evaluators with rules to effectively verify +whether generated texts can satisfy each constraint and composition. +Furthermore, we obtain the final evaluation score based on the dependency +structure determined by different composition types. ComplexBench identifies +significant deficiencies in existing LLMs when dealing with complex +instructions with multiple constraints composition. + +
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ☆ LLM Roleplay: Simulating Human-Chatbot Interaction + + +
+ The development of chatbots requires collecting a large number of +human-chatbot dialogues to reflect the breadth of users' sociodemographic +backgrounds and conversational goals. However, the resource requirements to +conduct the respective user studies can be prohibitively high and often only +allow for a narrow analysis of specific dialogue goals and participant +demographics. In this paper, we propose LLM-Roleplay: a goal-oriented, +persona-based method to automatically generate diverse multi-turn dialogues +simulating human-chatbot interaction. LLM-Roleplay can be applied to generate +dialogues with any type of chatbot and uses large language models (LLMs) to +play the role of textually described personas. To validate our method we +collect natural human-chatbot dialogues from different sociodemographic groups +and conduct a human evaluation to compare real human-chatbot dialogues with our +generated dialogues. We compare the abilities of state-of-the-art LLMs in +embodying personas and holding a conversation and find that our method can +simulate human-chatbot dialogues with a high indistinguishability rate. + +
+
+
+
+
+ + ☆ Investigating the Role of Instruction Variety and Task Difficulty in + Robotic Manipulation Tasks + + +
+ Evaluating the generalisation capabilities of multimodal models based solely +on their performance on out-of-distribution data fails to capture their true +robustness. This work introduces a comprehensive evaluation framework that +systematically examines the role of instructions and inputs in the +generalisation abilities of such models, considering architectural design, +input perturbations across language and vision modalities, and increased task +complexity. The proposed framework uncovers the resilience of multimodal models +to extreme instruction perturbations and their vulnerability to observational +changes, raising concerns about overfitting to spurious correlations. By +employing this evaluation framework on current Transformer-based multimodal +models for robotic manipulation tasks, we uncover limitations and suggest +future advancements should focus on architectural and training innovations that +better integrate multimodal inputs, enhancing a model's generalisation prowess +by prioritising sensitivity to input content over incidental correlations. + +
+
+
+
+
+ + ☆ Improving Sample Efficiency of Reinforcement Learning with Background + Knowledge from Large Language Models + + +
+ Low sample efficiency is an enduring challenge of reinforcement learning +(RL). With the advent of versatile large language models (LLMs), recent works +impart common-sense knowledge to accelerate policy learning for RL processes. +However, we note that such guidance is often tailored for one specific task but +loses generalizability. In this paper, we introduce a framework that harnesses +LLMs to extract background knowledge of an environment, which contains general +understandings of the entire environment, making various downstream RL tasks +benefit from one-time knowledge representation. We ground LLMs by feeding a few +pre-collected experiences and requesting them to delineate background knowledge +of the environment. Afterward, we represent the output knowledge as potential +functions for potential-based reward shaping, which has a good property for +maintaining policy optimality from task rewards. We instantiate three variants +to prompt LLMs for background knowledge, including writing code, annotating +preferences, and assigning goals. Our experiments show that these methods +achieve significant sample efficiency improvements in a spectrum of downstream +tasks from Minigrid and Crafter domains. + +
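+ The potential-based shaping step mentioned above is compact enough to spell
+out; the potential function is assumed to come from the LLM-produced background
+knowledge (code, preferences, or goals) and is stubbed here with a hypothetical
+distance-to-goal score:
+```python
+def shaped_reward(task_reward, state, next_state, potential, gamma=0.99):
+    """Potential-based reward shaping: r' = r + gamma * Phi(s') - Phi(s),
+    a form known to preserve the optimal policy of the task reward."""
+    return task_reward + gamma * potential(next_state) - potential(state)
+
+# Hypothetical LLM-derived potential: closer to the goal means a higher potential.
+potential = lambda s: -abs(s["distance_to_goal"])
+print(shaped_reward(0.0, {"distance_to_goal": 5}, {"distance_to_goal": 4}, potential))
+```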
+
+
+
+
+ + ☆ LLM-jp: A Cross-organizational Project for the Research and Development + of Fully Open Japanese LLMs + + +
+ This paper introduces LLM-jp, a cross-organizational project for the research +and development of Japanese large language models (LLMs). LLM-jp aims to +develop open-source and strong Japanese LLMs, and as of this writing, more than +1,500 participants from academia and industry are working together for this +purpose. This paper presents the background of the establishment of LLM-jp, +summaries of its activities, and technical reports on the LLMs developed by +LLM-jp. For the latest activities, visit https://llm-jp.nii.ac.jp/en/. + +
+
+
+
+
+ + ☆ Stark: Social Long-Term Multi-Modal Conversation with Persona + Commonsense Knowledge + + +
+ Humans share a wide variety of images related to their personal experiences +within conversations via instant messaging tools. However, existing works focus +on (1) image-sharing behavior in singular sessions, leading to limited +long-term social interaction, and (2) a lack of personalized image-sharing +behavior. In this work, we introduce Stark, a large-scale long-term multi-modal +conversation dataset that covers a wide range of social personas in a +multi-modality format, time intervals, and images. To construct Stark +automatically, we propose a novel multi-modal contextualization framework, Mcu, +that generates long-term multi-modal dialogue distilled from ChatGPT and our +proposed Plan-and-Execute image aligner. Using our Stark, we train a +multi-modal conversation model, Ultron 7B, which demonstrates impressive visual +imagination ability. Furthermore, we demonstrate the effectiveness of our +dataset in human evaluation. We make our source code and dataset publicly +available. + +
+
+ comment: Project website: https://stark-dataset.github.io +
+
+
+
+
+ + ☆ Solving Zebra Puzzles Using Constraint-Guided Multi-Agent Systems + + +
+ Prior research has enhanced the ability of Large Language Models (LLMs) to +solve logic puzzles using techniques such as chain-of-thought prompting or +introducing a symbolic representation. These frameworks are still usually +insufficient to solve complicated logical problems, such as Zebra puzzles, due +to the inherent complexity of translating natural language clues into logical +statements. We introduce a multi-agent system, ZPS, that integrates LLMs with +an off the shelf theorem prover. This system tackles the complex puzzle-solving +task by breaking down the problem into smaller, manageable parts, generating +SMT (Satisfiability Modulo Theories) code to solve them with a theorem prover, +and using feedback between the agents to repeatedly improve their answers. We +also introduce an automated grid puzzle grader to assess the correctness of our +puzzle solutions and show that the automated grader is reliable by evaluating +it in a user-study. Our approach shows improvement in all three LLMs we tested, +with GPT-4 showing 166% improvement in the number of fully correct solutions. + +
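+ To illustrate only the final solving step: in the system described above the
+SMT constraints are produced by LLM agents from natural-language clues, whereas
+in the toy sketch below two made-up clues are encoded by hand and handed to the
+z3 theorem prover (requires the z3-solver package):
+```python
+from z3 import Int, Solver, Distinct, sat
+
+tea, coffee, milk = Int("tea"), Int("coffee"), Int("milk")  # house positions 1..3
+s = Solver()
+s.add(Distinct(tea, coffee, milk))
+for drink in (tea, coffee, milk):
+    s.add(drink >= 1, drink <= 3)
+s.add(tea == 1)                # made-up clue: tea is drunk in the first house
+s.add(coffee == milk + 1)      # made-up clue: coffee is drunk right of milk
+if s.check() == sat:
+    print(s.model())           # tea = 1, milk = 2, coffee = 3
+```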
+
+
+
+
+ + ☆ Meta-prompting Optimized Retrieval-augmented Generation + + +
+ Retrieval-augmented generation resorts to content retrieved from external
+sources in order to improve the performance of large language models in
+downstream tasks. Nevertheless, an excessive volume of retrieved content, the
+dispersion of its parts, or their lack of focus may end up having a detrimental
+rather than a beneficial effect. To mitigate this issue and improve
+retrieval-augmented generation, we propose a method that refines the retrieved
+content before it is included in the prompt by resorting to meta-prompting
+optimization. Put to the empirical test on the demanding multi-hop question
+answering task from the StrategyQA dataset, the evaluation results indicate that
+this method outperforms a comparable retrieval-augmented system without such
+refinement by over 30%.
+
+
+
+
+
+ + ☆ A framework for annotating and modelling intentions behind metaphor use + + +
+ Metaphors are part of everyday language and shape the way in which we +conceptualize the world. Moreover, they play a multifaceted role in +communication, making their understanding and generation a challenging task for +language models (LMs). While there has been extensive work in the literature +linking metaphor to the fulfilment of individual intentions, no comprehensive +taxonomy of such intentions, suitable for natural language processing (NLP) +applications, is available to present day. In this paper, we propose a novel +taxonomy of intentions commonly attributed to metaphor, which comprises 9 +categories. We also release the first dataset annotated for intentions behind +metaphor use. Finally, we use this dataset to test the capability of large +language models (LLMs) in inferring the intentions behind metaphor use, in +zero- and in-context few-shot settings. Our experiments show that this is still +a challenge for LLMs. + +
+
+
+
+
+ + ☆ Narrow Transformer: Starcoder-Based Java-LM For Desktop + + +
+ This paper presents NT-Java-1.1B, an open-source specialized code language
+model built on StarCoderBase-1.1B, designed for coding tasks in the Java
+programming language. NT-Java-1.1B achieves state-of-the-art performance,
+surpassing its base model and the majority of other models of similar size on
+the MultiPL-E Java code benchmark. While there have been studies on extending
+large, generic pre-trained models to improve proficiency in specific programming
+languages like Python, similar investigations on small code models for other
+programming languages are lacking. Large code models require specialized hardware
+like GPUs for inference, highlighting the need for research into building small
+code models that can be deployed on developer desktops. This paper addresses this
+research gap by focusing on the development of a small Java code model,
+NT-Java-1.1B, and its quantized versions, which perform comparably to open models
+of around 1.1B parameters on MultiPL-E Java code benchmarks, making them ideal
+for desktop deployment. This paper establishes the foundation for specialized
+models across languages and sizes for the NT model family.
+
+
+
+
+
+ + ☆ TongGu: Mastering Classical Chinese Understanding with + Knowledge-Grounded Large Language Models + + +
+ Classical Chinese is a gateway to the rich heritage and wisdom of ancient
+China, yet its complexities pose formidable comprehension barriers for most
+modern people without specialized knowledge. While Large Language Models (LLMs)
+have shown remarkable capabilities in Natural Language Processing (NLP), they
+struggle with Classical Chinese Understanding (CCU), especially in data-demanding
+and knowledge-intensive tasks. In response to this dilemma, we propose
+\textbf{TongGu} (meaning "understanding the ancient and the modern"), the first
+CCU-specific LLM, underpinned by three core contributions. First, we construct a
+two-stage instruction-tuning dataset ACCN-INS derived from rich classical Chinese
+corpora, aiming to unlock the full CCU potential of LLMs. Second, we propose
+Redundancy-Aware Tuning (RAT) to prevent catastrophic forgetting, enabling TongGu
+to acquire new capabilities while preserving its foundational knowledge. Third,
+we present a CCU Retrieval-Augmented Generation (CCU-RAG) technique to reduce
+hallucinations through knowledge grounding. Extensive experiments across 24
+diverse CCU tasks validate TongGu's superior ability, underscoring the
+effectiveness of RAT and CCU-RAG. The model and dataset will be made publicly
+available.
+
+
+
+
+
+ + ☆ Entity-Level Sentiment: More than the Sum of Its Parts WASSA 2024 + + +
+ In sentiment analysis of longer texts, there may be a variety of topics +discussed, of entities mentioned, and of sentiments expressed regarding each +entity. We find a lack of studies exploring how such texts express their +sentiment towards each entity of interest, and how these sentiments can be +modelled. In order to better understand how sentiment regarding persons and +organizations (each entity in our scope) is expressed in longer texts, we have +collected a dataset of expert annotations where the overall sentiment regarding +each entity is identified, together with the sentence-level sentiment for these +entities separately. We show that the reader's perceived sentiment regarding an +entity often differs from an arithmetic aggregation of sentiments at the +sentence level. Only 70\% of the positive and 55\% of the negative entities +receive a correct overall sentiment label when we aggregate the +(human-annotated) sentiment labels for the sentences where the entity is +mentioned. Our dataset reveals the complexity of entity-specific sentiment in +longer texts, and allows for more precise modelling and evaluation of such +sentiment expressions. + +
+
+ comment: 14th Workshop on Computational Approaches to Subjectivity, Sentiment + & Social Media Analysis (WASSA 2024) +
+
+
+
+
+ + ☆ Scoping Review of Active Learning Strategies and their Evaluation + Environments for Entity Recognition Tasks + + +
+ We conducted a scoping review for active learning in the domain of natural +language processing (NLP), which we summarize in accordance with the PRISMA-ScR +guidelines as follows: + Objective: Identify active learning strategies that were proposed for entity +recognition and their evaluation environments (datasets, metrics, hardware, +execution time). Design: We used Scopus and ACM as our search engines. We +compared the results with two literature surveys to assess the search quality. +We included peer-reviewed English publications introducing or comparing active +learning strategies for entity recognition. Results: We analyzed 62 relevant +papers and identified 106 active learning strategies. We grouped them into +three categories: exploitation-based (60x), exploration-based (14x), and hybrid +strategies (32x). We found that all studies used the F1-score as an evaluation +metric. Information about hardware (6x) and execution time (13x) was only +occasionally included. The 62 papers used 57 different datasets to evaluate +their respective strategies. Most datasets contained newspaper articles or +biomedical/medical data. Our analysis revealed that 26 out of 57 datasets are +publicly accessible. + Conclusion: Numerous active learning strategies have been identified, along +with significant open questions that still need to be addressed. Researchers +and practitioners face difficulties when making data-driven decisions about +which active learning strategy to adopt. Conducting comprehensive empirical +comparisons using the evaluation environment proposed in this study could help +establish best practices in the domain. + +
+
+ comment: The Version of Record of this contribution is published in Deep + Learning Theory and Applications 5th International Conference, DeLTA 2024 + Proceedings, and will be available after the conference +
+
+
+
+
+ + ☆ Planning with Large Language Models for Conversational Agents + + +
+ Controllability and proactivity are crucial properties of autonomous
+conversational agents (CAs). Controllability requires the CAs to follow standard
+operating procedures (SOPs), such as verifying identity before activating credit
+cards. Proactivity requires the CAs to guide the conversation towards the goal
+when users are uncooperative, for example through persuasive dialogue. Existing
+research does not unify controllability and proactivity with low manual
+annotation effort. To bridge this gap, we propose a new framework for
+planning-based conversational agents (PCA) powered by large language models
+(LLMs), which only requires humans to define tasks and goals for the LLMs. Before
+the conversation, the LLM plans the core, necessary SOP for the dialogue offline.
+During the conversation, the LLM plans the best action path online with reference
+to the SOP and generates responses to achieve process controllability.
+Subsequently, we propose a semi-automatic dialogue data creation framework and
+curate a high-quality dialogue dataset (PCA-D). Meanwhile, we develop multiple
+variants and evaluation metrics for PCA, e.g., planning with Monte Carlo Tree
+Search (PCA-M), which searches for the optimal dialogue action while satisfying
+SOP constraints and achieving dialogue proactivity. Experimental results show
+that LLMs fine-tuned on PCA-D can significantly improve performance and
+generalize to unseen domains. PCA-M outperforms other CoT and ToT baselines in
+terms of conversation controllability, proactivity, task success rate, and
+overall logical coherence, and is applicable in industry dialogue scenarios. The
+dataset and codes are available at XXXX.
+
+
+
+
+
+ + ☆ DART: Deep Adversarial Automated Red Teaming for LLM Safety + + +
+ Manual red teaming is a commonly used method to identify vulnerabilities in
+large language models (LLMs), but it is costly and unscalable. In contrast,
+automated red teaming uses a Red LLM to automatically generate adversarial
+prompts for the Target LLM, offering a scalable way to detect safety
+vulnerabilities. However, building a powerful automated Red LLM is difficult
+because the safety vulnerabilities of the Target LLM change dynamically as the
+Target LLM evolves. To mitigate this issue, we propose a Deep Adversarial
+Automated Red Teaming (DART) framework in which the Red LLM and the Target LLM
+interact deeply and dynamically with each other in an iterative manner. In each
+iteration, in order to generate as many successful attacks as possible, the Red
+LLM not only takes into account the responses from the Target LLM, but also
+adversarially adjusts its attacking directions by monitoring the global diversity
+of the attacks generated across multiple iterations. Simultaneously, to explore
+the dynamically changing safety vulnerabilities of the Target LLM, we allow the
+Target LLM to enhance its safety via an active-learning-based data selection
+mechanism. Experimental results demonstrate that DART significantly reduces the
+safety risk of the Target LLM. In human evaluation on the Anthropic Harmless
+dataset, DART eliminates the violation risks by 53.4\% compared to the
+instruction-tuned Target LLM. We will release the datasets and codes of DART
+soon.
+
+
+
+
+
+ + ☆ TartuNLP @ AXOLOTL-24: Leveraging Classifier Output for New Sense + Detection in Lexical Semantics + + +
+ We present our submission to the AXOLOTL-24 shared task. The shared task +comprises two subtasks: identifying new senses that words gain with time (when +comparing newer and older time periods) and producing the definitions for the +identified new senses. We implemented a conceptually simple and computationally +inexpensive solution to both subtasks. We trained adapter-based binary +classification models to match glosses with usage examples and leveraged the +probability output of the models to identify novel senses. The same models were +used to match examples of novel sense usages with Wiktionary definitions. Our +submission attained third place on the first subtask and the first place on the +second subtask. + +
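+ The step that turns classifier probabilities into new-sense decisions might
+look like the following sketch; the threshold and the example values are
+invented for illustration:
+```python
+def detect_novel_sense(match_probs, threshold=0.5):
+    """match_probs maps each existing sense id to the binary classifier's
+    probability that its gloss matches the usage example."""
+    best_sense, best_p = max(match_probs.items(), key=lambda kv: kv[1])
+    return ("novel", None) if best_p < threshold else ("known", best_sense)
+
+print(detect_novel_sense({"sense_1": 0.12, "sense_2": 0.31}))  # ('novel', None)
+print(detect_novel_sense({"sense_1": 0.86, "sense_2": 0.05}))  # ('known', 'sense_1')
+```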
+
+ comment: Accepted to the 5th International Workshop on Computational + Approaches to Historical Language Change 2024 (LChange'24) +
+
+
+
+
+ + ☆ Anthropocentric bias and the possibility of artificial cognition ICML 2024 + + +
+ Evaluating the cognitive capacities of large language models (LLMs) requires +overcoming not only anthropomorphic but also anthropocentric biases. This +article identifies two types of anthropocentric bias that have been neglected: +overlooking how auxiliary factors can impede LLM performance despite competence +(Type-I), and dismissing LLM mechanistic strategies that differ from those of +humans as not genuinely competent (Type-II). Mitigating these biases +necessitates an empirically-driven, iterative approach to mapping cognitive +tasks to LLM-specific capacities and mechanisms, which can be done by +supplementing carefully designed behavioral experiments with mechanistic +studies. + +
+
+ comment: Accepted for ICML 2024 (Workshop on Large Language Models and + Cognition) +
+
+
+
+
+ + ☆ HYBRINFOX at CheckThat! 2024 -- Task 1: Enhancing Language Models with + Structured Information for Check-Worthiness Estimation + + +
+ This paper summarizes the experiments and results of the HYBRINFOX team for +the CheckThat! 2024 - Task 1 competition. We propose an approach enriching +Language Models such as RoBERTa with embeddings produced by triples (subject ; +predicate ; object) extracted from the text sentences. Our analysis of the +developmental data shows that this method improves the performance of Language +Models alone. On the evaluation data, its best performance was in English, +where it achieved an F1 score of 71.1 and ranked 12th out of 27 candidates. On +the other languages (Dutch and Arabic), it obtained more mixed results. Future +research tracks are identified toward adapting this processing pipeline to more +recent Large Language Models. + +
+
+ comment: Paper to appear in the Proceedings of the Conference and Labs of the + Evaluation Forum (CLEF 2024 CheckThat!) +
+
+
+
+
+ + ☆ On the Benchmarking of LLMs for Open-Domain Dialogue Evaluation ACL + + +
+ Large Language Models (LLMs) have showcased remarkable capabilities in +various Natural Language Processing tasks. For automatic open-domain dialogue +evaluation in particular, LLMs have been seamlessly integrated into evaluation +frameworks, and together with human evaluation, compose the backbone of most +evaluations. However, existing evaluation benchmarks often rely on outdated +datasets and evaluate aspects like Fluency and Relevance, which fail to +adequately capture the capabilities and limitations of state-of-the-art chatbot +models. + This paper critically examines current evaluation benchmarks, highlighting +that the use of older response generators and quality aspects fail to +accurately reflect modern chatbot capabilities. A small annotation experiment +on a recent LLM-generated dataset (SODA) reveals that LLM evaluators such as +GPT-4 struggle to detect actual deficiencies in dialogues generated by current +LLM chatbots. + +
+
+ comment: Accepted to the 6th NLP for Conversational AI workshop at ACL +
+
+
+
+
+ + ☆ ConText at WASSA 2024 Empathy and Personality Shared Task: + History-Dependent Embedding Utterance Representations for Empathy and Emotion + Prediction in Conversations WASSA'24 + + +
+ Empathy and emotion prediction are key components in the development of +effective and empathetic agents, amongst several other applications. The WASSA +shared task on empathy and emotion prediction in interactions presents an +opportunity to benchmark approaches to these tasks. Appropriately selecting and +representing the historical context is crucial in the modelling of empathy and +emotion in conversations. In our submissions, we model empathy, emotion +polarity and emotion intensity of each utterance in a conversation by feeding +the utterance to be classified together with its conversational context, i.e., +a certain number of previous conversational turns, as input to an encoder +Pre-trained Language Model, to which we append a regression head for +prediction. We also model perceived counterparty empathy of each interlocutor +by feeding all utterances from the conversation and a token identifying the +interlocutor for which we are predicting the empathy. Our system officially +ranked $1^{st}$ at the CONV-turn track and $2^{nd}$ at the CONV-dialog track. + +
+
+ comment: WASSA'24 +
+
+
+
+
+ + ☆ Finetuning End-to-End Models for Estonian Conversational Spoken Language + Translation ACL + + +
+ This paper investigates the finetuning of end-to-end models for bidirectional +Estonian-English and Estonian-Russian conversational speech-to-text +translation. Due to the limited availability of speech translation data for +Estonian, we created additional training data by web scraping and synthesizing +data from speech recognition datasets using machine translation. We evaluated +three publicly available end-to-end models: Whisper, OWSM 3.1, and SeamlessM4T. +Our results indicate that fine-tuning with synthetic data enhances translation +accuracy by a large margin, with SeamlessM4T matching or surpassing cascaded +speech translation systems that use state-of-the-art speech recognition and +machine translation models. + +
+
+ comment: Accepted to LoResMT 2024 (ACL workshop) +
+
+
+
+
+ + ☆ Cognitive Modeling with Scaffolded LLMs: A Case Study of Referential + Expression Generation ICML 2024 + + +
+ To what extent can LLMs be used as part of a cognitive model of language +generation? In this paper, we approach this question by exploring a +neuro-symbolic implementation of an algorithmic cognitive model of referential +expression generation by Dale & Reiter (1995). The symbolic task analysis +implements the generation as an iterative procedure that scaffolds symbolic and +gpt-3.5-turbo-based modules. We compare this implementation to an ablated model +and a one-shot LLM-only baseline on the A3DS dataset (Tsvilodub & Franke, +2023). We find that our hybrid approach is cognitively plausible and performs +well in complex contexts, while allowing for more open-ended modeling of +language generation in a larger domain. + +
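+ For reference, the symbolic skeleton being scaffolded is Dale & Reiter's (1995)
+incremental algorithm; a compact sketch is below, where the attribute names and
+preference order are illustrative and, in the paper, individual steps are filled
+by gpt-3.5-turbo-based modules rather than plain dictionaries:
+```python
+def incremental_re(target, distractors, preference_order):
+    """Incrementally add attributes that rule out at least one distractor."""
+    description, remaining = {}, list(distractors)
+    for attr in preference_order:
+        value = target.get(attr)
+        if any(d.get(attr) != value for d in remaining):   # attribute discriminates
+            description[attr] = value
+            remaining = [d for d in remaining if d.get(attr) == value]
+        if not remaining:
+            break
+    return description
+
+target = {"type": "chair", "colour": "red", "size": "large"}
+distractors = [{"type": "chair", "colour": "blue"}, {"type": "table", "colour": "red"}]
+print(incremental_re(target, distractors, ["colour", "type", "size"]))
+# {'colour': 'red', 'type': 'chair'}
+```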
+
+ comment: 11 pages, 3 figures, 2 algorithms, to appear at the ICML 2024 + workshop on Large Language Models and Cognition +
+
+
+
+
+ + ☆ M$\mathbf5$ -- A Diverse Benchmark to Assess the Performance of Large + Multimodal Models Across Multilingual and Multicultural Vision-Language Tasks + + +
+ Since the release of ChatGPT, the field of Natural Language Processing has +experienced rapid advancements, particularly in Large Language Models (LLMs) +and their multimodal counterparts, Large Multimodal Models (LMMs). Despite +their impressive capabilities, LLMs often exhibit significant performance +disparities across different languages and cultural contexts, as demonstrated +by various text-only benchmarks. However, current research lacks such +benchmarks for multimodal visio-linguistic settings. This work fills this gap +by introducing M5, the first comprehensive benchmark designed to evaluate LMMs +on diverse vision-language tasks within a multilingual and multicultural +context. M5 includes eight datasets covering five tasks and $41$ languages, +with a focus on underrepresented languages and culturally diverse images. +Furthermore, we introduce two novel datasets, M5-VGR and M5-VLOD, including a +new Visio-Linguistic Outlier Detection task, in which all evaluated open-source +models fail to significantly surpass the random baseline. Through extensive +evaluation and analyses, we highlight substantial task-agnostic performance +disparities between high- and low-resource languages. Moreover, we show that +larger models do not necessarily outperform smaller ones in a multilingual +setting. + +
+
+
+
+
+ + ☆ Meta-optimized Angular Margin Contrastive Framework for Video-Language + Representation Learning ECCV 2024 + + +
+ Data quality stands at the forefront of deciding the effectiveness of +video-language representation learning. However, video-text pairs in previous +data typically do not align perfectly with each other, which might lead to +video-language representations that do not accurately reflect cross-modal +semantics. Moreover, previous data also possess an uneven distribution of +concepts, thereby hampering the downstream performance across unpopular +subjects. To address these problems, we propose a contrastive objective with a +subtractive angular margin to regularize cross-modal representations in their +effort to reach perfect similarity. Furthermore, to adapt to the non-uniform +concept distribution, we propose a multi-layer perceptron (MLP)-parameterized +weighting function that maps loss values to sample weights which enable dynamic +adjustment of the model's focus throughout the training. With the training +guided by a small amount of unbiased meta-data and augmented by video-text data +generated by large vision-language model, we improve video-language +representations and achieve superior performances on commonly used video +question answering and text-video retrieval datasets. + +
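+ One way to read the subtractive margin is as a penalty on the positive-pair
+similarity inside a symmetric InfoNCE loss; the sketch below uses a
+cosine-similarity form with assumed margin and temperature values, which may
+differ from the paper's exact formulation:
+```python
+import torch
+import torch.nn.functional as F
+
+def margin_contrastive_loss(video_emb, text_emb, margin=0.1, temperature=0.05):
+    """Matched pairs must beat negatives by at least `margin` in cosine space."""
+    v = F.normalize(video_emb, dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = v @ t.T                                      # (B, B) cosine similarities
+    logits = logits - margin * torch.eye(len(v), device=logits.device)
+    logits = logits / temperature
+    labels = torch.arange(len(v), device=logits.device)
+    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2
+```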
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Functional Faithfulness in the Wild: Circuit Discovery with + Differentiable Computation Graph Pruning + + +
+ In this paper, we introduce a comprehensive reformulation of the task known as
+Circuit Discovery, along with DiscoGP, a novel and effective algorithm based on
+differentiable masking for discovering circuits. Circuit discovery is the task of
+interpreting the computational mechanisms of language models (LMs) by dissecting
+their functions and capabilities into sparse subnetworks (circuits). We
+identified two major limitations in existing circuit discovery efforts: (1) a
+dichotomy between weight-based and connection-edge-based approaches forces
+researchers to choose between pruning connections or weights, thereby limiting
+the scope of mechanistic interpretation of LMs; (2) algorithms based on
+activation patching tend to identify circuits that are neither functionally
+faithful nor complete. The performance of these identified circuits is
+substantially reduced, often resulting in near-random performance in isolation.
+Furthermore, the complement of the circuit -- i.e., the original LM with the
+identified circuit removed -- still retains adequate performance, indicating that
+essential components of a complete circuit are missed by existing methods.
+ DiscoGP successfully addresses the two aforementioned issues and demonstrates
+state-of-the-art faithfulness, completeness, and sparsity. The effectiveness of
+the algorithm and its novel structure open up new avenues for gaining insights
+into the internal workings of generative AI.
+
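+ Differentiable mask-based pruning of this kind is commonly implemented by
+attaching a learnable logit to every weight (or edge) and regularizing it towards
+sparsity; the minimal, assumed-form sketch below covers a single linear layer
+rather than the full DiscoGP algorithm:
+```python
+import torch
+import torch.nn as nn
+
+class MaskedLinear(nn.Module):
+    """Wrap a linear layer with a learnable soft mask over its weights."""
+    def __init__(self, linear):
+        super().__init__()
+        self.linear = linear
+        self.mask_logits = nn.Parameter(torch.zeros_like(linear.weight))
+
+    def forward(self, x):
+        mask = torch.sigmoid(self.mask_logits)            # soft 0/1 gate per weight
+        return nn.functional.linear(x, self.linear.weight * mask, self.linear.bias)
+
+    def sparsity_penalty(self):
+        return torch.sigmoid(self.mask_logits).mean()     # added to the training loss
+```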
+
+
+
+
+ + ☆ HYBRINFOX at CheckThat! 2024 -- Task 2: Enriching BERT Models with the + Expert System VAGO for Subjectivity Detection + + +
+ This paper presents the HYBRINFOX method used to solve Task 2 of Subjectivity +detection of the CLEF 2024 CheckThat! competition. The specificity of the +method is to use a hybrid system, combining a RoBERTa model, fine-tuned for +subjectivity detection, a frozen sentence-BERT (sBERT) model to capture +semantics, and several scores calculated by the English version of the expert +system VAGO, developed independently of this task to measure vagueness and +subjectivity in texts based on the lexicon. In English, the HYBRINFOX method +ranked 1st with a macro F1 score of 0.7442 on the evaluation data. For the +other languages, the method used a translation step into English, producing +more mixed results (ranking 1st in Multilingual and 2nd in Italian over the +baseline, but under the baseline in Bulgarian, German, and Arabic). We explain +the principles of our hybrid approach, and outline ways in which the method +could be improved for other languages besides English. + +
+
+ comment: To appear in the Proceedings of the Conference and Labs of the + Evaluation Forum (CLEF 2024 CheckThat!) +
+
+
+
+
+ + ☆ Argument Mining in Data Scarce Settings: Cross-lingual Transfer and + Few-shot Techniques + + +
+ Recent research on sequence labelling has been exploring different strategies +to mitigate the lack of manually annotated data for the large majority of the +world languages. Among others, the most successful approaches have been based +on (i) the cross-lingual transfer capabilities of multilingual pre-trained +language models (model-transfer), (ii) data translation and label projection +(data-transfer) and (iii), prompt-based learning by reusing the mask objective +to exploit the few-shot capabilities of pre-trained language models (few-shot). +Previous work seems to conclude that model-transfer outperforms data-transfer +methods and that few-shot techniques based on prompting are superior to +updating the model's weights via fine-tuning. In this paper, we empirically +demonstrate that, for Argument Mining, a sequence labelling task which requires +the detection of long and complex discourse structures, previous insights on +cross-lingual transfer or few-shot learning do not apply. Contrary to previous +work, we show that for Argument Mining data transfer obtains better results +than model-transfer and that fine-tuning outperforms few-shot methods. +Regarding the former, the domain of the dataset used for data-transfer seems to +be a deciding factor, while, for few-shot, the type of task (length and +complexity of the sequence spans) and sampling method prove to be crucial. + +
+
+
+
+
+ + ☆ Improving Self-supervised Pre-training using Accent-Specific Codebooks INTERSPEECH 2024 + + +
+ Speech accents present a serious challenge to the performance of
+state-of-the-art end-to-end Automatic Speech Recognition (ASR) systems. Even with
+self-supervised learning and pre-training of ASR models, accent invariance is
+seldom achieved. In this work, we propose an accent-aware adaptation technique
+for self-supervised learning that introduces a trainable set of accent-specific
+codebooks into the self-supervised architecture. These learnable codebooks enable
+the model to capture accent-specific information during pre-training, which is
+further refined during ASR fine-tuning. On the Mozilla Common Voice dataset, our
+proposed approach outperforms all other accent-adaptation approaches on both seen
+and unseen English accents, with up to 9% relative reduction in word error rate
+(WER).
+
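+ A rough sketch of how accent-specific codebooks might be attached to encoder
+features through attention is shown below; the dimensions, the additive
+combination, and the lookup by accent id are assumptions, and the integration
+with the self-supervised objective is omitted:
+```python
+import torch
+import torch.nn as nn
+
+class AccentCodebooks(nn.Module):
+    def __init__(self, dim, n_accents=8, codes_per_accent=32):
+        super().__init__()
+        self.codebooks = nn.Parameter(torch.randn(n_accents, codes_per_accent, dim) * 0.02)
+
+    def forward(self, feats, accent_id):                  # feats: (batch, time, dim)
+        codes = self.codebooks[accent_id]                 # (codes, dim) for this accent
+        attn = torch.softmax(feats @ codes.T, dim=-1)     # (batch, time, codes)
+        return feats + attn @ codes                       # inject accent information
+```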
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ☆ Query-oriented Data Augmentation for Session Search + + +
+ Modeling contextual information in a search session has drawn more and more +attention when understanding complex user intents. Recent methods are all +data-driven, i.e., they train different models on large-scale search log data +to identify the relevance between search contexts and candidate documents. The +common training paradigm is to pair the search context with different candidate +documents and train the model to rank the clicked documents higher than the +unclicked ones. However, this paradigm neglects the symmetric nature of the +relevance between the session context and document, i.e., the clicked documents +can also be paired with different search contexts when training. In this work, +we propose query-oriented data augmentation to enrich search logs and empower +the modeling. We generate supplemental training pairs by altering the most +important part of a search context, i.e., the current query, and train our +model to rank the generated sequence along with the original sequence. This +approach enables models to learn that the relevance of a document may vary as +the session context changes, leading to a better understanding of users' search +patterns. We develop several strategies to alter the current query, resulting +in new training data with varying degrees of difficulty. Through +experimentation on two extensive public search logs, we have successfully +demonstrated the effectiveness of our model. + +
+
+ comment: TKDE 2024 +
+
+
+
+
+ + ☆ Multi-Convformer: Extending Conformer with Multiple Convolution Kernels INTERSPEECH 2024 + + +
+ Convolutions have become essential in state-of-the-art end-to-end Automatic
+Speech Recognition~(ASR) systems due to their efficient modelling of local
+context. Notably, their use in Conformers has led to superior performance
+compared to vanilla Transformer-based ASR systems. While components other than
+the convolution module in the Conformer have been reexamined, altering the
+convolution module itself has been far less explored. To this end, we introduce
+Multi-Convformer, which uses multiple convolution kernels within the convolution
+module of the Conformer in conjunction with gating. This helps in improved
+modelling of local dependencies at varying granularities. Our model rivals
+existing Conformer variants such as CgMLP and E-Branchformer in performance,
+while being more parameter efficient. We empirically compare our approach with
+Conformer and its variants across four different datasets and three different
+modelling paradigms and show up to 8% relative word error rate~(WER)
+improvements.
+
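+ A minimal sketch of the multi-kernel convolution idea with a learned gate is
+given below; layer norms, pointwise projections, and the rest of the Conformer
+block are omitted, and the exact gating form is an assumption:
+```python
+import torch
+import torch.nn as nn
+
+class MultiKernelConvModule(nn.Module):
+    def __init__(self, dim, kernel_sizes=(3, 7, 15)):
+        super().__init__()
+        self.convs = nn.ModuleList(
+            nn.Conv1d(dim, dim, k, padding=k // 2, groups=dim) for k in kernel_sizes
+        )
+        self.gate = nn.Linear(dim, len(kernel_sizes))
+
+    def forward(self, x):                                   # x: (batch, time, dim)
+        gates = self.gate(x).softmax(dim=-1)                # (batch, time, K)
+        branches = torch.stack(
+            [c(x.transpose(1, 2)).transpose(1, 2) for c in self.convs], dim=-1
+        )                                                    # (batch, time, dim, K)
+        return (branches * gates.unsqueeze(2)).sum(dim=-1)  # gated fusion of kernels
+
+out = MultiKernelConvModule(64)(torch.randn(2, 100, 64))     # smoke test
+```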
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ☆ Text2TimeSeries: Enhancing Financial Forecasting through Time Series + Prediction Updates with Event-Driven Insights from Large Language Models + + +
+ Time series models, typically trained on numerical data, are designed to +forecast future values. These models often rely on weighted averaging +techniques over time intervals. However, real-world time series data is seldom +isolated and is frequently influenced by non-numeric factors. For instance, +stock price fluctuations are impacted by daily random events in the broader +world, with each event exerting a unique influence on price signals. +Previously, forecasts in financial markets have been approached in two main +ways: either as time-series problems over price sequence or sentiment analysis +tasks. The sentiment analysis tasks aim to determine whether news events will +have a positive or negative impact on stock prices, often categorizing them +into discrete labels. Recognizing the need for a more comprehensive approach to +accurately model time series prediction, we propose a collaborative modeling +framework that incorporates textual information about relevant events for +predictions. Specifically, we leverage the intuition of large language models +about future changes to update real number time series predictions. We +evaluated the effectiveness of our approach on financial market data. + +
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ☆ STOC-TOT: Stochastic Tree-of-Thought with Constrained Decoding for + Complex Reasoning in Multi-Hop Question Answering + + +
+ Multi-hop question answering (MHQA) requires a model to retrieve and +integrate information from multiple passages to answer a complex question. +Recent systems leverage the power of large language models and integrate +evidence retrieval with reasoning prompts (e.g., chain-of-thought reasoning) +for the MHQA task. However, the complexities in the question types (bridge v.s. +comparison questions) and the reasoning types (sequential v.s. parallel +reasonings) require more novel and fine-grained prompting methods to enhance +the performance of MHQA under the zero-shot setting. In this paper, we propose +STOC-TOT, a stochastic tree-of-thought reasoning prompting method with +constrained decoding for MHQA and conduct a detailed comparison with other +reasoning prompts on different question types and reasoning types. +Specifically, we construct a tree-like reasoning structure by prompting the +model to break down the original question into smaller sub-questions to form +different reasoning paths. In addition, we prompt the model to provide a +probability estimation for each reasoning path at each reasoning step. At +answer time, we conduct constrained decoding on the model to generate more +grounded answers and reduce hallucination. Experiments comparing STOC-TOT with +two MHQA datasets and five large language models showed that our framework +outperforms other reasoning prompts by a significant margin. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Improving Self Consistency in LLMs through Probabilistic Tokenization ICML 2024 + + +
+ Prior research has demonstrated noticeable performance gains through the use
+of probabilistic tokenizations, an approach that involves employing multiple
+tokenizations of the same input string during the training phase of a language
+model. Despite these promising findings, modern large language models (LLMs) have
+yet to be trained using probabilistic tokenizations. Interestingly, while the
+tokenizers of these contemporary LLMs have the capability to generate multiple
+tokenizations, this property remains underutilized.
+ In this work, we propose a novel method to leverage the multiple-tokenization
+capabilities of modern LLM tokenizers, aiming to enhance the self-consistency of
+LLMs in reasoning tasks. Our experiments indicate that when utilizing
+probabilistic tokenizations, LLMs generate logically diverse reasoning paths,
+moving beyond mere surface-level linguistic diversity. We carefully study
+probabilistic tokenization and offer insights to explain the self-consistency
+improvements it brings through extensive experimentation on 5 LLM families and 4
+reasoning benchmarks.
+
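+ The inference-time procedure can be summarized as: sample several segmentations
+of the same prompt, decode a reasoning path from each, and majority-vote the
+final answers. In the stubbed sketch below, both callables are hypothetical
+stand-ins for a segmentation-sampling tokenizer and an LLM:
+```python
+from collections import Counter
+
+def self_consistent_answer(question, sample_tokenizations, generate, n=8):
+    """Majority-vote answers decoded from n different tokenizations of the prompt."""
+    answers = [generate(tokens) for tokens in sample_tokenizations(question, n)]
+    return Counter(answers).most_common(1)[0][0]
+```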
+
+ comment: ICML 2024 Workshop on LLMs and Cognition +
+
+
+
+
+ + ♻ ☆ What Does the Bot Say? Opportunities and Risks of Large Language Models + in Social Media Bot Detection ACL 2024 + + +
+ Social media bot detection has always been an arms race between advancements +in machine learning bot detectors and adversarial bot strategies to evade +detection. In this work, we bring the arms race to the next level by +investigating the opportunities and risks of state-of-the-art large language +models (LLMs) in social bot detection. To investigate the opportunities, we +design novel LLM-based bot detectors by proposing a +mixture-of-heterogeneous-experts framework to divide and conquer diverse user +information modalities. To illuminate the risks, we explore the possibility of +LLM-guided manipulation of user textual and structured information to evade +detection. Extensive experiments with three LLMs on two datasets demonstrate +that instruction tuning on merely 1,000 annotated examples produces specialized +LLMs that outperform state-of-the-art baselines by up to 9.1% on both datasets, +while LLM-guided manipulation strategies could significantly bring down the +performance of existing bot detectors by up to 29.6% and harm the calibration +and reliability of bot detection systems. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ LOGIC-LM++: Multi-Step Refinement for Symbolic Formulations + + +
+ In this paper we examine the limitations of Large Language Models (LLMs) for +complex reasoning tasks. Although recent works have started to employ formal +languages as an intermediate representation for reasoning tasks, they often +face challenges in accurately generating and refining these formal +specifications to ensure correctness. To address these issues, this paper +proposes Logic-LM++, an improvement on Logic-LM . It uses the ability of LLMs +to do pairwise comparisons, allowing the evaluation of the refinements +suggested by the LLM. The paper demonstrates that Logic-LM++ outperforms +Logic-LM and other contemporary techniques across natural language reasoning +tasks on three datasets, FOLIO, ProofWriter and AR-LSAT, with an average +improvement of 18.5% on standard prompting, 12.3% on chain of thought prompting +and 5% on Logic-LM. + +
+
+
+
+
+ + ♻ ☆ NLP-KG: A System for Exploratory Search of Scientific Literature in + Natural Language Processing ACL 2024 + + +
+ Scientific literature searches are often exploratory, whereby users are not +yet familiar with a particular field or concept but are interested in learning +more about it. However, existing systems for scientific literature search are +typically tailored to keyword-based lookup searches, limiting the possibilities +for exploration. We propose NLP-KG, a feature-rich system designed to support +the exploration of research literature in unfamiliar natural language +processing (NLP) fields. In addition to a semantic search, NLP-KG allows users +to easily find survey papers that provide a quick introduction to a field of +interest. Further, a Fields of Study hierarchy graph enables users to +familiarize themselves with a field and its related areas. Finally, a chat +interface allows users to ask questions about unfamiliar concepts or specific +articles in NLP and obtain answers grounded in knowledge retrieved from +scientific publications. Our system provides users with comprehensive +exploration possibilities, supporting them in investigating the relationships +between different fields, understanding unfamiliar concepts in NLP, and finding +relevant research literature. Demo, video, and code are available at: +https://github.com/NLP-Knowledge-Graph/NLP-KG-WebApp. + +
+
+ comment: Accepted to ACL 2024 System Demonstrations +
+
+
+
+
+ + ♻ ☆ LoRA+: Efficient Low Rank Adaptation of Large Models + + +
+ In this paper, we show that Low Rank Adaptation (LoRA) as originally +introduced in Hu et al. (2021) leads to suboptimal finetuning of models with +large width (embedding dimension). This is due to the fact that adapter +matrices A and B in LoRA are updated with the same learning rate. Using scaling +arguments for large width networks, we demonstrate that using the same learning +rate for A and B does not allow efficient feature learning. We then show that +this suboptimality of LoRA can be corrected simply by setting different +learning rates for the LoRA adapter matrices A and B with a well-chosen ratio. +We call this proposed algorithm LoRA$+$. In our extensive experiments, LoRA$+$ +improves performance (1-2 $\%$ improvements) and finetuning speed (up to $\sim$ +2X SpeedUp), at the same computational cost as LoRA. + +
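+ In practice the fix amounts to putting the two adapter matrices into separate
+optimizer parameter groups; the sketch below assumes the adapters are registered
+with 'lora_A'/'lora_B' in their parameter names and uses an illustrative ratio,
+so it is not the authors' released code:
+```python
+import torch
+
+def loraplus_param_groups(model, lr=2e-4, ratio=16.0):
+    """Give LoRA's B matrices a learning rate that is `ratio` times that of A."""
+    a_params, b_params = [], []
+    for name, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        (b_params if "lora_B" in name else a_params).append(p)
+    return [{"params": a_params, "lr": lr}, {"params": b_params, "lr": lr * ratio}]
+
+# optimizer = torch.optim.AdamW(loraplus_param_groups(model))
+```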
+
+ comment: 27 pages +
+
+
+
+
+ + ♻ ☆ Conditional and Modal Reasoning in Large Language Models + + +
+ The reasoning abilities of large language models (LLMs) are the topic of a +growing body of research in AI and cognitive science. In this paper, we probe +the extent to which twenty-five LLMs are able to distinguish logically correct +inferences from logically fallacious ones. We focus on inference patterns +involving conditionals (e.g., 'If Ann has a queen, then Bob has a jack') and +epistemic modals (e.g., 'Ann might have an ace', 'Bob must have a king'). These +inferences have been of special interest to logicians, philosophers, and +linguists, since they play a central role in the fundamental human ability to +reason about distal possibilities. Assessing LLMs on these inferences is thus +highly relevant to the question of how much the reasoning abilities of LLMs +match those of humans. Among the LLMs we tested, all but the GPT-4 model family +often make basic mistakes with conditionals, though zero-shot chain-of-thought +prompting helps them make fewer mistakes. Moreover, even the GPT-4 family +displays logically inconsistent judgments across inference patterns involving +epistemic modals, and almost all models give answers to certain complex +conditional inferences widely discussed in the literature that do not match +human judgments. These results highlight gaps in basic logical reasoning in +today's LLMs. + +
+
+ comment: Updated version with results from 25 LLMs, additional few-shot and + chain-of-thought prompts, additional inference patterns, and correlations + with other benchmarks +
+
+
+
+
+ + ♻ ☆ L+M-24: Building a Dataset for Language + Molecules @ ACL 2024 + + +
+ Language-molecule models have emerged as an exciting direction for molecular +discovery and understanding. However, training these models is challenging due +to the scarcity of molecule-language pair datasets. At this point, datasets +have been released which are 1) small and scraped from existing databases, 2) +large but noisy and constructed by performing entity linking on the scientific +literature, and 3) built by converting property prediction datasets to natural +language using templates. In this document, we detail the $\textit{L+M-24}$ +dataset, which has been created for the Language + Molecules Workshop shared +task at ACL 2024. In particular, $\textit{L+M-24}$ is designed to focus on +three key benefits of natural language in molecule design: compositionality, +functionality, and abstraction. + +
+
+ comment: The dataset, finetuned baselines, and evaluation code are released + publicly at https://github.com/language-plus-molecules/LPM-24-Dataset through + https://huggingface.co/language-plus-molecules +
+
+
+
+
+ + ♻ ☆ How AI Ideas Affect the Creativity, Diversity, and Evolution of Human + Ideas: Evidence From a Large, Dynamic Experiment + + +
+ Exposure to large language model output is rapidly increasing. How will +seeing AI-generated ideas affect human ideas? We conducted an experiment (800+ +participants, 40+ countries) where participants viewed creative ideas that were +from ChatGPT or prior experimental participants and then brainstormed their own +idea. We varied the number of AI-generated examples (none, low, or high +exposure) and if the examples were labeled as 'AI' (disclosure). Our dynamic +experiment design -- ideas from prior participants in an experimental condition +are used as stimuli for future participants in the same experimental condition +-- speaks to the interdependent process of cultural creation: creative ideas +are built upon prior ideas. Hence, we capture the compounding effects of having +LLMs 'in the culture loop'. We find that high AI exposure (but not low AI +exposure) did not affect the creativity of individual ideas but did increase +the average amount and rate of change of collective idea diversity. AI made +ideas different, not better. There were no main effects of disclosure. We also +found that self-reported creative people were less influenced by knowing an +idea was from AI and that participants may knowingly adopt AI ideas when the +task is difficult. Our findings suggest that introducing AI ideas may increase +collective diversity but not individual creativity. + +
+
+
+
+
+ + ♻ ☆ CATT: Character-based Arabic Tashkeel Transformer + + +
+ Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the +comprehension of Arabic text by removing ambiguity and minimizing the risk of +misinterpretations caused by its absence. It plays a crucial role in improving +Arabic text processing, particularly in applications such as text-to-speech and +machine translation. This paper introduces a new approach to training ATD +models. First, we finetuned two transformers, encoder-only and encoder-decoder, +that were initialized from a pretrained character-based BERT. Then, we applied +the Noisy-Student approach to boost the performance of the best model. We +evaluated our models alongside 11 commercial and open-source models using two +manually labeled benchmark datasets: WikiNews and our CATT dataset. Our +findings show that our top model surpasses all evaluated models by relative +Diacritic Error Rates (DERs) of 30.83\% and 35.21\% on WikiNews and CATT, +respectively, achieving state-of-the-art in ATD. In addition, we show that our +model outperforms GPT-4-turbo on CATT dataset by a relative DER of 9.36\%. We +open-source our CATT models and benchmark dataset for the research +community\footnote{https://github.com/abjadai/catt}. + +
+
+
+
+
+ + ♻ ☆ Presence or Absence: Are Unknown Word Usages in Dictionaries? + + +
+ There has been a surge of interest in computational modeling of semantic +change. The foci of previous works are on detecting and interpreting word +senses gained over time; however, it remains unclear whether the gained senses +are covered by dictionaries. In this work, we aim to fill this research gap by +comparing detected word senses with dictionary sense inventories in order to +bridge between the communities of lexical semantic change detection and +lexicography. We evaluate our system in the AXOLOTL-24 shared task for Finnish, +Russian and German languages \cite{fedorova-etal-2024-axolotl}. Our system is +fully unsupervised. It leverages a graph-based clustering approach to predict +mappings between unknown word usages and dictionary entries for Subtask 1, and +generates dictionary-like definitions for those novel word usages through the +state-of-the-art Large Language Models such as GPT-4 and LLaMA-3 for Subtask 2. +In Subtask 1, our system outperforms the baseline system by a large margin, and +it offers interpretability for the mapping results by distinguishing between +matched and unmatched (novel) word usages through our graph-based clustering +approach. Our system ranks first in Finnish and German, and ranks second in +Russian on the Subtask 2 test-phase leaderboard. These results show the +potential of our system in managing dictionary entries, particularly for +updating dictionaries to include novel sense entries. Our code and data are +made publicly +available\footnote{\url{https://github.com/xiaohemaikoo/axolotl24-ABDN-NLP}}. + +
+
+ comment: LChange24 Camera Ready +
+
+
+
+
+ + ♻ ☆ Emotion and Intent Joint Understanding in Multimodal Conversation: A + Benchmarking Dataset NeurIPS 2024 + + +
+ Emotion and Intent Joint Understanding in Multimodal Conversation (MC-EIU)
+aims to decode the semantic information manifested in a multimodal
+conversational history, while inferring the emotions and intents simultaneously
+for the current utterance. MC-EIU is an enabling technology for many
+human-computer interfaces. However, there is a lack of available datasets in
+terms of annotation, modality, language diversity, and accessibility. In this
+work, we propose an MC-EIU dataset, which features 7 emotion categories, 9
+intent categories, 3 modalities, i.e., textual, acoustic, and visual content,
+and two languages, i.e., English and Mandarin. Furthermore, it is completely
+open-source for free access. To our knowledge, MC-EIU is the first
+comprehensive and rich emotion and intent joint understanding dataset for
+multimodal conversation. Together with the release of the dataset, we also
+develop an Emotion and Intent Interaction (EI$^2$) network as a reference
+system by modeling the deep correlation between emotion and intent in the
+multimodal conversation. With comparative experiments and ablation studies, we
+demonstrate the effectiveness of the proposed EI$^2$ method on the MC-EIU
+dataset. The dataset and codes will be made available at:
+https://github.com/MC-EIU/MC-EIU.
+
+ comment: 26 pages, 8 figures, 12 tables, NeurIPS 2024 Dataset and Benchmark + Track +
+
+
+
+
+ + ♻ ☆ An Information Bottleneck Perspective for Effective Noise Filtering on + Retrieval-Augmented Generation ACL 2024 + + +
+ Retrieval-augmented generation integrates the capabilities of large language
+models with relevant information retrieved from an extensive corpus, yet
+encounters challenges when confronted with real-world noisy data. One recent
+solution is to train a filter module to find relevant content, but it achieves
+only suboptimal noise compression. In this paper, we propose to introduce
+information bottleneck theory into retrieval-augmented generation. Our approach
+filters noise by simultaneously maximizing the mutual information between the
+compression and the ground-truth output, while minimizing the mutual
+information between the compression and the retrieved passage. In addition, we
+derive the formula of the information bottleneck to facilitate its application in
+novel comprehensive evaluations, the selection of supervised fine-tuning data, and
+the construction of reinforcement learning rewards. Experimental results
+demonstrate that our approach achieves significant improvements across various
+question answering datasets, not only in terms of the correctness of answer
+generation but also in conciseness, with a $2.5\%$ compression rate.
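+ A compact way to write the filtering objective described above (our own
+notation, not necessarily the paper's): with retrieved passage $X$, compressed
+content $\tilde{X}$, and ground-truth output $Y$, the filter solves
+
+    \min_{\tilde{X}} \; I(\tilde{X}; X) - \beta \, I(\tilde{X}; Y),
+
+so the kept content stays maximally predictive of the answer while discarding
+as much of the noisy passage as possible; $\beta$ trades off compression
+against answer relevance.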
+
+ comment: Accepted to ACL 2024 +
+
+
+
+
+ + ♻ ☆ NegotiationToM: A Benchmark for Stress-testing Machine Theory of Mind on + Negotiation Surrounding + + +
+ Large Language Models (LLMs) have sparked substantial interest and debate
+concerning their potential emergence of Theory of Mind (ToM) ability. Current
+theory-of-mind evaluations focus on testing models using machine-generated
+data or game settings prone to shortcuts and spurious correlations, which lack
+evaluation of machine ToM ability in real-world human interaction scenarios.
+This poses a pressing demand to develop new real-world scenario benchmarks. We
+introduce NegotiationToM, a new benchmark designed to stress-test machine ToM
+in real-world negotiation settings covering multi-dimensional mental states
+(i.e., desires, beliefs, and intentions). Our benchmark builds upon the
+Belief-Desire-Intention (BDI) agent modeling theory and conducts the necessary
+empirical experiments to evaluate large language models. Our findings
+demonstrate that NegotiationToM is challenging for state-of-the-art LLMs, as
+they consistently perform significantly worse than humans, even when employing
+the chain-of-thought (CoT) method.
+
+ comment: Dataset: https://github.com/HKUST-KnowComp/NegotiationToM +
+
+
+
+
+ + ♻ ☆ Large Language Models can Share Images, Too! ACL 2024 + + +
+ This paper explores the image-sharing capability of Large Language Models +(LLMs), such as GPT-4 and LLaMA 2, in a zero-shot setting. To facilitate a +comprehensive evaluation of LLMs, we introduce the PhotoChat++ dataset, which +includes enriched annotations (i.e., intent, triggering sentence, image +description, and salient information). Furthermore, we present the +gradient-free and extensible Decide, Describe, and Retrieve (DribeR) framework. +With extensive experiments, we unlock the image-sharing capability of DribeR +equipped with LLMs in zero-shot prompting, with ChatGPT achieving the best +performance. Our findings also reveal the emergent image-sharing ability in +LLMs under zero-shot conditions, validating the effectiveness of DribeR. We use +this framework to demonstrate its practicality and effectiveness in two +real-world scenarios: (1) human-bot interaction and (2) dataset augmentation. +To the best of our knowledge, this is the first study to assess the +image-sharing ability of various LLMs in a zero-shot setting. We make our +source code and dataset publicly available at +https://github.com/passing2961/DribeR. + +
+
+ comment: ACL 2024 Findings; Code is available in + https://github.com/passing2961/DribeR +
+
+
+
+
+ + ♻ ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey + + +
+ Research surveys have always posed a challenge for beginner researchers who
+lack research training. These researchers struggle to understand the
+directions within their research topic and to discover new research
+findings within a short time. One way to provide intuitive assistance to
+beginner researchers is by offering relevant knowledge graphs (KGs) and
+recommending related academic papers. However, existing navigation knowledge
+graphs primarily rely on keywords in the research field and often fail to
+present the logical hierarchy among multiple related papers clearly. Moreover,
+most recommendation systems for academic papers simply rely on high text
+similarity, which can leave researchers confused as to why a particular article
+is being recommended. They may fail to grasp important information about the
+insight connection between "Issue resolved" and "Issue finding" that they hope
+to obtain. To address these issues, this study aims to support research insight
+surveys for beginner researchers by establishing a hierarchical tree-structured
+knowledge graph that reflects the inheritance insight of research topics and
+the relevance insight among the academic papers.
+
+ comment: This paper has been accepted by 'The 18TH International Conference on + INnovations in Intelligent SysTems and Applications (INISTA 2024)' +
+
+
+
+
+ + ♻ ☆ CLIcK: A Benchmark Dataset of Cultural and Linguistic Intelligence in + Korean + + +
+ Despite the rapid development of large language models (LLMs) for the Korean +language, there remains an obvious lack of benchmark datasets that test the +requisite Korean cultural and linguistic knowledge. Because many existing +Korean benchmark datasets are derived from the English counterparts through +translation, they often overlook the different cultural contexts. For the few +benchmark datasets that are sourced from Korean data capturing cultural +knowledge, only narrow tasks such as bias and hate speech detection are +offered. To address this gap, we introduce a benchmark of Cultural and +Linguistic Intelligence in Korean (CLIcK), a dataset comprising 1,995 QA pairs. +CLIcK sources its data from official Korean exams and textbooks, partitioning +the questions into eleven categories under the two main categories of language +and culture. For each instance in CLIcK, we provide fine-grained annotation of +which cultural and linguistic knowledge is required to answer the question +correctly. Using CLIcK, we test 13 language models to assess their performance. +Our evaluation uncovers insights into their performances across the categories, +as well as the diverse factors affecting their comprehension. CLIcK offers the +first large-scale comprehensive Korean-centric analysis of LLMs' proficiency in +Korean culture and language. + +
+
+
+
+
+ + ♻ ☆ Aspect-Based Sentiment Analysis Techniques: A Comparative Study + + +
+ Since the dawn of the digitalisation era, customer feedback and online
+reviews have unequivocally been major sources of insights for businesses.
+Consequently, conducting comparative analyses of such sources has become the de
+facto modus operandi of any business that wishes to give itself a competitive
+edge over its peers and improve customer loyalty. Sentiment analysis is one
+such method instrumental in gauging public interest, exposing market trends,
+and analysing competitors. While traditional sentiment analysis focuses on
+overall sentiment, as needs have advanced with time, it has become important to
+explore public opinions and sentiments on various specific subjects, products
+and services mentioned in the reviews at a finer-granular level. To this end,
+Aspect-based Sentiment Analysis (ABSA), supported by advances in Artificial
+Intelligence (AI) techniques which have contributed to a paradigm shift from
+simple word-level analysis to tone and context-aware analyses, focuses on
+identifying specific aspects within the text and determining the sentiment
+associated with each aspect. In this study, we compare several deep-NN methods
+for ABSA on two benchmark datasets (Restaurant14 and Laptop-14) and find that
+FAST LSA obtains the best overall results of 87.6% and 82.6% accuracy but does
+not surpass LSA+DeBERTa, which reports 90.33% and 86.21% accuracy respectively.
+
+
+
+
+ + ♻ ☆ "My Answer is C": First-Token Probabilities Do Not Match Text Answers in + Instruction-Tuned Language Models ACL 2024 + + +
+ The open-ended nature of language generation makes the evaluation of
+autoregressive large language models (LLMs) challenging. One common evaluation
+approach uses multiple-choice questions (MCQ) to limit the response space. The
+model is then evaluated by ranking the candidate answers by the log probability
+of the first token prediction. However, first tokens may not consistently
+reflect the final response output, due to the model's diverse response styles,
+such as starting with "Sure" or refusing to answer. Consequently, MCQ evaluation
+is not indicative of model behaviour when interacting with users. But by how
+much? We evaluate how aligned first-token evaluation is with the text output
+along several dimensions, namely final option choice, refusal rate, choice
+distribution and robustness under prompt perturbation. Our results show that
+the two approaches are severely misaligned on all dimensions, reaching mismatch
+rates over 60%. Models heavily fine-tuned on conversational or safety data are
+especially impacted. Crucially, models remain misaligned even when we
+increasingly constrain prompts, i.e., force them to start with an option letter
+or example template. Our findings i) underscore the importance of inspecting
+the text output as well and ii) caution against relying solely on first-token
+evaluation.
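+ A minimal sketch of the two MCQ evaluation modes being compared (hypothetical
+model outputs; not the authors' code): rank options by the log-probability of
+the first generated token versus parse the option letter out of the free-form
+answer text.
+
+    import re
+
+    def first_token_choice(first_token_logprobs, options=("A", "B", "C", "D")):
+        # first_token_logprobs: dict mapping a candidate first token -> log-probability
+        return max(options, key=lambda o: first_token_logprobs.get(o, float("-inf")))
+
+    def text_answer_choice(generated_text, options=("A", "B", "C", "D")):
+        # Parse the option the model actually commits to in its text answer;
+        # refusals or unparseable answers return None.
+        match = re.search(r"\b([A-D])\b", generated_text)
+        return match.group(1) if match and match.group(1) in options else None
+
+    # Example mismatch: first-token ranking picks "A", but the text answer is "C".
+    print(first_token_choice({"A": -0.7, "B": -2.1, "C": -1.3}))
+    print(text_answer_choice("Sure! My answer is C because ..."))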
+
+ comment: ACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Is LLM-as-a-Judge Robust? Investigating Universal Adversarial Attacks on + Zero-shot LLM Assessment + + +
+ Large Language Models (LLMs) are powerful zero-shot assessors used in +real-world situations such as assessing written exams and benchmarking systems. +Despite these critical applications, no existing work has analyzed the +vulnerability of judge-LLMs to adversarial manipulation. This work presents the +first study on the adversarial robustness of assessment LLMs, where we +demonstrate that short universal adversarial phrases can be concatenated to +deceive judge LLMs to predict inflated scores. Since adversaries may not know +or have access to the judge-LLMs, we propose a simple surrogate attack where a +surrogate model is first attacked, and the learned attack phrase then +transferred to unknown judge-LLMs. We propose a practical algorithm to +determine the short universal attack phrases and demonstrate that when +transferred to unseen models, scores can be drastically inflated such that +irrespective of the assessed text, maximum scores are predicted. It is found +that judge-LLMs are significantly more susceptible to these adversarial attacks +when used for absolute scoring, as opposed to comparative assessment. Our +findings raise concerns on the reliability of LLM-as-a-judge methods, and +emphasize the importance of addressing vulnerabilities in LLM assessment +methods before deployment in high-stakes real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ In-Context Learning State Vector with Inner and Momentum Optimization + + +
+ Large Language Models (LLMs) have exhibited an impressive ability to perform
+In-Context Learning (ICL) from only a few examples. Recent works have indicated
+that the functions learned by ICL can be represented through compressed vectors
+derived from the transformer. However, the working mechanisms and optimization
+of these vectors are yet to be thoroughly explored. In this paper, we address
+this gap by presenting a comprehensive analysis of these compressed vectors,
+drawing parallels to the parameters trained with gradient descent, and
+introduce the concept of the state vector. Inspired by the works on model soup and
+momentum-based gradient descent, we propose inner and momentum optimization
+methods that are applied to refine the state vector progressively as test-time
+adaptation. Moreover, we simulate state vector aggregation in the multiple
+example setting, where demonstrations comprising numerous examples are usually
+too lengthy for regular ICL, and further propose a divide-and-conquer
+aggregation method to address this challenge. We conduct extensive experiments
+using Llama-2 and GPT-J in both zero-shot and few-shot settings. The
+experimental results show that our optimization method effectively enhances the
+state vector and achieves state-of-the-art performance on diverse tasks.
+Code is available at https://github.com/HITsz-TMG/ICL-State-Vector
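+ One way to read the momentum-optimization idea (a sketch under our own
+assumptions, with plain NumPy stand-ins; the extraction of the state vector
+from transformer activations is not shown):
+
+    import numpy as np
+
+    def momentum_refine(state_vec, updates, lr=0.1, mu=0.9):
+        """Progressively refine a compressed ICL state vector at test time,
+        treating successive update directions like momentum-based gradient steps."""
+        velocity = np.zeros_like(state_vec)
+        for delta in updates:          # e.g. shifts induced by extra demonstrations
+            velocity = mu * velocity + delta
+            state_vec = state_vec + lr * velocity
+        return state_vec
+
+    def aggregate(chunk_vectors):
+        """Divide-and-conquer aggregation: average state vectors computed on
+        separate chunks of an over-long demonstration set."""
+        return np.mean(np.stack(chunk_vectors, axis=0), axis=0)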
+
+ comment: 17 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Aligning Neural Machine Translation Models: Human Feedback in Training + and Inference + + +
+ Reinforcement learning from human feedback (RLHF) is a recent technique to +improve the quality of the text generated by a language model, making it closer +to what humans would generate. A core ingredient in RLHF's success in aligning +and improving large language models (LLMs) is its reward model, trained using +human feedback on model outputs. In machine translation (MT), where metrics +trained from human annotations can readily be used as reward models, recent +methods using minimum Bayes risk decoding and reranking have succeeded in +improving the final quality of translation. In this study, we comprehensively +explore and compare techniques for integrating quality metrics as reward models +into the MT pipeline. This includes using the reward model for data filtering, +during the training phase through RL, and at inference time by employing +reranking techniques, and we assess the effects of combining these in a unified +approach. Our experimental results, conducted across multiple translation +tasks, underscore the crucial role of effective data filtering, based on +estimated quality, in harnessing the full potential of RL in enhancing MT +quality. Furthermore, our findings demonstrate the effectiveness of combining +RL training with reranking techniques, showcasing substantial improvements in +translation quality. + +
+
+ comment: EAMT 2024 +
+
+
+
+
+ + ♻ ☆ Using LLMs for the Extraction and Normalization of Product Attribute + Values + + +
+ Product offers on e-commerce websites often consist of a product title and a +textual product description. In order to enable features such as faceted +product search or to generate product comparison tables, it is necessary to +extract structured attribute-value pairs from the unstructured product titles +and descriptions and to normalize the extracted values to a single, unified +scale for each attribute. This paper explores the potential of using large +language models (LLMs), such as GPT-3.5 and GPT-4, to extract and normalize +attribute values from product titles and descriptions. We experiment with +different zero-shot and few-shot prompt templates for instructing LLMs to +extract and normalize attribute-value pairs. We introduce the Web Data Commons +- Product Attribute Value Extraction (WDC-PAVE) benchmark dataset for our +experiments. WDC-PAVE consists of product offers from 59 different websites +which provide schema.org annotations. The offers belong to five different +product categories, each with a specific set of attributes. The dataset +provides manually verified attribute-value pairs in two forms: (i) directly +extracted values and (ii) normalized attribute values. The normalization of the +attribute values requires systems to perform the following types of operations: +name expansion, generalization, unit of measurement conversion, and string +wrangling. Our experiments demonstrate that GPT-4 outperforms the PLM-based +extraction methods SU-OpenTag, AVEQA, and MAVEQA by 10%, achieving an F1-score +of 91%. For the extraction and normalization of product attribute values, GPT-4 +achieves a similar performance to the extraction scenario, while being +particularly strong at string wrangling and name expansion. + +
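+ A hypothetical zero-shot prompt template in the spirit of the setup above
+(the exact WDC-PAVE prompts and target schema are assumptions, not taken from
+the paper):
+
+    def build_extraction_prompt(title, description, attributes):
+        schema = ", ".join(attributes)
+        return (
+            "Extract the following attributes from the product offer and "
+            "normalize units where applicable. Return JSON with exactly these "
+            f"keys: {schema}. Use null for missing values.\n\n"
+            f"Title: {title}\nDescription: {description}"
+        )
+
+    print(build_extraction_prompt(
+        title="ThinkPad X1 Carbon Gen 9, 16 GB",
+        description="14 inch laptop, 1.13 kg, 512GB SSD",
+        attributes=["brand", "ram_gb", "storage_gb", "weight_kg"],
+    ))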
+
+
+
+
+ + ♻ ☆ CQIL: Inference Latency Optimization with Concurrent Computation of + Quasi-Independent Layers ACL 2024 + + +
+ Fast-growing large-scale language models are delivering unprecedented
+performance on almost all natural language processing tasks. However, the
+effectiveness of large language models is reliant on an exponentially
+increasing number of parameters. The overwhelming computational complexity incurs
+a high inference latency that negatively affects user experience. Existing
+methods to improve inference efficiency, such as tensor parallelism and
+quantization, aim to reduce per-layer computing latency, yet overlook the
+cumulative latency due to the number of layers. Recent works on reducing the
+cumulative latency through layer removal, however, lead to a significant
+performance drop. Motivated by the similarity of inputs among adjacent layers,
+we propose to identify quasi-independent layers, which can be concurrently
+computed to significantly decrease inference latency. We also introduce a
+bypassing technique to mitigate the effect of information loss. Empirical
+experiments of the proposed approach on the LLaMA models confirm that
+Concurrent Computation of Quasi-Independent Layers (CQIL) can reduce latency by
+up to 48.3% on LLaMA-33B, while maintaining a close level of performance.
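+ A toy illustration of the core idea (not the released implementation): when
+layers i and i+1 are quasi-independent, both can be fed the same input and
+their residual contributions summed, so the two blocks can be evaluated
+concurrently rather than sequentially.
+
+    import torch
+    import torch.nn as nn
+
+    layer_i, layer_j = nn.Linear(16, 16), nn.Linear(16, 16)  # stand-ins for transformer blocks
+    x = torch.randn(2, 16)
+
+    seq = x + layer_j(x + layer_i(x))   # standard sequential residual stack
+    par = x + layer_i(x) + layer_j(x)   # quasi-independent: both see x, can run in parallel
+
+    # The gap is small exactly when the input to layer_j barely changes across layers.
+    print((seq - par).abs().max())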
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ When Good and Reproducible Results are a Giant with Feet of Clay: The + Importance of Software Quality in NLP ACL 2024 + + +
+ Despite its crucial role in research experiments, code correctness is often +presumed only on the basis of the perceived quality of results. This assumption +comes with the risk of erroneous outcomes and potentially misleading findings. +To address this issue, we posit that the current focus on reproducibility +should go hand in hand with the emphasis on software quality. We present a case +study in which we identify and fix three bugs in widely used implementations of +the state-of-the-art Conformer architecture. Through experiments on speech +recognition and translation in various languages, we demonstrate that the +presence of bugs does not prevent the achievement of good and reproducible +results, which however can lead to incorrect conclusions that potentially +misguide future research. As a countermeasure, we propose a Code-quality +Checklist and release pangoliNN, a library dedicated to testing neural models, +with the goal of promoting coding best practices and improving research +software quality within the NLP community. + +
+
+ comment: Accepted at ACL 2024 main conference +
+
+
+
+
+ + ♻ ☆ RareBench: Can LLMs Serve as Rare Diseases Specialists? KDD2024 + + +
+ Generalist Large Language Models (LLMs), such as GPT-4, have shown +considerable promise in various domains, including medical diagnosis. Rare +diseases, affecting approximately 300 million people worldwide, often have +unsatisfactory clinical diagnosis rates primarily due to a lack of experienced +physicians and the complexity of differentiating among many rare diseases. In +this context, recent news such as "ChatGPT correctly diagnosed a 4-year-old's +rare disease after 17 doctors failed" underscore LLMs' potential, yet +underexplored, role in clinically diagnosing rare diseases. To bridge this +research gap, we introduce RareBench, a pioneering benchmark designed to +systematically evaluate the capabilities of LLMs on 4 critical dimensions +within the realm of rare diseases. Meanwhile, we have compiled the largest +open-source dataset on rare disease patients, establishing a benchmark for +future studies in this domain. To facilitate differential diagnosis of rare +diseases, we develop a dynamic few-shot prompt methodology, leveraging a +comprehensive rare disease knowledge graph synthesized from multiple knowledge +bases, significantly enhancing LLMs' diagnostic performance. Moreover, we +present an exhaustive comparative study of GPT-4's diagnostic capabilities +against those of specialist physicians. Our experimental findings underscore +the promising potential of integrating LLMs into the clinical diagnostic +process for rare diseases. This paves the way for exciting possibilities in +future advancements in this field. + +
+
+ comment: KDD2024 +
+
+
+
+
+ + ♻ ☆ Is Your AI-Generated Code Really Safe? Evaluating Large Language Models + on Secure Code Generation with CodeSecEval + + +
+ Large language models (LLMs) have brought significant advancements to code
+generation and code repair, benefiting both novice and experienced developers.
+However, their training using unsanitized data from open-source repositories,
+like GitHub, raises the risk of inadvertently propagating security
+vulnerabilities. Despite numerous studies investigating the safety of code
+LLMs, there remains a gap in comprehensively addressing their security
+features. In this work, we present a comprehensive study aimed at
+precisely evaluating and enhancing the security aspects of code LLMs. To
+support our research, we introduce CodeSecEval, a meticulously curated dataset
+designed to address 44 critical vulnerability types with 180 distinct samples.
+CodeSecEval serves as the foundation for the automatic evaluation of code
+models in two crucial tasks: code generation and code repair, with a strong
+emphasis on security. Our experimental results reveal that current models
+frequently overlook security issues during both code generation and repair
+processes, resulting in the creation of vulnerable code. In response, we
+propose different strategies that leverage vulnerability-aware information and
+insecure code explanations to mitigate these security vulnerabilities.
+Furthermore, our findings highlight that certain vulnerability types
+particularly challenge model performance, influencing their effectiveness in
+real-world applications. Based on these findings, we believe our study will
+have a positive impact on the software engineering community, inspiring the
+development of improved methods for training and utilizing LLMs, thereby
+leading to safer and more trustworthy model deployment.
+
+ comment: arXiv admin note: text overlap with arXiv:2310.16263 +
+
+
+
+
+ + ♻ ☆ Attribute First, then Generate: Locally-attributable Grounded Text + Generation ACL 2024 + + +
+ Recent efforts to address hallucinations in Large Language Models (LLMs) have +focused on attributed text generation, which supplements generated texts with +citations of supporting sources for post-generation fact-checking and +corrections. Yet, these citations often point to entire documents or +paragraphs, burdening users with extensive verification work. In this paper, we +introduce a locally-attributable text generation approach, prioritizing concise +attributions. Our method, named "Attribute First, then Generate", breaks down +the conventional end-to-end generation process into three intuitive steps: +content selection, sentence planning, and sequential sentence generation. By +initially identifying relevant source segments ("select first") and then +conditioning the generation process on them ("then generate"), we ensure these +segments also act as the output's fine-grained attributions ("select" becomes +"attribute"). Tested on Multi-document Summarization and Long-form +Question-answering, our method not only yields more concise citations than the +baselines but also maintains - and in some cases enhances - both generation +quality and attribution accuracy. Furthermore, it significantly reduces the +time required for fact verification by human assessors. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ KG-Rank: Enhancing Large Language Models for Medical QA with Knowledge + Graphs and Ranking Techniques + + +
+ Large language models (LLMs) have demonstrated impressive generative +capabilities with the potential to innovate in medicine. However, the +application of LLMs in real clinical settings remains challenging due to the +lack of factual consistency in the generated content. In this work, we develop +an augmented LLM framework, KG-Rank, which leverages a medical knowledge graph +(KG) along with ranking and re-ranking techniques, to improve the factuality of +long-form question answering (QA) in the medical domain. Specifically, when +receiving a question, KG-Rank automatically identifies medical entities within +the question and retrieves the related triples from the medical KG to gather +factual information. Subsequently, KG-Rank innovatively applies multiple +ranking techniques to refine the ordering of these triples, providing more +relevant and precise information for LLM inference. To the best of our +knowledge, KG-Rank is the first application of KG combined with ranking models +in medical QA specifically for generating long answers. Evaluation on four +selected medical QA datasets demonstrates that KG-Rank achieves an improvement +of over 18% in ROUGE-L score. Additionally, we extend KG-Rank to open domains, +including law, business, music, and history, where it realizes a 14% +improvement in ROUGE-L score, indicating the effectiveness and great potential +of KG-Rank. + +
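+ A minimal sketch of the retrieve-then-rank step (hypothetical embedding
+function and triple store; KG-Rank's specific ranking and re-ranking techniques
+are not reproduced here):
+
+    import numpy as np
+
+    def cosine(a, b):
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+    def rank_triples(question_vec, triples, embed, top_k=5):
+        """Rank KG triples (head, relation, tail) by similarity to the question;
+        the top-k triples are then serialized into the LLM prompt as context."""
+        scored = [(cosine(question_vec, embed(t)), t) for t in triples]
+        return [t for _, t in sorted(scored, key=lambda s: s[0], reverse=True)[:top_k]]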
+
+ comment: 12 pages, 9 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Filtered Direct Preference Optimization + + +
+ Reinforcement learning from human feedback (RLHF) plays a crucial role in +aligning language models with human preferences. While the significance of +dataset quality is generally recognized, explicit investigations into its +impact within the RLHF framework, to our knowledge, have been limited. This +paper addresses the issue of text quality within the preference dataset by +focusing on direct preference optimization (DPO), an increasingly adopted +reward-model-free RLHF method. We confirm that text quality significantly +influences the performance of models optimized with DPO more than those +optimized with reward-model-based RLHF. Building on this new insight, we +propose an extension of DPO, termed filtered direct preference optimization +(fDPO). fDPO uses a trained reward model to monitor the quality of texts within +the preference dataset during DPO training. Samples of lower quality are +discarded based on comparisons with texts generated by the model being +optimized, resulting in a more accurate dataset. Experimental results +demonstrate that fDPO enhances the final model performance. Our code is +available at https://github.com/CyberAgentAILab/filtered-dpo. + +
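+ A rough sketch of the filtering rule as described above (hypothetical helper
+names, not the released code): a preference pair is discarded when the trained
+reward model scores its chosen response no better than a fresh sample from the
+policy currently being optimized.
+
+    def filter_preference_pairs(pairs, reward_model, policy_sample):
+        """pairs: iterable of (prompt, chosen, rejected). Keep only pairs whose
+        chosen text still beats the current policy's own generation."""
+        kept = []
+        for prompt, chosen, rejected in pairs:
+            generated = policy_sample(prompt)
+            if reward_model(prompt, chosen) >= reward_model(prompt, generated):
+                kept.append((prompt, chosen, rejected))
+        return kept  # DPO then trains on this higher-quality subset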
+
+
+
+
+ + ♻ ☆ Simul-LLM: A Framework for Exploring High-Quality Simultaneous + Translation with Large Language Models ACL 2024 + + +
+ Large language models (LLMs) with billions of parameters and pretrained on +massive amounts of data are now capable of near or better than state-of-the-art +performance in a variety of downstream natural language processing tasks. +Neural machine translation (NMT) is one such task that LLMs have been applied +to with great success. However, little research has focused on applying LLMs to +the more difficult subset of NMT called simultaneous translation (SimulMT), +where translation begins before the entire source context is available to the +model. In this paper, we address key challenges facing LLMs fine-tuned for +SimulMT, validate classical SimulMT concepts and practices in the context of +LLMs, explore adapting LLMs that are fine-tuned for NMT to the task of SimulMT, +and introduce Simul-LLM, the first open-source fine-tuning and evaluation +pipeline development framework for LLMs focused on SimulMT. + +
+
+ comment: ACL 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 27 + +
+
+
+ + ☆ QueryMamba: A Mamba-Based Encoder-Decoder Architecture with a + Statistical Verb-Noun Interaction Module for Video Action Forecasting @ Ego4D + Long-Term Action Anticipation Challenge 2024 + + +
+ This report presents a novel Mamba-based encoder-decoder architecture, +QueryMamba, featuring an integrated verb-noun interaction module that utilizes +a statistical verb-noun co-occurrence matrix to enhance video action +forecasting. This architecture not only predicts verbs and nouns likely to +occur based on historical data but also considers their joint occurrence to +improve forecast accuracy. The efficacy of this approach is substantiated by +experimental results, with the method achieving second place in the Ego4D LTA +challenge and ranking first in noun prediction accuracy. + +
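+ The statistical verb-noun interaction module can be pictured as re-weighting
+noun scores with a co-occurrence prior; a small sketch with made-up counts
+(the actual matrix would be built from the training annotations):
+
+    import numpy as np
+
+    # counts[v, n]: how often verb v co-occurs with noun n in the training data
+    counts = np.array([[30.0, 5.0, 1.0],
+                       [ 2.0, 20.0, 8.0]])
+    p_noun_given_verb = counts / counts.sum(axis=1, keepdims=True)
+
+    noun_logits = np.array([0.2, 1.0, 0.6])   # raw decoder scores for nouns
+    verb_pred = 0                             # index of the predicted verb
+    adjusted = noun_logits + np.log(p_noun_given_verb[verb_pred] + 1e-9)
+    print(adjusted.argmax())                  # noun choice now reflects joint statistics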
+
+
+
+
+ + ☆ Slice-100K: A Multimodal Dataset for Extrusion-based 3D Printing + + +
+ G-code (Geometric code) or RS-274 is the most widely used computer numerical +control (CNC) and 3D printing programming language. G-code provides machine +instructions for the movement of the 3D printer, especially for the nozzle, +stage, and extrusion of material for extrusion-based additive manufacturing. +Currently there does not exist a large repository of curated CAD models along +with their corresponding G-code files for additive manufacturing. To address +this issue, we present SLICE-100K, a first-of-its-kind dataset of over 100,000 +G-code files, along with their tessellated CAD model, LVIS (Large Vocabulary +Instance Segmentation) categories, geometric properties, and renderings. We +build our dataset from triangulated meshes derived from Objaverse-XL and +Thingi10K datasets. We demonstrate the utility of this dataset by finetuning +GPT-2 on a subset of the dataset for G-code translation from a legacy G-code +format (Sailfish) to a more modern, widely used format (Marlin). SLICE-100K +will be the first step in developing a multimodal foundation model for digital +manufacturing. + +
+
+
+
+
+ + ☆ Attention Normalization Impacts Cardinality Generalization in Slot + Attention + + +
+ Object-centric scene decompositions are important representations for +downstream tasks in fields such as computer vision and robotics. The recently +proposed Slot Attention module, already leveraged by several derivative works +for image segmentation and object tracking in videos, is a deep learning +component which performs unsupervised object-centric scene decomposition on +input images. It is based on an attention architecture, in which latent slot +vectors, which hold compressed information on objects, attend to localized +perceptual features from the input image. In this paper, we show that design +decisions on normalizing the aggregated values in the attention architecture +have considerable impact on the capabilities of Slot Attention to generalize to +a higher number of slots and objects as seen during training. We argue that the +original Slot Attention normalization scheme discards information on the prior +assignment probability of pixels to slots, which impairs its generalization +capabilities. Based on these findings, we propose and investigate alternative +normalization approaches which increase the generalization capabilities of Slot +Attention to varying slot and object counts, resulting in performance gains on +the task of unsupervised image segmentation. + +
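+ For reference, a condensed single attention step whose normalization is being
+discussed (a sketch following the original Slot Attention update; the
+alternative normalization schemes proposed in the paper are not reproduced
+here):
+
+    import torch
+
+    def slot_attention_step(slots, inputs, eps=1e-8):
+        # slots: (S, D), inputs: (N, D)
+        logits = inputs @ slots.t() / slots.shape[-1] ** 0.5    # (N, S)
+        attn = logits.softmax(dim=1)                            # slots compete per input location
+        # Original scheme: renormalize per slot (weighted mean over inputs).
+        # This rescaling discards how much total assignment mass each slot
+        # attracted, which the paper links to weaker generalization to more
+        # slots and objects than seen in training.
+        weights = attn / (attn.sum(dim=0, keepdim=True) + eps)  # (N, S)
+        return weights.t() @ inputs                             # (S, D) slot updates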
+
+ comment: 24 pages, 10 figures, 5 tables +
+
+
+
+
+ + ☆ Solutions to Deepfakes: Can Camera Hardware, Cryptography, and Deep + Learning Verify Real Images? + + +
+ The exponential progress in generative AI poses serious implications for the
+credibility of all real images and videos. There will exist a point in the
+future where 1) digital content produced by generative AI will be
+indistinguishable from that created by cameras, 2) high-quality generative
+algorithms will be accessible to anyone, and 3) the ratio of all synthetic to
+real images will be large. It is imperative to establish methods that can
+separate real data from synthetic data with high confidence. We define real
+images as those that were produced by the camera hardware, capturing a
+real-world scene. Any synthetic generation of an image or alteration of a real
+image through generative AI or computer graphics techniques is labeled as a
+synthetic image. To this end, this document aims to: present known strategies
+in detection and cryptography that can be employed to verify which images are
+real, weigh the strengths and weaknesses of these strategies, and suggest
+additional improvements to alleviate shortcomings.
+
+
+
+
+ + ☆ SineKAN: Kolmogorov-Arnold Networks Using Sinusoidal Activation + Functions + + +
+ Recent work has established an alternative to traditional multi-layer +perceptron neural networks in the form of Kolmogorov-Arnold Networks (KAN). The +general KAN framework uses learnable activation functions on the edges of the +computational graph followed by summation on nodes. The learnable edge +activation functions in the original implementation are basis spline functions +(B-Spline). Here, we present a model in which learnable grids of B-Spline +activation functions can be replaced by grids of re-weighted sine functions. We +show that this leads to better or comparable numerical performance to B-Spline +KAN models on the MNIST benchmark, while also providing a substantial speed +increase on the order of 4-9 times. + +
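+ A minimal sine-activation edge, as we read the replacement of B-spline grids
+(layer sizes and initialization here are illustrative assumptions, not the
+reference implementation):
+
+    import torch
+    import torch.nn as nn
+
+    class SineEdge(nn.Module):
+        """One KAN edge: a re-weighted grid of sine basis functions in place of B-splines."""
+        def __init__(self, grid_size=8):
+            super().__init__()
+            self.freqs = nn.Parameter(torch.arange(1, grid_size + 1, dtype=torch.float32))
+            self.phases = nn.Parameter(torch.zeros(grid_size))
+            self.weights = nn.Parameter(torch.randn(grid_size) / grid_size ** 0.5)
+
+        def forward(self, x):  # x: (...,) scalar feature entering this edge
+            return (self.weights * torch.sin(self.freqs * x.unsqueeze(-1) + self.phases)).sum(-1)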
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Biometric Authentication Based on Enhanced Remote Photoplethysmography + Signal Morphology + + +
+ Remote photoplethysmography (rPPG) is a non-contact method for measuring +cardiac signals from facial videos, offering a convenient alternative to +contact photoplethysmography (cPPG) obtained from contact sensors. Recent +studies have shown that each individual possesses a unique cPPG signal +morphology that can be utilized as a biometric identifier, which has inspired +us to utilize the morphology of rPPG signals extracted from facial videos for +person authentication. Since the facial appearance and rPPG are mixed in the +facial videos, we first de-identify facial videos to remove facial appearance +while preserving the rPPG information, which protects facial privacy and +guarantees that only rPPG is used for authentication. The de-identified videos +are fed into an rPPG model to get the rPPG signal morphology for +authentication. In the first training stage, unsupervised rPPG training is +performed to get coarse rPPG signals. In the second training stage, an +rPPG-cPPG hybrid training is performed by incorporating external cPPG datasets +to achieve rPPG biometric authentication and enhance rPPG signal morphology. +Our approach needs only de-identified facial videos with subject IDs to train +rPPG authentication models. The experimental results demonstrate that rPPG +signal morphology hidden in facial videos can be used for biometric +authentication. The code is available at +https://github.com/zhaodongsun/rppg_biometrics. + +
+
+ comment: accepted by IJCB 2024 +
+
+
+
+
+ + ☆ An Autoencoder Architecture for L-band Passive Microwave Retrieval of + Landscape Freeze-Thaw Cycle + + +
+ Estimating the landscape and soil freeze-thaw (FT) dynamics in the Northern +Hemisphere is crucial for understanding permafrost response to global warming +and changes in regional and global carbon budgets. A new framework is presented +for surface FT-cycle retrievals using L-band microwave radiometry based on a +deep convolutional autoencoder neural network. This framework defines the +landscape FT-cycle retrieval as a time series anomaly detection problem +considering the frozen states as normal and thawed states as anomalies. The +autoencoder retrieves the FT-cycle probabilistically through supervised +reconstruction of the brightness temperature (TB) time series using a +contrastive loss function that minimizes (maximizes) the reconstruction error +for the peak winter (summer). Using the data provided by the Soil Moisture +Active Passive (SMAP) satellite, it is demonstrated that the framework learns +to isolate the landscape FT states over different land surface types with +varying complexities related to the radiometric characteristics of snow cover, +lake-ice phenology, and vegetation canopy. The consistency of the retrievals is +evaluated over Alaska, against in situ ground-based observations, showing +reduced uncertainties compared to the traditional methods that use thresholding +of the normalized polarization ratio. + +
+
+
+
+
+ + ☆ MiniGPT-Med: Large Language Model as a General Interface for Radiology + Diagnosis + + +
+ Recent advancements in artificial intelligence (AI) have precipitated +significant breakthroughs in healthcare, particularly in refining diagnostic +procedures. However, previous studies have often been constrained to limited +functionalities. This study introduces MiniGPT-Med, a vision-language model +derived from large-scale language models and tailored for medical applications. +MiniGPT-Med demonstrates remarkable versatility across various imaging +modalities, including X-rays, CT scans, and MRIs, enhancing its utility. The +model is capable of performing tasks such as medical report generation, visual +question answering (VQA), and disease identification within medical imagery. +Its integrated processing of both image and textual clinical data markedly +improves diagnostic accuracy. Our empirical assessments confirm MiniGPT-Med's +superior performance in disease grounding, medical report generation, and VQA +benchmarks, representing a significant step towards reducing the gap in +assisting radiology practice. Furthermore, it achieves state-of-the-art +performance on medical report generation, higher than the previous best model +by 19\% accuracy. MiniGPT-Med promises to become a general interface for +radiology diagnoses, enhancing diagnostic efficiency across a wide range of +medical imaging applications. + +
+
+
+
+
+ + ☆ Advances in Diffusion Models for Image Data Augmentation: A Review of + Methods, Models, Evaluation Metrics and Future Research Directions + + +
+ Image data augmentation constitutes a critical methodology in modern computer
+vision tasks, since it can enhance the diversity and
+quality of training datasets, thereby improving the performance and robustness
+of machine learning models in downstream tasks. In parallel, augmentation
+approaches can also be used for editing/modifying a given image in a context-
+and semantics-aware way. Diffusion Models (DMs), which comprise one of the most
+recent and highly promising classes of methods in the field of generative
+Artificial Intelligence (AI), have emerged as a powerful tool for image data
+augmentation, capable of generating realistic and diverse images by learning
+the underlying data distribution. The current study realizes a systematic,
+comprehensive and in-depth review of DM-based approaches for image
+augmentation, covering a wide range of strategies, tasks and applications. In
+particular, a comprehensive analysis of the fundamental principles, model
+architectures and training strategies of DMs is initially performed.
+Subsequently, a taxonomy of the relevant image augmentation methods is
+introduced, focusing on techniques regarding semantic manipulation,
+personalization and adaptation, and application-specific augmentation tasks.
+Then, performance assessment methodologies and respective evaluation metrics
+are analyzed. Finally, current challenges and future research directions in the
+field are discussed.
+
+ comment: 53 pages, 15 figures +
+
+
+
+
+ + ☆ C$^3$DG: Conditional Domain Generalization for Hyperspectral Imagery + Classification with Convergence and Constrained-risk Theories + + +
+ Hyperspectral imagery (HSI) classification may suffer the challenge of +hyperspectral-monospectra, where different classes present similar spectra. +Joint spatial-spectral feature extraction is a popular solution for the +problem, but this strategy tends to inflate accuracy since test pixels may +exist in training patches. Domain generalization methods show promising +potential, but they still fail to distinguish similar spectra across varying +domains, in addition, the theoretical support is usually ignored. In this +paper, we only rely on spectral information to solve the +hyperspectral-monospectra problem, and propose a Convergence and +Error-Constrained Conditional Domain Generalization method for Hyperspectral +Imagery Classification (C$^3$DG). The major contributions of this paper include +two aspects: the Conditional Revising Inference Block (CRIB), and the +corresponding theories for model convergence and generalization errors. CRIB is +the kernel structure of the proposed method, which employs a shared encoder and +multi-branch decoders to fully leverage the conditional distribution during +training, achieving a decoupling that aligns with the generation mechanisms of +HSI. Moreover, to ensure model convergence and maintain controllable error, we +propose the optimization convergence theorem and risk upper bound theorem. In +the optimization convergence theorem, we ensure the model convergence by +demonstrating that the gradients of the loss terms are not contradictory. In +the risk upper bound theorem, our theoretical analysis explores the +relationship between test-time training and recent related work to establish a +concrete bound for error. Experimental results on three benchmark datasets +indicate the superiority of C$^3$DG. + +
+
+
+
+
+ + ☆ Looking for Tiny Defects via Forward-Backward Feature Transfer + + +
+ Motivated by efficiency requirements, most anomaly detection and segmentation +(AD&S) methods focus on processing low-resolution images, e.g., $224\times 224$ +pixels, obtained by downsampling the original input images. In this setting, +downsampling is typically applied also to the provided ground-truth defect +masks. Yet, as numerous industrial applications demand identification of both +large and tiny defects, the above-described protocol may fall short in +providing a realistic picture of the actual performance attainable by current +methods. Hence, in this work, we introduce a novel benchmark that evaluates +methods on the original, high-resolution image and ground-truth masks, focusing +on segmentation performance as a function of the size of anomalies. Our +benchmark includes a metric that captures robustness with respect to defect +size, i.e., the ability of a method to preserve good localization from large +anomalies to tiny ones. Furthermore, we introduce an AD&S approach based on a +novel Teacher-Student paradigm which relies on two shallow MLPs (the Students) +that learn to transfer patch features across the layers of a frozen vision +transformer (the Teacher). By means of our benchmark, we evaluate our proposal +and other recent AD&S methods on high-resolution inputs containing large and +tiny defects. Our proposal features the highest robustness to defect size, runs +at the fastest speed, yields state-of-the-art performance on the MVTec AD +dataset and state-of-the-art segmentation performance on the VisA dataset. + +
+
+
+
+
+ + ☆ Certifiably Robust Image Watermark ECCV 2024 + + +
+ Generative AI raises many societal concerns such as boosting disinformation +and propaganda campaigns. Watermarking AI-generated content is a key technology +to address these concerns and has been widely deployed in industry. However, +watermarking is vulnerable to removal attacks and forgery attacks. In this +work, we propose the first image watermarks with certified robustness +guarantees against removal and forgery attacks. Our method leverages randomized +smoothing, a popular technique to build certifiably robust classifiers and +regression models. Our major technical contributions include extending +randomized smoothing to watermarking by considering its unique characteristics, +deriving the certified robustness guarantees, and designing algorithms to +estimate them. Moreover, we extensively evaluate our image watermarks in terms +of both certified and empirical robustness. Our code is available at +\url{https://github.com/zhengyuan-jiang/Watermark-Library}. + +
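+ Randomized smoothing applied to a watermark decoder can be sketched as
+follows (hypothetical bit decoder; the paper's actual certification bounds are
+not computed here):
+
+    import numpy as np
+
+    def smoothed_decode(image, decode_bits, sigma=0.1, n_samples=100, seed=0):
+        """Majority-vote each watermark bit over Gaussian-perturbed copies of the
+        image; the vote margins are what robustness certificates are derived from."""
+        rng = np.random.default_rng(seed)
+        votes = np.zeros_like(decode_bits(image), dtype=float)
+        for _ in range(n_samples):
+            noisy = image + rng.normal(0.0, sigma, size=image.shape)
+            votes += decode_bits(noisy)
+        return (votes / n_samples > 0.5).astype(int)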
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ FIPGNet:Pyramid grafting network with feature interaction strategies + + +
+ Salient object detection aims to identify the objects in an image that
+attract the most visual attention. Currently, the most advanced salient object
+detection methods adopt a pyramid grafting network architecture. However, this
+architecture still has the problem of failing to accurately locate salient
+targets. We observe that this is mainly because current salient object
+detection methods simply aggregate features at different scales, ignoring the
+correlation between features at different scales. To overcome these problems,
+we propose a new salient object detection framework (FIPGNet), a pyramid
+grafting network with feature interaction strategies. Specifically, we propose
+an attention-mechanism-based feature interaction strategy (FIA) that
+innovatively introduces spatial agent Cross Attention (SACA) to achieve
+multi-level feature interaction, highlighting important spatial regions from a
+spatial perspective and thereby enhancing salient regions. We also introduce
+the channel proxy Cross Attention Module (CCM), which effectively connects the
+features extracted by the backbone network with the features processed by the
+spatial agent cross attention module, eliminating inconsistencies. Under the
+action of these two modules, the salient-target localization problem in the
+current pyramid grafting network model is solved. Experimental results on six
+challenging datasets show that the proposed method outperforms 12 current
+salient object detection methods on four metrics.
+
+ comment: arXiv admin note: text overlap with arXiv:2309.08365 by other authors +
+
+
+
+
+ + ☆ CLIP-DR: Textual Knowledge-Guided Diabetic Retinopathy Grading with + Ranking-aware Prompting MICCAI 2024 + + +
+ Diabetic retinopathy (DR) is a complication of diabetes and usually takes
+decades to reach sight-threatening levels. Accurate and robust detection of DR
+severity is critical for the timely management and treatment of diabetes.
+However, most current DR grading methods suffer from insufficient robustness to
+data variability (\textit{e.g.} colour fundus images), posing a significant
+difficulty for accurate and robust grading. In this work, we propose a novel DR
+grading framework CLIP-DR based on three observations: 1) Recent pre-trained
+visual language models, such as CLIP, showcase a notable capacity for
+generalisation across various downstream tasks, serving as effective baseline
+models. 2) The grading of image-text pairs for DR often adheres to a
+discernible natural sequence, yet most existing DR grading methods have
+primarily overlooked this aspect. 3) A long-tailed distribution among DR
+severity levels complicates the grading process. This work proposes a novel
+ranking-aware prompting strategy to help the CLIP model exploit the ordinal
+information. Specifically, we sequentially design learnable prompts between
+neighbouring text-image pairs in two different ranking directions.
+Additionally, we introduce a Similarity Matrix Smooth module into the structure
+of CLIP to balance the class distribution. Finally, we perform extensive
+comparisons with several state-of-the-art methods on the GDRBench benchmark,
+demonstrating our CLIP-DR's robustness and superior performance. The
+implementation code is
+available\footnote{\url{https://github.com/Qinkaiyu/CLIP-DR}}.
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ EMPL: A novel Efficient Meta Prompt Learning Framework for Few-shot + Unsupervised Domain Adaptation + + +
+ Few-shot unsupervised domain adaptation (FS-UDA) utilizes few-shot labeled
+source domain data to realize effective classification in an unlabeled target
+domain. However, current FS-UDA methods still suffer from two issues: 1) the
+data from different domains cannot be effectively aligned by few-shot labeled
+data due to the large domain gaps, and 2) it is unstable and time-consuming to
+generalize to new FS-UDA tasks. To address these issues, we put forward a novel
+Efficient Meta Prompt Learning Framework for FS-UDA. Within this framework, we
+use the pre-trained CLIP model as the feature learning base model. First, we design
+domain-shared prompt learning vectors composed of virtual tokens, which mainly
+learn meta knowledge from a large number of meta tasks to mitigate domain
+gaps. Secondly, we design a task-shared prompt learning network to
+adaptively learn specific prompt vectors for each task, which aims to realize
+fast adaptation and task generalization. Thirdly, we learn a task-specific
+cross-domain alignment projection and a task-specific classifier with
+closed-form solutions for each meta task, which can efficiently adapt the model
+to new tasks in one step. The whole learning process is formulated as a bilevel
+optimization problem, and a good initialization of model parameters is learned
+through meta-learning. An extensive experimental study demonstrates the promising
+performance of our framework on benchmark datasets. Our method achieves an
+improvement of at least 15.4% on 5-way 1-shot and 8.7% on 5-way 5-shot tasks,
+compared with the state-of-the-art methods. Also, the performance of our method
+on all the test tasks is more stable than that of the other methods.
+
+
+
+
+ + ☆ Detect Closer Surfaces that can be Seen: New Modeling and Evaluation in + Cross-domain 3D Object Detection ECAI 2024 + + +
+ The performance of domain adaptation technologies has not yet reached an +ideal level in the current 3D object detection field for autonomous driving, +which is mainly due to significant differences in the size of vehicles, as well +as the environments they operate in when applied across domains. These factors +together hinder the effective transfer and application of knowledge learned +from specific datasets. Since the existing evaluation metrics are initially +designed for evaluation on a single domain by calculating the 2D or 3D overlap +between the prediction and ground-truth bounding boxes, they often suffer from +the overfitting problem caused by the size differences among datasets. This +raises a fundamental question related to the evaluation of the 3D object +detection models' cross-domain performance: Do we really need models to +maintain excellent performance in their original 3D bounding boxes after being +applied across domains? From a practical application perspective, one of our +main focuses is actually on preventing collisions between vehicles and other +obstacles, especially in cross-domain scenarios where correctly predicting the +size of vehicles is much more difficult. In other words, as long as a model can +accurately identify the closest surfaces to the ego vehicle, it is sufficient +to effectively avoid obstacles. In this paper, we propose two metrics to +measure 3D object detection models' ability of detecting the closer surfaces to +the sensor on the ego vehicle, which can be used to evaluate their cross-domain +performance more comprehensively and reasonably. Furthermore, we propose a +refinement head, named EdgeHead, to guide models to focus more on the learnable +closer surfaces, which can greatly improve the cross-domain performance of +existing models not only under our new metrics, but even also under the +original BEV/3D metrics. + +
+
+ comment: Accepted by the 27th European Conference on Artificial Intelligence + (ECAI 2024) +
+
+
+
+
+ + ☆ Occupancy as Set of Points ECCV 2024 + + +
+ In this paper, we explore a novel point representation for 3D occupancy +prediction from multi-view images, which is named Occupancy as Set of Points. +Existing camera-based methods tend to exploit dense volume-based representation +to predict the occupancy of the whole scene, making it hard to focus on the +special areas or areas out of the perception range. In comparison, we present +the Points of Interest (PoIs) to represent the scene and propose OSP, a novel +framework for point-based 3D occupancy prediction. Owing to the inherent +flexibility of the point-based representation, OSP achieves strong performance +compared with existing methods and excels in terms of training and inference +adaptability. It extends beyond traditional perception boundaries and can be +seamlessly integrated with volume-based methods to significantly enhance their +effectiveness. Experiments on the Occ3D nuScenes occupancy benchmark show that +OSP has strong performance and flexibility. Code and models are available at +\url{https://github.com/hustvl/osp}. + +
+
+ comment: Accepted by ECCV 2024. Code and models: https://github.com/hustvl/osp +
+
+
+
+
+ + ☆ Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation + + +
+ Depth estimation is a cornerstone for autonomous driving, yet acquiring +per-pixel depth ground truth for supervised learning is challenging. +Self-Supervised Surround Depth Estimation (SSSDE) from consecutive images +offers an economical alternative. While previous SSSDE methods have proposed +different mechanisms to fuse information across images, few of them explicitly +consider the cross-view constraints, leading to inferior performance, +particularly in overlapping regions. This paper proposes an efficient and +consistent pose estimation design and two loss functions to enhance cross-view +consistency for SSSDE. For pose estimation, we propose to use only front-view +images to reduce training memory and sustain pose estimation consistency. The +first loss function is the dense depth consistency loss, which penalizes the +difference between predicted depths in overlapping regions. The second one is +the multi-view reconstruction consistency loss, which aims to maintain +consistency between reconstruction from spatial and spatial-temporal contexts. +Additionally, we introduce a novel flipping augmentation to improve the +performance further. Our techniques enable a simple neural model to achieve +state-of-the-art performance on the DDAD and nuScenes datasets. Last but not +least, our proposed techniques can be easily applied to other methods. The code +will be made public. + +
+
+
+
+
+ + ☆ Beyond Pixels: Semi-Supervised Semantic Segmentation with a Multi-scale + Patch-based Multi-Label Classifier ECCV24 + + +
+ Incorporating pixel contextual information is critical for accurate
+segmentation. In this paper, we show that an effective way to incorporate
+contextual information is through a patch-based classifier. This patch
+classifier is trained to identify classes present within an image region, which
+facilitates the elimination of distractors and enhances the classification of
+small object segments. Specifically, we introduce the Multi-scale Patch-based
+Multi-label Classifier (MPMC), a novel plug-in module designed for existing
+semi-supervised segmentation (SSS) frameworks. MPMC offers patch-level
+supervision, enabling the discrimination of pixel regions of different classes
+within a patch. Furthermore, MPMC learns an adaptive pseudo-label weight, using
+patch-level classification to alleviate the impact of the teacher's noisy
+pseudo-label supervision on the student. This lightweight module can be integrated
+into any SSS framework, significantly enhancing their performance. We
+demonstrate the efficacy of our proposed MPMC by integrating it into four SSS
+methodologies across two natural-image datasets and one medical segmentation
+dataset, notably improving the segmentation results of the baselines on all
+three datasets.
+
+ comment: to be published in ECCV24 +
+
+
+
+
+ + ☆ Adaptive Step-size Perception Unfolding Network with Non-local Hybrid + Attention for Hyperspectral Image Reconstruction + + +
+ Deep unfolding methods and transformer architectures have recently shown +promising results in hyperspectral image (HSI) reconstruction. However, two +issues remain: (1) in the data subproblem, most methods represent the step size +with a single learnable parameter, even though the error between features and +ground truth differs across spectral channels; (2) transformers struggle to +balance receptive field size with pixel-wise detail information. To overcome +these drawbacks, we propose an adaptive step-size perception unfolding network +(ASPUN), a deep unfolding network based on the FISTA algorithm, which uses an +adaptive step-size perception module to estimate the update step size of each +spectral channel. In addition, we design a Non-local Hybrid Attention +Transformer (NHAT) module to fully leverage the receptive field advantage of +the transformer. By plugging the NHAT into the Non-local Information +Aggregation (NLIA) module, the unfolding network can achieve better +reconstruction results. Experimental results show that our ASPUN is superior to +existing SOTA algorithms and achieves the best performance. + +
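+ The per-channel adaptive step size can be illustrated with a short sketch (my assumption of the idea, not the authors' implementation): in the unfolded data step, every spectral channel gets its own learnable step size rather than one scalar shared by all channels.
+
+ import torch
+ import torch.nn as nn
+
+ class PerChannelGradientStep(nn.Module):
+     def __init__(self, num_channels):
+         super().__init__()
+         # one learnable step size per spectral channel, broadcast over space
+         self.step = nn.Parameter(torch.ones(1, num_channels, 1, 1))
+
+     def forward(self, x, gradient):
+         # x, gradient: (B, C, H, W); gradient of the data-fidelity term w.r.t. x
+         return x - self.step * gradient
+
+ x = torch.randn(1, 28, 64, 64)               # e.g. 28 spectral channels
+ grad = torch.randn_like(x)
+ print(PerChannelGradientStep(28)(x, grad).shape)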
+
+
+
+
+ + ♻ ☆ CNG-SFDA: Clean-and-Noisy Region Guided Online-Offline Source-Free + Domain Adaptation + + +
+ Domain shift occurs when training (source) and test (target) data diverge in +their distribution. Source-Free Domain Adaptation (SFDA) addresses this domain +shift problem, aiming to adapt a model trained on the source domain to the +target domain in a scenario where only a well-trained source model and +unlabeled target data are available. In this scenario, handling false labels in +the target domain is crucial because they negatively impact the model +performance. To deal with this problem, we propose to update cluster prototypes +(i.e., the centroid of each sample cluster) and their structure in the target +domain formulated by the source model in an online manner. In the feature space, +samples in different regions have different pseudo-label distribution +characteristics affected by the cluster prototypes, and we adopt distinct +training strategies for these samples by defining clean and noisy regions: we +selectively train the target with clean pseudo-labels in the clean region, +whereas we introduce mix-up inputs representing intermediate features between +clean and noisy regions to increase the compactness of the cluster. We +conducted extensive experiments on multiple datasets in online/offline SFDA +settings, whose results demonstrate that our method, CNG-SFDA, achieves +state-of-the-art performance in most cases. + +
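+ A minimal sketch of the online prototype update described above (assumed details: exponential moving average and L2-normalised prototypes; not the paper's exact rule):
+
+ import torch
+ import torch.nn.functional as F
+
+ def update_prototypes(prototypes, features, pseudo_labels, momentum=0.99):
+     # prototypes: (K, D) cluster centroids; features: (B, D); pseudo_labels: (B,)
+     for k in pseudo_labels.unique():
+         batch_mean = features[pseudo_labels == k].mean(dim=0)
+         prototypes[k] = momentum * prototypes[k] + (1 - momentum) * batch_mean
+     return F.normalize(prototypes, dim=1)
+
+ protos = F.normalize(torch.randn(10, 256), dim=1)
+ feats = F.normalize(torch.randn(32, 256), dim=1)
+ labels = torch.randint(0, 10, (32,))
+ print(update_prototypes(protos, feats, labels).shape)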
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ ALOHA: from Attention to Likes -- a unified mOdel for understanding + HumAn responses to diverse visual content + + +
+ Progress in human behavior modeling involves understanding both implicit, +early-stage perceptual behavior such as human attention and explicit, +later-stage behavior such as subjective preferences/likes. Yet, most prior +research has focused on modeling implicit and explicit human behavior in +isolation, and is often limited to a specific type of visual content. Can we build +a unified model of human attention and preference behavior that works reliably +across diverse types of visual content? Such a model would enable predicting +subjective feedback such as satisfaction or aesthetic quality, along with the +underlying human attention or interaction heatmaps and viewing order, enabling +designers and content-creation models to optimize their creation for +human-centric improvements. In this paper, we propose ALOHA -- a unified model +for understanding human responses from attention to likes, across diverse +visual content. ALOHA leverages a multimodal transformer featuring distinct +prediction heads for each facet, and predicts different human responses such as +attention heatmaps, scanpath or viewing order, as well as subjective +rating/preference. We train ALOHA on diverse public datasets spanning natural +images, webpages and graphic designs, and achieve SOTA performance on multiple +benchmarks across different image domains and various behavior modeling tasks. +Potential applications include providing instant feedback on the effectiveness +of UIs/designs/images, and serving as a reward model to further optimize +visual-content creation. + +
+
+
+
+
+ + ♻ ☆ Do Pre-trained Models Benefit Equally in Continual Learning? WACV 2023 + + +
+ Existing work on continual learning (CL) is primarily devoted to developing +algorithms for models trained from scratch. Despite their encouraging +performance on contrived benchmarks, these algorithms show dramatic performance +drops in real-world scenarios. Therefore, this paper advocates the systematic +introduction of pre-training to CL, which is a general recipe for transferring +knowledge to downstream tasks but is substantially missing in the CL community. +Our investigation reveals the multifaceted complexity of exploiting pre-trained +models for CL along three different axes: pre-trained models, CL algorithms, +and CL scenarios. Perhaps most intriguingly, improvements in CL algorithms from +pre-training are very inconsistent: an underperforming algorithm could become +competitive and even state-of-the-art when all algorithms start from a +pre-trained model. This indicates that the current paradigm, where all CL +methods are compared in from-scratch training, is not well reflective of the +true CL objective and desired progress. In addition, we make several other +important observations, including that CL algorithms that exert less +regularization benefit more from a pre-trained model; and that a stronger +pre-trained model such as CLIP does not guarantee a better improvement. Based +on these findings, we introduce a simple yet effective baseline that employs +minimum regularization and leverages the more beneficial pre-trained model, +coupled with a two-stage training pipeline. We recommend including this strong +baseline in the future development of CL algorithms, due to its demonstrated +state-of-the-art performance. + +
+
+ comment: Accepted to WACV 2023. Project page: + https://kylee5.web.illinois.edu/publication/WACV23/ +
+
+
+
+
+ + ♻ ☆ SegGen: Supercharging Segmentation Models with Text2Mask and Mask2Img + Synthesis + + +
+ We propose SegGen, a highly-effective training data generation method for +image segmentation, which pushes the performance limits of state-of-the-art +segmentation models to a significant extent. SegGen designs and integrates two +data generation strategies: MaskSyn and ImgSyn. (i) MaskSyn synthesizes new +mask-image pairs via our proposed text-to-mask generation model and +mask-to-image generation model, greatly improving the diversity in segmentation +masks for model supervision; (ii) ImgSyn synthesizes new images based on +existing masks using the mask-to-image generation model, strongly improving +image diversity for model inputs. On the highly competitive ADE20K and COCO +benchmarks, our data generation method markedly improves the performance of +state-of-the-art segmentation models in semantic segmentation, panoptic +segmentation, and instance segmentation. Notably, in terms of the ADE20K mIoU, +Mask2Former R50 is largely boosted from 47.2 to 49.9 (+2.7); Mask2Former Swin-L +is also significantly increased from 56.1 to 57.4 (+1.3). These promising +results strongly suggest the effectiveness of our SegGen even when abundant +human-annotated training data is utilized. Moreover, training with our +synthetic data makes the segmentation models more robust towards unseen +domains. Project website: https://seggenerator.github.io + +
+
+
+
+
+ + ♻ ☆ Improving Efficiency of Diffusion Models via Multi-Stage Framework and + Tailored Multi-Decoder Architectures CVPR + + +
+ Diffusion models, emerging as powerful deep generative tools, excel in +various applications. They operate through a two-step process: introducing +noise into training samples and then employing a model to convert random noise +into new samples (e.g., images). However, their remarkable generative +performance is hindered by slow training and sampling. This is due to the +necessity of tracking extensive forward and reverse diffusion trajectories, and +employing a large model with numerous parameters across multiple timesteps +(i.e., noise levels). To tackle these challenges, we present a multi-stage +framework inspired by our empirical findings. These observations indicate the +advantages of employing distinct parameters tailored to each timestep while +retaining universal parameters shared across all time steps. Our approach +involves segmenting the time interval into multiple stages where we employ a +custom multi-decoder U-net architecture that blends time-dependent models with +a universally shared encoder. Our framework enables the efficient distribution +of computational resources and mitigates inter-stage interference, which +substantially improves training efficiency. Extensive numerical experiments +affirm the effectiveness of our framework, showcasing significant training and +sampling efficiency enhancements on three state-of-the-art diffusion models, +including large-scale latent diffusion models. Furthermore, our ablation +studies illustrate the impact of two important components in our framework: (i) +a novel timestep clustering algorithm for stage division, and (ii) an +innovative multi-decoder U-net architecture, seamlessly integrating universal +and customized hyperparameters. + +
+
+ comment: The IEEE/CVF Conference on Computer Vision and Pattern Recognition + (CVPR) 2024 +
+
+
+
+
+ + ♻ ☆ LEGO: Learning and Graph-Optimized Modular Tracker for Online + Multi-Object Tracking with Point Clouds + + +
+ Online multi-object tracking (MOT) plays a pivotal role in autonomous +systems. The state-of-the-art approaches usually employ a tracking-by-detection +method, and data association plays a critical role. This paper proposes a +learning and graph-optimized (LEGO) modular tracker to improve data association +performance in the existing literature. The proposed LEGO tracker integrates +graph optimization and self-attention mechanisms, which efficiently formulate +the association score map, facilitating the accurate and efficient matching of +objects across time frames. To further enhance the state update process, the +Kalman filter is added to ensure consistent tracking by incorporating temporal +coherence in the object states. Our proposed method utilizing LiDAR alone has +shown exceptional performance compared to other online tracking approaches, +including LiDAR-based and LiDAR-camera fusion-based methods. LEGO ranked 1st at +the time of submitting results to the KITTI object tracking evaluation ranking +board and remains 2nd at the time of submitting this paper, among all online +trackers in the KITTI MOT benchmark for cars. + +
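+ The Kalman-filter state update mentioned above follows the textbook predict/update cycle; the constant-velocity model below is a generic illustration (the matrix values are placeholders, not the tracker's actual configuration):
+
+ import numpy as np
+
+ def kalman_predict(x, P, F, Q):
+     # propagate the state mean x and covariance P through the motion model F
+     return F @ x, F @ P @ F.T + Q
+
+ def kalman_update(x, P, z, H, R):
+     # fuse a measurement z (e.g. a detected object centre) into the state
+     S = H @ P @ H.T + R
+     K = P @ H.T @ np.linalg.inv(S)
+     x = x + K @ (z - H @ x)
+     P = (np.eye(len(x)) - K @ H) @ P
+     return x, P
+
+ dt = 0.1  # constant-velocity model on a 2D position: state = [px, py, vx, vy]
+ F = np.array([[1, 0, dt, 0], [0, 1, 0, dt], [0, 0, 1, 0], [0, 0, 0, 1]], float)
+ H = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], float)
+ Q, R = np.eye(4) * 1e-3, np.eye(2) * 1e-2
+ x, P = kalman_predict(np.zeros(4), np.eye(4), F, Q)
+ x, P = kalman_update(x, P, np.array([0.5, 0.2]), H, R)
+ print(x)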
+
+
+
+
+ + ♻ ☆ PhotoBot: Reference-Guided Interactive Photography via Natural Language IROS'24 + + +
+ We introduce PhotoBot, a framework for fully automated photo acquisition +based on an interplay between high-level human language guidance and a robot +photographer. We propose to communicate photography suggestions to the user via +reference images that are selected from a curated gallery. We leverage a visual +language model (VLM) and an object detector to characterize the reference +images via textual descriptions and then use a large language model (LLM) to +retrieve relevant reference images based on a user's language query through +text-based reasoning. To establish correspondences between the reference image and the observed scene, +we exploit pre-trained features from a vision transformer capable of capturing +semantic similarity across marked appearance variations. Using these features, +we compute suggested pose adjustments for an RGB-D camera by solving a +perspective-n-point (PnP) problem. We demonstrate our approach using a +manipulator equipped with a wrist camera. Our user studies show that photos +taken by PhotoBot are often more aesthetically pleasing than those taken by +users themselves, as measured by human feedback. We also show that PhotoBot can +generalize to other reference sources such as paintings. + +
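+ Solving a perspective-n-point problem as described above is standard; a minimal sketch with OpenCV (requires opencv-python; the 3D/2D correspondences and the camera intrinsics below are toy values, not PhotoBot's data):
+
+ import numpy as np
+ import cv2
+
+ # matched 3D feature locations in the reference scene (metres) and their
+ # observed 2D pixel positions in the current camera image
+ object_points = np.array([[0, 0, 0], [0.1, 0, 0], [0.1, 0.1, 0], [0, 0.1, 0],
+                           [0.05, 0.05, 0.1], [0.02, 0.08, 0.05]], dtype=np.float64)
+ image_points = np.array([[320, 240], [400, 238], [402, 320], [322, 318],
+                          [361, 270], [340, 300]], dtype=np.float64)
+ K = np.array([[600, 0, 320], [0, 600, 240], [0, 0, 1]], dtype=np.float64)
+ dist = np.zeros(5)
+
+ ok, rvec, tvec = cv2.solvePnP(object_points, image_points, K, dist)
+ print(ok, rvec.ravel(), tvec.ravel())  # rotation/translation for the suggested re-pose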
+
+ comment: Accepted to the IEEE/RSJ International Conference on Intelligent + Robotics and Systems (IROS'24), Abu Dhabi, UAE, Oct 14-18, 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Leveraging Topic Specificity and Social Relationships for Expert Finding + in Community Question Answering Platforms + + +
+ Online Community Question Answering (CQA) platforms have become indispensable +tools for users seeking expert solutions to their technical queries. The +effectiveness of these platforms relies on their ability to identify and direct +questions to the most knowledgeable users within the community, a process known +as Expert Finding (EF). EF accuracy is crucial for increasing user engagement +and the reliability of provided answers. Despite recent advancements in EF +methodologies, blending the diverse information sources available on CQA +platforms for effective expert identification remains challenging. In this +paper, we present TUEF, a Topic-oriented User-Interaction model for Expert +Finding, which aims to fully and transparently leverage the heterogeneous +information available within online question-answering communities. TUEF +integrates content and social data by constructing a multi-layer graph that +maps out user relationships based on their answering patterns on specific +topics. By combining these sources of information, TUEF identifies the most +relevant and knowledgeable users for any given question and ranks them using +learning-to-rank techniques. Our findings indicate that TUEF's topic-oriented +model significantly enhances performance, particularly in large communities +discussing well-defined topics. Additionally, we show that the interpretable +learning-to-rank algorithm integrated into TUEF offers transparency and +explainability with minimal performance trade-offs. The exhaustive experiments +conducted on six different CQA communities of Stack Exchange show that TUEF +outperforms all competitors with a minimum performance boost of 42.42% in P@1, +32.73% in NDCG@3, 21.76% in R@5, and 29.81% in MRR, excelling in both the +evaluation approaches present in the previous literature. + +
+
+
+
+
+ + ☆ Query-oriented Data Augmentation for Session Search + + +
+ Modeling contextual information in a search session has drawn more and more +attention when understanding complex user intents. Recent methods are all +data-driven, i.e., they train different models on large-scale search log data +to identify the relevance between search contexts and candidate documents. The +common training paradigm is to pair the search context with different candidate +documents and train the model to rank the clicked documents higher than the +unclicked ones. However, this paradigm neglects the symmetric nature of the +relevance between the session context and document, i.e., the clicked documents +can also be paired with different search contexts when training. In this work, +we propose query-oriented data augmentation to enrich search logs and empower +the modeling. We generate supplemental training pairs by altering the most +important part of a search context, i.e., the current query, and train our +model to rank the generated sequence along with the original sequence. This +approach enables models to learn that the relevance of a document may vary as +the session context changes, leading to a better understanding of users' search +patterns. We develop several strategies to alter the current query, resulting +in new training data with varying degrees of difficulty. Through +experimentation on two extensive public search logs, we have successfully +demonstrated the effectiveness of our model. + +
+
+ comment: TKDE 2024 +
+
+
+
+
+ + ☆ Heterogeneous Hypergraph Embedding for Recommendation Systems + + +
+ Recent advancements in recommender systems have focused on integrating +knowledge graphs (KGs) to leverage their auxiliary information. The core idea +of KG-enhanced recommenders is to incorporate rich semantic information for +more accurate recommendations. However, two main challenges persist: i) +Neglecting complex higher-order interactions in the KG-based user-item network, +potentially leading to sub-optimal recommendations, and ii) Dealing with the +heterogeneous modalities of input sources, such as user-item bipartite graphs +and KGs, which may introduce noise and inaccuracies. To address these issues, +we present a novel Knowledge-enhanced Heterogeneous Hypergraph Recommender +System (KHGRec). KHGRec captures group-wise characteristics of both the +interaction network and the KG, modeling complex connections in the KG. Using a +collaborative knowledge heterogeneous hypergraph (CKHG), it employs two +hypergraph encoders to model group-wise interdependencies and ensure +explainability. Additionally, it fuses signals from the input graphs with +cross-view self-supervised learning and attention mechanisms. Extensive +experiments on four real-world datasets show our model's superiority over +various state-of-the-art baselines, with an average 5.18\% relative +improvement. Additional tests on noise resilience, missing data, and cold-start +problems demonstrate the robustness of our KHGRec framework. Our model and +evaluation datasets are publicly available at +\url{https://github.com/viethungvu1998/KHGRec}. + +
+
+
+
+
+ + ☆ Reviewers of Educational Immersive and Extended Reality (XR) + experiences: Who is creating these reviews and why? + + +
+ This paper presents a scoping review of the literature to examine who is +reviewing educational immersive or extended reality (eduXR) experiences and +why. EduXR experiences in augmented, virtual or mixed reality take many forms, +from supporting manual training and engaging learners in conservation to +providing opportunities for social connection. For users of eduXR, reviews of an +experience can provide information that helps them determine whether it will +meet their learning needs. The source of the review, that is, who the reviewers +are and why they have conducted the review, is critical in helping the user +judge the review's quality and relevance. At present, there is no settled review +system in place for eduXR, though relevant frameworks exist for serious games +review, with relevance and overlap for some, but not all, eduXR experiences. +While some authors have engaged in preparing a detailed review structure for +eduXR, there remains a need for a clear and simple way for users of eduXR to +know details about reviewers, e.g., who they are and why they review, to make it +easier for users to identify relevant reviews and gain useful insight about eduXR +experiences. To help address this issue, we conducted a scoping review asking +the question: who is creating eduXR reviews, and why? We identified 16 papers +that present an academic evaluation of the review process for eduXR. The 16 +papers were analysed using thematic analysis, coding for "who" and "why" themes +over two separate cycles. The analysis examined what we know regarding who is +providing the reviews and why, to help us understand what enables, what +inhibits, and what is yet unknown about how the eduXR community goes about +making informed choices regarding the eduXR experiences it engages with. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ BM25S: Orders of magnitude faster lexical search via eager sparse + scoring + + +
+ We introduce BM25S, an efficient Python-based implementation of BM25 that +only depends on Numpy and Scipy. BM25S achieves up to a 500x speedup compared +to the most popular Python-based framework by eagerly computing BM25 scores +during indexing and storing them into sparse matrices. It also achieves +considerable speedups compared to highly optimized Java-based implementations, +which are used by popular commercial products. Finally, BM25S reproduces the +exact implementation of five BM25 variants based on Kamphuis et al. (2020) by +extending eager scoring to non-sparse variants using a novel score shifting +method. The code can be found at https://github.com/xhluca/bm25s + +
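+ The eager-scoring idea is easy to sketch from scratch with Numpy and Scipy (this is an illustrative re-implementation of plain BM25, not the bm25s package itself): BM25 contributions for every (term, document) pair are computed at indexing time and stored in a sparse matrix, so answering a query only sums a few rows.
+
+ import numpy as np
+ from scipy import sparse
+
+ def build_eager_bm25(docs, k1=1.5, b=0.75):
+     vocab, rows, cols, tfs = {}, [], [], []
+     doc_len = np.array([len(d) for d in docs], dtype=float)
+     for j, doc in enumerate(docs):
+         counts = {}
+         for tok in doc:
+             counts[tok] = counts.get(tok, 0) + 1
+         for tok, tf in counts.items():
+             rows.append(vocab.setdefault(tok, len(vocab))); cols.append(j); tfs.append(tf)
+     tf = sparse.csr_matrix((tfs, (rows, cols)), shape=(len(vocab), len(docs)))
+     df = np.diff(tf.indptr)                        # documents containing each term
+     idf = np.log(1 + (len(docs) - df + 0.5) / (df + 0.5))
+     coo = tf.tocoo()
+     denom = coo.data + k1 * (1 - b + b * doc_len[coo.col] / doc_len.mean())
+     data = idf[coo.row] * coo.data * (k1 + 1) / denom   # final, eagerly computed scores
+     return vocab, sparse.csr_matrix((data, (coo.row, coo.col)), shape=tf.shape)
+
+ def query(vocab, score_matrix, tokens, topk=3):
+     rows = [vocab[t] for t in tokens if t in vocab]
+     scores = np.asarray(score_matrix[rows].sum(axis=0)).ravel()
+     return np.argsort(-scores)[:topk]
+
+ docs = [["the", "cat", "sat"], ["a", "dog", "barked"], ["cat", "and", "dog"]]
+ vocab, S = build_eager_bm25(docs)
+ print(query(vocab, S, ["cat", "dog"]))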
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Deep Pareto Reinforcement Learning for Multi-Objective Recommender + System + + +
+ Optimizing multiple objectives simultaneously is an important task in +recommendation platforms to improve their performance on different fronts. +However, this task is particularly challenging since the relationships between +different objectives are heterogeneous across different consumers and +dynamically fluctuating according to different contexts. Especially in those +cases when objectives become conflicting with each other, the result of +recommendations will form a Pareto frontier, where an improvement on any +objective comes at the cost of a performance decrease in another objective. +Unfortunately, existing multi-objective recommender systems do not +systematically consider such relationships; instead, they balance between these +objectives in a static and uniform manner, resulting in performance that is +significantly worse than Pareto-optimality. In this paper, we propose a +Deep Pareto Reinforcement Learning (DeepPRL) approach, where we (1) +comprehensively model the complex relationships between multiple objectives in +recommendations; (2) effectively capture the personalized and contextual +consumer preference towards each objective and update the recommendations +correspondingly; (3) optimize both the short-term and the long-term performance +of multi-objective recommendations. As a result, our method achieves +significant Pareto-dominance over state-of-the-art baselines in extensive +offline experiments conducted on three real-world datasets. Furthermore, we +conduct a large-scale online controlled experiment at the video streaming +platform of Alibaba, where our method simultaneously improves the three +conflicting objectives of Click-Through Rate, Video View, and Dwell Time by 2%, +5%, and 7% respectively over the latest production system, demonstrating its +tangible economic impact in industrial applications. + +
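+ The Pareto terminology used above can be made concrete with a tiny helper (a generic sketch, unrelated to the DeepPRL internals): a recommendation outcome lies on the Pareto frontier if no other outcome is at least as good on every objective and strictly better on at least one.
+
+ import numpy as np
+
+ def pareto_front(points):
+     # points: (N, M) array of objective values, larger is better on every objective
+     keep = []
+     for i in range(len(points)):
+         dominated = any(
+             j != i and np.all(points[j] >= points[i]) and np.any(points[j] > points[i])
+             for j in range(len(points))
+         )
+         if not dominated:
+             keep.append(i)
+     return keep
+
+ pts = np.array([[0.9, 0.1, 0.3], [0.8, 0.2, 0.3], [0.5, 0.5, 0.5], [0.4, 0.4, 0.4]])
+ print(pareto_front(pts))  # [0, 1, 2]: the last point is dominated by the third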
+
+
+
+
+ + ☆ When LLM Meets Hypergraph: A Sociological Analysis on Personality via + Online Social Networks + + +
+ Individual personalities significantly influence our perceptions, decisions, +and social interactions, which is particularly crucial for gaining insights +into human behavior patterns in online social network analysis. Many +psychological studies have observed that people's personalities are strongly +reflected in their social behaviors and social environments. In light of these +observations, this paper proposes a sociological analysis framework for one's +personality in an environment-based view instead of individual-level data mining. +Specifically, to comprehensively understand an individual's behavior from +low-quality records, we leverage the powerful associative ability of LLMs by +designing an effective prompt. In this way, LLMs can integrate various +scattered information with their external knowledge to generate higher-quality +profiles, which can significantly improve the personality analysis performance. +To explore the interactive mechanism between the users and their online +environments, we design an effective hypergraph neural network where the +hypergraph nodes are users and the hyperedges in the hypergraph are social +environments. We offer a useful dataset with user profile data, personality +traits, and several detected environments from a real-world social platform. +To the best of our knowledge, this is the first network-based dataset +containing both hypergraph structure and social information, which could push +forward future research in this area. By employing the framework on +this dataset, we can effectively capture the nuances of individual +personalities and their online behaviors, leading to a deeper understanding of +human interactions in the digital world. + +
+
+
+
+
+ + ♻ ☆ Unlocking the Potential of Metaverse in Innovative and Immersive Digital + Health + + +
+ The concept of the Metaverse has attracted a lot of attention in various fields, +and one of its important applications is health and treatment. The Metaverse +has enormous potential to transform healthcare by changing patient care, +medical education, and the way teaching/learning and research are done. The +purpose of this research is to provide an introduction to the basic concepts +and fundamental technologies of the Metaverse. This paper examines the pros and +cons of the Metaverse in the healthcare context and analyzes its potential from the +technology and AI perspective. In particular, the role of machine learning +methods is discussed; we explain how machine learning algorithms can be +applied to Metaverse-generated data to gain better insights in healthcare +applications. Additionally, we examine the future visions of the Metaverse in +health delivery, by examining emerging technologies such as blockchain and also +addressing privacy concerns. The findings of this study contribute to a deeper +understanding of the applications of the Metaverse in healthcare and its potential +to revolutionize the delivery of medical services. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Light-weight End-to-End Graph Interest Network for CTR Prediction in + E-commerce Search + + +
+ Click-through-rate (CTR) prediction has an essential impact on improving user +experience and revenue in e-commerce search. With the development of deep +learning, graph-based methods are well exploited to utilize graph structure +extracted from user behaviors and other information to help embedding learning. +However, most of the previous graph-based methods mainly focus on +recommendation scenarios, and therefore their graph structures highly depend on +item's sequential information from user behaviors, ignoring query's sequential +signal and query-item correlation. In this paper, we propose a new approach +named Light-weight End-to-End Graph Interest Network (EGIN) to effectively mine +users' search interests and tackle previous challenges. (i) EGIN utilizes query +and item's correlation and sequential information from the search system to +build a heterogeneous graph for better CTR prediction in e-commerce search. +(ii) EGIN's graph embedding learning shares the same training input and is +jointly trained with CTR prediction, making the end-to-end framework effortless +to deploy in large-scale search systems. The proposed EGIN is composed of three +parts: query-item heterogeneous graph, light-weight graph sampling, and +multi-interest network. The query-item heterogeneous graph captures correlation +and sequential information of query and item efficiently by the proposed +light-weight graph sampling. The multi-interest network is well designed to +utilize graph embedding to capture various similarity relationships between +query and item to enhance the final CTR prediction. We conduct extensive +experiments on both public and industrial datasets to demonstrate the +effectiveness of the proposed EGIN. At the same time, the training cost of +graph learning is relatively low compared with the main CTR prediction task, +ensuring efficiency in practical applications. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Review of Modern Recommender Systems Using Generative Models + (Gen-RecSys) KDD'24 + + +
+ Traditional recommender systems (RS) typically use user-item rating histories +as their main data source. However, deep generative models now have the +capability to model and sample from complex data distributions, including +user-item interactions, text, images, and videos, enabling novel recommendation +tasks. This comprehensive, multidisciplinary survey connects key advancements +in RS using Generative Models (Gen-RecSys), covering: interaction-driven +generative models; the use of large language models (LLM) and textual data for +natural language recommendation; and the integration of multimodal models for +generating and processing images/videos in RS. Our work highlights necessary +paradigms for evaluating the impact and harm of Gen-RecSys and identifies open +challenges. This survey accompanies a tutorial presented at ACM KDD'24, with +supporting materials provided at: https://encr.pw/vDhLq. + +
+
+ comment: This survey accompanies a tutorial presented at ACM KDD'24 +
+
+
+
+
+ + ♻ ☆ Hyperbolic Knowledge Transfer in Cross-Domain Recommendation System + + +
+ Cross-Domain Recommendation (CDR) seeks to utilize knowledge from different +domains to alleviate the problem of data sparsity in the target recommendation +domain, and it has been gaining more attention in recent years. Although there +have been notable advancements in this area, most current methods represent +users and items in Euclidean space, which is not ideal for handling long-tail +distributed data in recommendation systems. Additionally, adding data from +other domains can worsen the long-tail characteristics of the entire dataset, +making it harder to train CDR models effectively. Recent studies have shown +that hyperbolic methods are particularly suitable for modeling long-tail +distributions, which has led us to explore hyperbolic representations for users +and items in CDR scenarios. However, due to the distinct characteristics of the +different domains, applying hyperbolic representation learning to CDR tasks is +quite challenging. In this paper, we introduce a new framework called +Hyperbolic Contrastive Learning (HCTS), designed to capture the unique features +of each domain while enabling efficient knowledge transfer between domains. We +achieve this by embedding users and items from each domain separately and +mapping them onto distinct hyperbolic manifolds with adjustable curvatures for +prediction. To improve the representations of users and items in the target +domain, we develop a hyperbolic contrastive learning module for knowledge +transfer. Extensive experiments on real-world datasets demonstrate that +hyperbolic manifolds are a promising alternative to Euclidean space for CDR +tasks. + +
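+ One standard primitive behind such hyperbolic models is the Poincare-ball distance, which grows rapidly towards the boundary of the ball and therefore leaves far more room for long-tail items than Euclidean distance. A minimal sketch of the textbook formula (not the HCTS code; the curvature value is a placeholder):
+
+ import torch
+
+ def poincare_distance(x, y, c=1.0, eps=1e-7):
+     # geodesic distance on the Poincare ball of curvature -c; points must satisfy c*||p||^2 < 1
+     sq = lambda t: (t * t).sum(dim=-1)
+     num = 2 * c * sq(x - y)
+     den = (1 - c * sq(x)).clamp_min(eps) * (1 - c * sq(y)).clamp_min(eps)
+     return torch.acosh(1 + num / den) / (c ** 0.5)
+
+ x = torch.tensor([0.10, 0.20, 0.00])
+ y = torch.tensor([-0.30, 0.05, 0.10])
+ print(poincare_distance(x, y))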
+
+
+
+
+ + ♻ ☆ Understanding Biases in ChatGPT-based Recommender Systems: Provider + Fairness, Temporal Stability, and Recency + + +
+ This paper explores the biases in ChatGPT-based recommender systems, focusing +on provider fairness (item-side fairness). Through extensive experiments and +over a thousand API calls, we investigate the impact of prompt design +strategies, including structure, system role, and intent, on evaluation metrics +such as provider fairness, catalog coverage, temporal stability, and recency. +The first experiment examines these strategies in classical top-K +recommendations, while the second evaluates sequential in-context learning +(ICL). + In the first experiment, we assess seven distinct prompt scenarios on top-K +recommendation accuracy and fairness. Accuracy-oriented prompts, like Simple +and Chain-of-Thought (COT), outperform diversification prompts, which, despite +enhancing temporal freshness, reduce accuracy by up to 50%. Embedding fairness +into system roles, such as "act as a fair recommender," proved more effective +than fairness directives within prompts. Diversification prompts led to +recommending newer movies, offering broader genre distribution compared to +traditional collaborative filtering (CF) models. + The second experiment explores sequential ICL, comparing zero-shot and +few-shot ICL. Results indicate that including user demographic information in +prompts affects model biases and stereotypes. However, ICL did not consistently +improve item fairness and catalog coverage over zero-shot learning. Zero-shot +learning achieved higher NDCG and coverage, while ICL-2 showed slight +improvements in hit rate (HR) when age-group context was included. Our study +provides insights into biases of RecLLMs, particularly in provider fairness and +catalog coverage. By examining prompt design, learning strategies, and system +roles, we highlight the potential and challenges of integrating LLMs into +recommendation systems. Further details can be found at +https://github.com/yasdel/Benchmark_RecLLM_Fairness. + +
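+ The contrast between fairness-as-system-role and fairness-as-prompt-directive can be sketched as follows (the message format follows the usual chat-completion convention; the model call itself is omitted and the watch history is a placeholder):
+
+ def build_messages(history, k=10, fairness_in_system_role=True):
+     task = (f"The user watched: {', '.join(history)}. "
+             f"Recommend {k} movies as a numbered list.")
+     if fairness_in_system_role:
+         system = "Act as a fair recommender: balance popular and long-tail providers."
+         user = task
+     else:
+         system = "You are a movie recommender."
+         user = task + " Please make the list fair to less popular providers."
+     return [{"role": "system", "content": system}, {"role": "user", "content": user}]
+
+ print(build_messages(["Heat", "Alien", "Amelie"], fairness_in_system_role=True))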
+
+
+
+
+ + ♻ ☆ CoSPLADE: Contextualizing SPLADE for Conversational Information + Retrieval ECIR 2023 + + +
+ Conversational search is a difficult task as it aims at retrieving documents +based not only on the current user query but also on the full conversation +history. Most of the previous methods have focused on a multi-stage ranking +approach relying on query reformulation, a critical intermediate step that +might lead to a sub-optimal retrieval. Other approaches have tried to use a +fully neural IR first-stage, but are either zero-shot or rely on full +learning-to-rank based on a dataset with pseudo-labels. In this work, +leveraging the CANARD dataset, we propose an innovative lightweight learning +technique to train a first-stage ranker based on SPLADE. By relying on SPLADE +sparse representations, we show that, when combined with a second-stage ranker +based on T5Mono, the results are competitive on the TREC CAsT 2020 and 2021 +tracks. + +
+
+ comment: Accepted at ECIR 2023 +
+
+
+
+
+ + ♻ ☆ Content-Based Image Retrieval for Multi-Class Volumetric Radiology + Images: A Benchmark Study + + +
+ While content-based image retrieval (CBIR) has been extensively studied in +natural image retrieval, its application to medical images presents ongoing +challenges, primarily due to the 3D nature of medical images. Recent studies +have shown the potential use of pre-trained vision embeddings for CBIR in the +context of radiology image retrieval. However, a benchmark for the retrieval of +3D volumetric medical images is still lacking, hindering the ability to +objectively evaluate and compare the efficiency of proposed CBIR approaches in +medical imaging. In this study, we extend previous work and establish a +benchmark for region-based and localized multi-organ retrieval using the +TotalSegmentator dataset (TS) with detailed multi-organ annotations. We +benchmark embeddings derived from pre-trained supervised models on medical +images against embeddings derived from pre-trained unsupervised models on +non-medical images for 29 coarse and 104 detailed anatomical structures in +volume and region levels. For volumetric image retrieval, we adopt a late +interaction re-ranking method inspired by text matching. We compare it against +the original method proposed for volume and region retrieval and achieve a +retrieval recall of 1.0 for diverse anatomical regions with a wide size range. +The findings and methodologies presented in this paper provide insights and +benchmarks for further development and evaluation of CBIR approaches in the +context of medical imaging. + +
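+ The late-interaction re-ranking mentioned above follows the MaxSim pattern from text matching; a minimal generic sketch with random stand-ins for the region/token embeddings:
+
+ import numpy as np
+
+ def late_interaction_score(query_vecs, doc_vecs):
+     # for each query vector take its best-matching document vector, then sum
+     sim = query_vecs @ doc_vecs.T            # cosine similarities if rows are L2-normalised
+     return sim.max(axis=1).sum()
+
+ rng = np.random.default_rng(0)
+ q = rng.normal(size=(4, 128));  q /= np.linalg.norm(q, axis=1, keepdims=True)
+ d = rng.normal(size=(32, 128)); d /= np.linalg.norm(d, axis=1, keepdims=True)
+ print(late_interaction_score(q, d))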
+
+ comment: 34 pages, 12 Figures, 22 Tables +
+
+
+
+
+ + ♻ ☆ Towards Statistically Significant Taxonomy Aware Co-location Pattern + Detection + + +
+ Given a collection of Boolean spatial feature types, their instances, a +neighborhood relation (e.g., proximity), and a hierarchical taxonomy of the +feature types, the goal is to find the subsets of feature types or their +parents whose spatial interaction is statistically significant. This problem is +important for taxonomy-reliant applications such as ecology (e.g., finding new symbiotic +relationships across the food chain), spatial pathology (e.g., immunotherapy +for cancer), retail, etc. The problem is computationally challenging due to the +exponential number of candidate co-location patterns generated by the taxonomy. +Most approaches for co-location pattern detection overlook the hierarchical +relationships among spatial features, and the statistical significance of the +detected patterns is not always considered, leading to potential false +discoveries. This paper introduces two methods for incorporating taxonomies and +assessing the statistical significance of co-location patterns. The baseline +approach iteratively checks the significance of co-locations between leaf nodes +or their ancestors in the taxonomy. Using the Benjamini-Hochberg procedure, an +advanced approach is proposed to control the false discovery rate. This +approach effectively reduces the risk of false discoveries while maintaining +the power to detect true co-location patterns. Experimental evaluation and case +study results show the effectiveness of the approach. + +
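+ The Benjamini-Hochberg step is standard and short enough to show directly (a generic implementation of the procedure, not the paper's full pipeline):
+
+ import numpy as np
+
+ def benjamini_hochberg(p_values, alpha=0.05):
+     # returns a boolean mask of hypotheses rejected at FDR level alpha
+     p = np.asarray(p_values, dtype=float)
+     m = len(p)
+     order = np.argsort(p)
+     below = p[order] <= alpha * np.arange(1, m + 1) / m
+     reject = np.zeros(m, dtype=bool)
+     if below.any():
+         k = np.max(np.where(below)[0])       # largest i with p_(i) <= alpha * i / m
+         reject[order[: k + 1]] = True
+     return reject
+
+ print(benjamini_hochberg([0.001, 0.008, 0.039, 0.041, 0.20, 0.74]))
+ # [ True  True False False False False]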
+
+ comment: Accepted in The 16th Conference on Spatial Information Theory (COSIT) + 2024 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Computational Thinking through Design Patterns in Video Games + + +
+ Prior research has explored potential applications of video games in +programming education to elicit computational thinking skills. However, +existing approaches are often either too general, not taking into account the +diversity of genres and mechanisms between video games, or too narrow, +selecting tools that were specifically designed for educational purposes. In +this paper we propose a more fundamental approach, defining beneficial +connections between individual design patterns present in video games and +computational thinking skills. We argue that video games have the capacity to +elicit these skills and even to potentially train them. This could be an +effective method to solidify a conceptual base which would make programming +education more effective. + +
+
+ comment: draft +
+
+
+
+
+ + ☆ Semantic Grouping Network for Audio Source Separation + + +
+ Recently, audio-visual separation approaches have taken advantage of the +natural synchronization between the two modalities to boost audio source +separation performance. They extracted high-level semantics from visual inputs +as the guidance to help disentangle sound representation for individual +sources. Can we directly learn to disentangle the individual semantics from the +sound itself? The dilemma is that multiple sound sources are mixed together in +the original space. To tackle the difficulty, in this paper, we present a novel +Semantic Grouping Network, termed as SGN, that can directly disentangle sound +representations and extract high-level semantic information for each source +from input audio mixture. Specifically, SGN aggregates category-wise source +features through learnable class tokens of sounds. Then, the aggregated +semantic features can be used as the guidance to separate the corresponding +audio sources from the mixture. We conducted extensive experiments on +music-only and universal sound separation benchmarks: MUSIC, FUSS, MUSDB18, and +VGG-Sound. The results demonstrate that our SGN significantly outperforms +previous audio-only methods and audio-visual models without utilizing +additional visual cues. + +
+
+
+
+
+ + ♻ ☆ MetaDesigner: Advancing Artistic Typography through AI-Driven, + User-Centric, and Multilingual WordArt Synthesis + + +
+ MetaDesigner revolutionizes artistic typography synthesis by leveraging the +strengths of Large Language Models (LLMs) to drive a design paradigm centered +around user engagement. At the core of this framework lies a multi-agent system +comprising the Pipeline, Glyph, and Texture agents, which collectively enable +the creation of customized WordArt, ranging from semantic enhancements to the +imposition of complex textures. MetaDesigner incorporates a comprehensive +feedback mechanism that harnesses insights from multimodal models and user +evaluations to refine and enhance the design process iteratively. Through this +feedback loop, the system adeptly tunes hyperparameters to align with +user-defined stylistic and thematic preferences, generating WordArt that not +only meets but exceeds user expectations of visual appeal and contextual +relevance. Empirical validations highlight MetaDesigner's capability to +effectively serve diverse WordArt applications, consistently producing +aesthetically appealing and context-sensitive results. + +
+
+ comment: 18 pages, 16 figures, Project: + https://modelscope.cn/studios/WordArt/WordArt +
+
+
+
+
+ + ♻ ☆ MultiIoT: Benchmarking Machine Learning for the Internet of Things + + +
+ The next generation of machine learning systems must be adept at perceiving +and interacting with the physical world through a diverse array of sensory +channels. Commonly referred to as the `Internet of Things (IoT)' ecosystem, +sensory data from motion, thermal, geolocation, depth, wireless signals, video, +and audio are increasingly used to model the states of physical environments +and the humans inside them. Despite the potential for understanding human +wellbeing, controlling physical devices, and interconnecting smart cities, the +community has seen limited benchmarks for building machine learning systems for +IoT. Existing efforts are often specialized to a single sensory modality or +prediction task, which makes it difficult to study and train large-scale models +across many IoT sensors and tasks. To accelerate the development of new machine +learning technologies for IoT, this paper proposes MultiIoT, the most expansive +and unified IoT benchmark to date, encompassing over 1.15 million samples from +12 modalities and 8 real-world tasks. MultiIoT introduces unique challenges +involving (1) generalizable learning from many sensory modalities, (2) +multimodal interactions across long temporal ranges, (3) extreme heterogeneity +due to unique structure and noise topologies in real-world sensors, and (4) +complexity during training and inference. We evaluate a comprehensive set of +models on MultiIoT, including modality and task-specific methods, multisensory +and multitask supervised models, and large multisensory foundation models. Our +results highlight opportunities for ML to make a significant impact in IoT, but +many challenges in scalable learning from heterogeneous, long-range, and +imperfect sensory modalities still persist. We release all code and data to +accelerate future research in machine learning for IoT. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 123 + +
+
+
+ + ☆ Planetarium: A Rigorous Benchmark for Translating Text to Structured + Planning Languages + + +
+ Many recent works have explored using language models for planning problems. +One line of research focuses on translating natural language descriptions of +planning tasks into structured planning languages, such as the planning domain +definition language (PDDL). While this approach is promising, accurately +measuring the quality of generated PDDL code continues to pose significant +challenges. First, generated PDDL code is typically evaluated using planning +validators that check whether the problem can be solved with a planner. This +method is insufficient because a language model might generate valid PDDL code +that does not align with the natural language description of the task. Second, +existing evaluation sets often have natural language descriptions of the +planning task that closely resemble the ground truth PDDL, reducing the +challenge of the task. To bridge this gap, we introduce Planetarium, a +benchmark designed to evaluate language models' ability to generate PDDL code +from natural language descriptions of planning tasks. We begin by creating a +PDDL equivalence algorithm that rigorously evaluates the correctness of PDDL +code generated by language models by flexibly comparing it against a ground +truth PDDL. Then, we present a dataset of $132,037$ text-to-PDDL pairs across +13 different tasks, with varying levels of difficulty. Finally, we evaluate +several API-access and open-weight language models that reveal this task's +complexity. For example, $87.6\%$ of the PDDL problem descriptions generated by +GPT-4o are syntactically parseable, $82.2\%$ are valid, solvable problems, +but only $35.1\%$ are semantically correct, highlighting the need for a more +rigorous benchmark for this problem. + +
+
+
+
+
+ + ☆ InternLM-XComposer-2.5: A Versatile Large Vision Language Model + Supporting Long-Contextual Input and Output + + +
+ We present InternLM-XComposer-2.5 (IXC-2.5), a versatile large-vision +language model that supports long-contextual input and output. IXC-2.5 excels +in various text-image comprehension and composition applications, achieving +GPT-4V level capabilities with merely 7B LLM backend. Trained with 24K +interleaved image-text contexts, it can seamlessly extend to 96K long contexts +via RoPE extrapolation. This long-context capability allows IXC-2.5 to excel in +tasks requiring extensive input and output contexts. Compared to its previous +2.0 version, InternLM-XComposer-2.5 features three major upgrades in +vision-language comprehension: (1) Ultra-High Resolution Understanding, (2) +Fine-Grained Video Understanding, and (3) Multi-Turn Multi-Image Dialogue. In +addition to comprehension, IXC-2.5 extends to two compelling applications using +extra LoRA parameters for text-image composition: (1) Crafting Webpages and (2) +Composing High-Quality Text-Image Articles. IXC-2.5 has been evaluated on 28 +benchmarks, outperforming existing open-source state-of-the-art models on 16 +benchmarks. It also surpasses or competes closely with GPT-4V and Gemini Pro on +16 key tasks. The InternLM-XComposer-2.5 is publicly available at +https://github.com/InternLM/InternLM-XComposer. + +
+
+ comment: Technical Report. https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate + Hallucinations + + +
+ This paper presents Bag-of-Concept Graph (BACON) to gift models with limited +linguistic abilities to taste the privilege of Vision Language Models (VLMs) +and boost downstream tasks such as detection, visual question answering (VQA), +and image generation. Since the visual scenes in physical worlds are structured +with complex relations between objects, BACON breaks down annotations into +basic minimum elements and presents them in a graph structure. Element-wise +style enables easy understanding, and structural composition liberates +difficult locating. Careful prompt design births the BACON captions with the +help of publicly available VLMs and segmentation methods. In this way, we gather +a dataset with 100K annotated images, which endows VLMs with remarkable +capabilities, such as accurately generating BACON, transforming prompts into +BACON format, envisioning scenarios in the style of BACON, and dynamically +modifying elements within BACON through interactive dialogue and more. Wide +representative experiments, including detection, VQA, and image generation +tasks, show that BACON serves as a lifeline for achieving previously out-of-reach +tasks or excelling beyond current cutting-edge solutions. + +
+
+
+
+
+ + ☆ A Review of the Applications of Deep Learning-Based Emergent + Communication + + +
+ Emergent communication, or emergent language, is the field of research which +studies how human language-like communication systems emerge de novo in deep +multi-agent reinforcement learning environments. The possibilities of +replicating the emergence of a complex behavior like language have strong +intuitive appeal, yet it is necessary to complement this with clear notions of +how such research can be applicable to other fields of science, technology, and +engineering. This paper comprehensively reviews the applications of emergent +communication research across machine learning, natural language processing, +linguistics, and cognitive science. Each application is illustrated with a +description of its scope, an explication of emergent communication's unique +role in addressing it, a summary of the extant literature working towards the +application, and brief recommendations for near-term research directions. + +
+
+ comment: 49 pages, 15 figures +
+
+
+
+
+ + ☆ LLM Internal States Reveal Hallucination Risk Faced With a Query + + +
+ The hallucination problem of Large Language Models (LLMs) significantly +limits their reliability and trustworthiness. Humans have a self-awareness +process that allows us to recognize what we don't know when faced with queries. +Inspired by this, our paper investigates whether LLMs can estimate their own +hallucination risk before response generation. We analyze the internal +mechanisms of LLMs broadly both in terms of training data sources and across 15 +diverse Natural Language Generation (NLG) tasks, spanning over 700 datasets. +Our empirical analysis reveals two key insights: (1) LLM internal states +indicate whether they have seen the query in training data or not; and (2) LLM +internal states show they are likely to hallucinate or not regarding the query. +Our study explores particular neurons, activation layers, and tokens that play +a crucial role in the LLM perception of uncertainty and hallucination risk. By +a probing estimator, we leverage LLM self-assessment, achieving an average +hallucination estimation accuracy of 84.32\% at run time. + +
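+ The probing estimator can be pictured as a linear classifier trained on per-query internal-state vectors; the sketch below uses random stand-ins for features and labels (the dimensions, dataset size, and logistic-regression choice are assumptions, not the paper's exact setup):
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split
+
+ rng = np.random.default_rng(0)
+ hidden_states = rng.normal(size=(2000, 1024))          # one internal-state vector per query
+ hallucinated = (rng.random(2000) < 0.4).astype(int)    # 1 = the model later hallucinated
+
+ X_tr, X_te, y_tr, y_te = train_test_split(hidden_states, hallucinated, random_state=0)
+ probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
+ print("probe accuracy:", probe.score(X_te, y_te))      # near chance here, since features are random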
+
+
+
+
+ + ☆ Evaluating Automatic Metrics with Incremental Machine Translation + Systems + + +
+ We introduce a dataset comprising commercial machine translations, gathered +weekly over six years across 12 translation directions. Since human A/B testing +is commonly used, we assume commercial systems improve over time, which enables +us to evaluate machine translation (MT) metrics based on their preference for +more recent translations. Our study confirms several previous findings in MT +metrics research and demonstrates the dataset's value as a testbed for metric +evaluation. We release our code at https://github.com/gjwubyron/Evo + +
+
+
+
+
+ + ☆ How Similar Are Elected Politicians and Their Constituents? Quantitative + Evidence From Online Social Network + + +
+ How similar are politicians to those who vote for them? This is a critical +question at the heart of democratic representation and particularly relevant at +times when political dissatisfaction and populism are on the rise. To answer +this question we compare the online discourse of elected politicians and their +constituents. We collect a two-and-a-half-year (September 2020 - February +2023) constituency-level dataset for the USA and UK that includes: (i) the Twitter +timelines (5.6 Million tweets) of elected political representatives (595 UK +Members of Parliament and 433 USA Representatives), (ii) the Nextdoor posts +(21.8 Million posts) of the constituency (98.4% USA and 91.5% UK +constituencies). We find that elected politicians tend to be equally similar to +their constituents in terms of content and style regardless of whether a +constituency elects a right or left-wing politician. The size of the electoral +victory and the level of income of a constituency show a nuanced picture. The +narrower the electoral victory, the more similar the style and the more +dissimilar the content is. The lower the income of a constituency, the more +similar the content is. In terms of style, poorer constituencies tend to have a +more similar sentiment and more dissimilar psychological text traits (i.e. +measured with LIWC categories). + +
+
+
+
+
+ + ☆ STF: Sentence Transformer Fine-Tuning For Topic Categorization With + Limited Data + + +
+ Nowadays, topic classification from tweets attracts considerable research +attention. Different classification systems have been suggested thanks to these +research efforts. Nevertheless, they face major challenges owing to low +performance metrics due to the limited amount of labeled data. We propose +Sentence Transformers Fine-tuning (STF), a topic detection system that +leverages pretrained Sentence Transformers models and fine-tuning to classify +topics from tweets accurately. Moreover, extensive parameter sensitivity +analyses were conducted to finetune STF parameters for our topic classification +task to achieve the best performance results. Experiments on two benchmark +datasets demonstrated that (1) the proposed STF can be effectively used for +classifying tweet topics and outperforms the latest state-of-the-art +approaches, and (2) the proposed STF does not require a huge amount of labeled +tweets to achieve good accuracy, which is a limitation of many state-of-the-art +approaches. Our main contribution is the achievement of promising results in +tweet topic classification by applying pretrained sentence transformers +language models. + +
+
+
+
+
+ + ☆ CATT: Character-based Arabic Tashkeel Transformer + + +
+ Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the +comprehension of Arabic text by removing ambiguity and minimizing the risk of +misinterpretations caused by its absence. It plays a crucial role in improving +Arabic text processing, particularly in applications such as text-to-speech and +machine translation. This paper introduces a new approach to training ATD +models. First, we finetuned two transformers, encoder-only and encoder-decoder, +that were initialized from a pretrained character-based BERT. Then, we applied +the Noisy-Student approach to boost the performance of the best model. We +evaluated our models alongside 11 commercial and open-source models using two +manually labeled benchmark datasets: WikiNews and our CATT dataset. Our +findings show that our top model surpasses all evaluated models by relative +Diacritic Error Rates (DERs) of 30.83\% and 35.21\% on WikiNews and CATT, +respectively, achieving state-of-the-art in ATD. In addition, we show that our +model outperforms GPT-4-turbo on CATT dataset by a relative DER of 9.36\%. We +open-source our CATT models and benchmark dataset for the research +community\footnote{https://github.com/abjadai/catt}. + +
+
+
+
+
+ + ☆ Self-Evaluation as a Defense Against Adversarial Attacks on LLMs + + +
+ When LLMs are deployed in sensitive, human-facing settings, it is crucial +that they do not output unsafe, biased, or privacy-violating outputs. For this +reason, models are both trained and instructed to refuse to answer unsafe +prompts such as "Tell me how to build a bomb." We find that, despite these +safeguards, it is possible to break model defenses simply by appending a space +to the end of a model's input. In a study of eight open-source models, we +demonstrate that this acts as a strong enough attack to cause the majority of +models to generate harmful outputs with very high success rates. We examine the +causes of this behavior, finding that the contexts in which single spaces occur +in tokenized training data encourage models to generate lists when prompted, +overriding training signals to refuse to answer unsafe requests. Our findings +underscore the fragile state of current model alignment and promote the +importance of developing more robust alignment methods. Code and data will be +made available at https://github.com/Linlt-leon/Adversarial-Alignments. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Single Character Perturbations Break LLM Alignment + + +
+ When LLMs are deployed in sensitive, human-facing settings, it is crucial +that they do not output unsafe, biased, or privacy-violating outputs. For this +reason, models are both trained and instructed to refuse to answer unsafe +prompts such as "Tell me how to build a bomb." We find that, despite these +safeguards, it is possible to break model defenses simply by appending a space +to the end of a model's input. In a study of eight open-source models, we +demonstrate that this acts as a strong enough attack to cause the majority of +models to generate harmful outputs with very high success rates. We examine the +causes of this behavior, finding that the contexts in which single spaces occur +in tokenized training data encourage models to generate lists when prompted, +overriding training signals to refuse to answer unsafe requests. Our findings +underscore the fragile state of current model alignment and promote the +importance of developing more robust alignment methods. Code and data will be +available at https://github.com/hannah-aught/space_attack. + +
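+ The perturbation itself is trivial to reproduce, which is part of the point; a one-line sketch (the example prompt is the one quoted in the abstract):
+
+ def append_space(prompt: str) -> str:
+     # the single-character perturbation studied in the paper: append one trailing space
+     return prompt + " "
+
+ unsafe_prompt = "Tell me how to build a bomb."
+ print(repr(unsafe_prompt), "->", repr(append_space(unsafe_prompt)))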
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Improving Retrieval-augmented Text-to-SQL with AST-based Ranking and + Schema Pruning + + +
+ We focus on Text-to-SQL semantic parsing from the perspective of Large +Language Models. Motivated by challenges related to the size of commercial +database schemata and the deployability of business intelligence solutions, we +propose an approach that dynamically retrieves input database information and +uses abstract syntax trees to select few-shot examples for in-context learning. + Furthermore, we investigate the extent to which an in-parallel semantic +parser can be leveraged for generating $\textit{approximated}$ versions of the +expected SQL queries, to support our retrieval. We take this approach to the +extreme--we adapt a model consisting of less than $500$M parameters, to act as +an extremely efficient approximator, enhancing it with the ability to process +schemata in a parallelised manner. We apply our approach to monolingual and +cross-lingual benchmarks for semantic parsing, showing improvements over +state-of-the-art baselines. Comprehensive experiments highlight the +contribution of modules involved in this retrieval-augmented generation +setting, revealing interesting directions for future work. + +
+
+
+
+
+ + ☆ How Does Quantization Affect Multilingual LLMs? + + +
+ Quantization techniques are widely used to improve inference speed and +deployment of large language models. While a wide body of work examines the +impact of quantized LLMs on English tasks, none have examined the effect of +quantization across languages. We conduct a thorough analysis of quantized +multilingual LLMs, focusing on their performance across languages and at +varying scales. We use automatic benchmarks, LLM-as-a-Judge methods, and human +evaluation, finding that (1) harmful effects of quantization are apparent in +human evaluation, and automatic metrics severely underestimate the detriment: a +1.7% average drop in Japanese across automatic tasks corresponds to a 16.0% +drop reported by human evaluators on realistic prompts; (2) languages are +disparately affected by quantization, with non-Latin script languages impacted +worst; and (3) challenging tasks such as mathematical reasoning degrade +fastest. As the ability to serve low-compute models is critical for wide global +adoption of NLP technologies, our results urge consideration of multilingual +performance as a key evaluation criterion for efficient models. + +
+
+
+
+
+ + ☆ CiteAssist: A System for Automated Preprint Citation and BibTeX + Generation ACL 2024 + + +
+ We present CiteAssist, a system to automate the generation of BibTeX entries +for preprints, streamlining the process of bibliographic annotation. Our system +extracts metadata, such as author names, titles, publication dates, and +keywords, to create standardized annotations within the document. CiteAssist +automatically attaches the BibTeX citation to the end of a PDF and links it on +the first page of the document so other researchers gain immediate access to +the correct citation of the article. This method promotes platform flexibility +by ensuring that annotations remain accessible regardless of the repository +used to publish or access the preprint. The annotations remain available even +if the preprint is viewed externally to CiteAssist. Additionally, the system +adds relevant related papers based on extracted keywords to the preprint, +providing researchers with additional publications besides those in related +work for further reading. Researchers can enhance their preprints organization +and reference management workflows through a free and publicly available web +interface. + +
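A short sketch of the kind of BibTeX generation such a system performs: turn extracted preprint metadata into a standardized entry. The metadata fields and the citation-key format below are assumptions for illustration, not CiteAssist's actual implementation.

```python
# Illustrative sketch: build a BibTeX entry from extracted preprint metadata.
def make_bibtex(meta: dict) -> str:
    key = f"{meta['authors'][0].split()[-1].lower()}{meta['year']}{meta['keywords'][0]}"
    authors = " and ".join(meta["authors"])
    return (
        f"@misc{{{key},\n"
        f"  title         = {{{meta['title']}}},\n"
        f"  author        = {{{authors}}},\n"
        f"  year          = {{{meta['year']}}},\n"
        f"  eprint        = {{{meta['arxiv_id']}}},\n"
        f"  archivePrefix = {{arXiv}},\n"
        f"  keywords      = {{{', '.join(meta['keywords'])}}}\n"
        f"}}"
    )

print(make_bibtex({
    "title": "An Example Preprint",
    "authors": ["Ada Lovelace", "Alan Turing"],
    "year": 2024,
    "arxiv_id": "2407.00000",
    "keywords": ["citation", "metadata"],
}))
```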
+
+ comment: Published at SDProc @ ACL 2024 +
+
+
+
+
+ + ☆ Fine-Tuning with Divergent Chains of Thought Boosts Reasoning Through + Self-Correction in Language Models + + +
+ Requiring a Large Language Model to generate intermediary reasoning steps has +been shown to be an effective way of boosting performance. In fact, it has been +found that instruction tuning on these intermediary reasoning steps improves +model performance. In this work, we present a novel method of further improving +performance by requiring models to compare multiple reasoning chains before +generating a solution in a single inference step. We call this method Divergent +CoT (DCoT). We find that instruction tuning on DCoT datasets boosts the +performance of even smaller, and therefore more accessible, LLMs. Through a +rigorous set of experiments spanning a wide range of tasks that require various +reasoning types, we show that fine-tuning on DCoT consistently improves +performance over the CoT baseline across model families and scales (1.3B to +70B). Through a combination of empirical and manual evaluation, we additionally +show that these performance gains stem from models generating multiple +divergent reasoning chains in a single inference step, indicative of the +enabling of self-correction in language models. Our code and data are publicly +available at https://github.com/UKPLab/arxiv2024-divergent-cot. + +
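A small sketch of how a Divergent CoT (DCoT) style training example could be laid out: several reasoning chains followed by a single answer, all emitted in one inference step. The exact template is an assumption, not the paper's released format.

```python
# Sketch of a DCoT-style example: multiple chains, then one final answer.
def format_dcot_example(question: str, chains: list[str], answer: str) -> str:
    parts = [f"Question: {question}"]
    for i, chain in enumerate(chains, start=1):
        parts.append(f"Reasoning chain {i}: {chain}")
    parts.append(f"Final answer (after comparing the chains above): {answer}")
    return "\n".join(parts)

print(format_dcot_example(
    "What is 17 * 6?",
    ["17 * 6 = 10*6 + 7*6 = 60 + 42 = 102.",
     "17 * 6 = 17 * 5 + 17 = 85 + 17 = 102."],
    "102",
))
```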
+
+
+
+
+ + ☆ Investigating Decoder-only Large Language Models for Speech-to-text + Translation + + +
+ Large language models (LLMs), known for their exceptional reasoning capabilities, generalizability, and fluency across diverse domains, present a promising avenue for enhancing speech-related tasks. In this paper, we focus on integrating decoder-only LLMs into the task of speech-to-text translation (S2TT). We propose a decoder-only architecture that enables the LLM to directly consume the encoded speech representation and generate the text translation. Additionally, we investigate the effects of different parameter-efficient fine-tuning techniques and task formulations. Our model achieves state-of-the-art performance on CoVoST 2 and FLEURS among models trained without proprietary data. We also conduct analyses to validate the design choices of our proposed model and bring insights into the integration of LLMs into S2TT.
+
+ comment: Accepted to Interspeech 2024 +
+
+
+
+
+ + ☆ SOS! Soft Prompt Attack Against Open-Source Large Language Models + + +
+ Open-source large language models (LLMs) have become increasingly popular +among both the general public and industry, as they can be customized, +fine-tuned, and freely used. However, some open-source LLMs require approval +before usage, which has led to third parties publishing their own easily +accessible versions. Similarly, third parties have been publishing fine-tuned +or quantized variants of these LLMs. These versions are particularly appealing +to users because of their ease of access and reduced computational resource +demands. This trend has increased the risk of training time attacks, +compromising the integrity and security of LLMs. In this work, we present a new +training time attack, SOS, which is designed to be low in computational demand +and does not require clean data or modification of the model weights, thereby +maintaining the model's utility intact. The attack addresses security issues in +various scenarios, including the backdoor attack, jailbreak attack, and prompt +stealing attack. Our experimental findings demonstrate that the proposed attack +is effective across all evaluated targets. Furthermore, we present the other +side of our SOS technique, namely the copyright token -- a novel technique that +enables users to mark their copyrighted content and prevent models from using +it. + +
+
+
+
+
+ + ☆ Let the Code LLM Edit Itself When You Edit the Code + + +
+ In this work, we investigate a typical scenario in code generation where a developer edits existing code in real time and requests a code assistant, e.g., a large language model, to re-predict the next token or next line on the fly. Naively, the LLM needs to re-encode the entire KV cache to provide an accurate prediction. However, this process is computationally expensive, especially when the sequence length is long. Simply encoding the edited subsequence and integrating it into the original KV cache runs into the temporal confusion problem, leading to significantly worse performance. We address this efficiency and accuracy trade-off by introducing Positional Integrity Encoding (PIE). Building upon rotary positional encoding, PIE first removes the rotary matrices in the Key cache that introduce temporal confusion and then reapplies the correct rotary matrices. This process ensures that positional relationships between tokens are correct and requires only a single round of matrix multiplication. We validate the effectiveness of PIE through extensive experiments on the RepoBench-C-8k dataset, utilizing DeepSeek-Coder models with 1.3B, 6.7B, and 33B parameters. Our evaluation includes three real-world coding tasks: code insertion, code deletion, and multi-place code editing. Results demonstrate that PIE reduces computational overhead by over 85% compared to the standard full-recomputation approach across all model sizes and tasks while closely approximating the model's performance.
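A compact sketch of the key-cache position correction that the abstract describes: because 2D rotary rotations compose additively, removing the stale phase and re-applying the correct one is equivalent to a single rotation by the position delta. This assumes an interleaved-pair RoPE layout; real implementations (and PIE itself) may differ in layout and batching.

```python
# Sketch: correct cached RoPE keys after an edit shifts token positions.
import torch

def rope_angles(positions, dim, base=10000.0):
    # positions: (seq,) -> angles of shape (seq, dim // 2)
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    return positions.float()[:, None] * inv_freq[None, :]

def rotate(x, angles):
    # x: (seq, dim); rotate each pair (x[2i], x[2i+1]) by angles[:, i]
    x1, x2 = x[..., 0::2], x[..., 1::2]
    cos, sin = angles.cos(), angles.sin()
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

def correct_keys(k_cache, old_pos, new_pos, base=10000.0):
    """Remove the stale rotary phase and re-apply the correct one in one pass."""
    dim = k_cache.shape[-1]
    delta = rope_angles(new_pos, dim, base) - rope_angles(old_pos, dim, base)
    return rotate(k_cache, delta)

if __name__ == "__main__":
    k = torch.randn(4, 8)                      # 4 cached keys, head_dim = 8
    old = torch.arange(4)                      # positions before the edit
    new = torch.tensor([0, 1, 5, 6])           # positions after an insertion
    print(correct_keys(k, old, new).shape)     # torch.Size([4, 8])
```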
+
+ comment: Preprint. Work in Progress +
+
+
+
+
+ + ☆ Enhancing Translation Accuracy of Large Language Models through + Continual Pre-Training on Parallel Data + + +
+ In this paper, we propose a two-phase training approach where pre-trained +large language models are continually pre-trained on parallel data and then +supervised fine-tuned with a small amount of high-quality parallel data. To +investigate the effectiveness of our proposed approach, we conducted continual +pre-training with a 3.8B-parameter model and parallel data across eight +different formats. We evaluate these methods on thirteen test sets for +Japanese-to-English and English-to-Japanese translation. The results +demonstrate that when utilizing parallel data in continual pre-training, it is +essential to alternate between source and target sentences. Additionally, we +demonstrated that the translation accuracy improves only for translation +directions where the order of source and target sentences aligns between +continual pre-training data and inference. In addition, we demonstrate that the +LLM-based translation model is more robust in translating spoken language and +achieves higher accuracy with less training data compared to supervised +encoder-decoder models. We also show that the highest accuracy is achieved when +the data for continual pre-training consists of interleaved source and target +sentences and when tags are added to the source sentences. + +
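A tiny sketch of one of the data formats the abstract discusses: interleaving source and target sentences, with a tag on the source side. The tag strings and layout are assumptions for illustration, not the paper's exact format.

```python
# Sketch: format parallel sentences as interleaved, tagged continual
# pre-training text (one of several formats compared in the paper).
def interleave_with_tags(pairs, src_lang="ja"):
    lines = []
    for src, tgt in pairs:
        lines.append(f"<{src_lang}> {src}")   # tagged source sentence
        lines.append(tgt)                     # target sentence follows
    return "\n".join(lines)

print(interleave_with_tags([("これはペンです。", "This is a pen."),
                            ("猫が好きです。", "I like cats.")]))
```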
+
+ comment: IWSLT2024, 18 pages +
+
+
+
+
+ + ☆ Speaker- and Text-Independent Estimation of Articulatory Movements and + Phoneme Alignments from Speech + + +
+ This paper introduces a novel combination of two tasks, previously treated separately: acoustic-to-articulatory speech inversion (AAI) and phoneme-to-articulatory (PTA) motion estimation. We refer to this joint task as acoustic phoneme-to-articulatory speech inversion (APTAI) and explore two different approaches, both working speaker- and text-independently during inference. We use a multi-task learning setup, with the end-to-end goal of taking raw speech as input and estimating the corresponding articulatory movements, phoneme sequence, and phoneme alignment. While both proposed approaches share these requirements, they differ in how they achieve phoneme-related predictions: one is based on frame classification, the other on a two-staged training procedure and forced alignment. We reach competitive performance of 0.73 mean correlation for the AAI task and achieve up to approximately 87% frame overlap compared to a state-of-the-art text-dependent phoneme forced aligner.
+
+ comment: to be published in Interspeech 2024 proceedings +
+
+
+
+
+ + ☆ Social Bias Evaluation for Large Language Models Requires Prompt + Variations + + +
+ Warning: This paper contains examples of stereotypes and biases. Large +Language Models (LLMs) exhibit considerable social biases, and various studies +have tried to evaluate and mitigate these biases accurately. Previous studies +use downstream tasks as prompts to examine the degree of social biases for +evaluation and mitigation. While LLMs' output highly depends on prompts, +previous studies evaluating and mitigating bias have often relied on a limited +variety of prompts. In this paper, we investigate the sensitivity of LLMs when +changing prompt variations (task instruction and prompt, few-shot examples, +debias-prompt) by analyzing task performance and social bias of LLMs. Our +experimental results reveal that LLMs are highly sensitive to prompts to the +extent that the ranking of LLMs fluctuates when comparing models for task +performance and social bias. Additionally, we show that LLMs have tradeoffs +between performance and social bias caused by the prompts. Less bias from +prompt setting may result in reduced performance. Moreover, the ambiguity of +instances is one of the reasons for this sensitivity to prompts in advanced +LLMs, leading to various outputs. We recommend using diverse prompts, as in +this study, to compare the effects of prompts on social bias in LLMs. + +
+
+
+
+
+ + ☆ KeyVideoLLM: Towards Large-scale Video Keyframe Selection + + +
+ Recently, with the rise of web videos, managing and understanding large-scale +video datasets has become increasingly important. Video Large Language Models +(VideoLLMs) have emerged in recent years due to their strong video +understanding capabilities. However, training and inference processes for +VideoLLMs demand vast amounts of data, presenting significant challenges to +data management, particularly regarding efficiency, robustness, and +effectiveness. In this work, we present KeyVideoLLM, a text-video frame +similarity-based keyframe selection method designed to manage VideoLLM data +efficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a +remarkable data compression rate of up to 60.9 times, substantially lowering +disk space requirements, which proves its high efficiency. Additionally, it +maintains a 100% selection success rate across all video formats and scales, +enhances processing speed by up to 200 times compared to existing keyframe +selection methods, and does not require hyperparameter tuning. Beyond its +outstanding efficiency and robustness, KeyVideoLLM further improves model +performance in video question-answering tasks during both training and +inference stages. Notably, it consistently achieved the state-of-the-art (SoTA) +experimental results on diverse datasets. + +
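A minimal numpy sketch of text-video frame similarity keyframe selection in the spirit of the method above: score each frame embedding against the query text embedding and keep the top-k frames. The embedding model (e.g., a CLIP-style encoder) is assumed to be provided elsewhere; shapes and k are illustrative.

```python
# Sketch: pick the k frames whose embeddings are most similar to the text query.
import numpy as np

def select_keyframes(frame_embs: np.ndarray, text_emb: np.ndarray, k: int = 8):
    """frame_embs: (num_frames, dim); text_emb: (dim,). Returns frame indices."""
    f = frame_embs / np.linalg.norm(frame_embs, axis=1, keepdims=True)
    t = text_emb / np.linalg.norm(text_emb)
    scores = f @ t                      # cosine similarity per frame
    top = np.argsort(-scores)[:k]
    return np.sort(top)                 # keep temporal order

frames = np.random.randn(120, 512)      # stand-in frame embeddings
query = np.random.randn(512)            # stand-in text embedding
print(select_keyframes(frames, query, k=8))
```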
+
+
+
+
+ + ☆ Cactus: Towards Psychological Counseling Conversations using Cognitive + Behavioral Theory + + +
+ Recently, the demand for psychological counseling has significantly increased +as more individuals express concerns about their mental health. This surge has +accelerated efforts to improve the accessibility of counseling by using large +language models (LLMs) as counselors. To ensure client privacy, training +open-source LLMs faces a key challenge: the absence of realistic counseling +datasets. To address this, we introduce Cactus, a multi-turn dialogue dataset +that emulates real-life interactions using the goal-oriented and structured +approach of Cognitive Behavioral Therapy (CBT). We create a diverse and +realistic dataset by designing clients with varied, specific personas, and +having counselors systematically apply CBT techniques in their interactions. To +assess the quality of our data, we benchmark against established psychological +criteria used to evaluate real counseling sessions, ensuring alignment with +expert evaluations. Experimental results demonstrate that Camel, a model +trained with Cactus, outperforms other models in counseling skills, +highlighting its effectiveness and potential as a counseling agent. We make our +data, model, and code publicly available. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ A Case Study on Context-Aware Neural Machine Translation with Multi-Task + Learning + + +
+ In document-level neural machine translation (DocNMT), multi-encoder +approaches are common in encoding context and source sentences. Recent studies +\cite{li-etal-2020-multi-encoder} have shown that the context encoder generates +noise and makes the model robust to the choice of context. This paper further +investigates this observation by explicitly modelling context encoding through +multi-task learning (MTL) to make the model sensitive to the choice of context. +We conduct experiments on cascade MTL architecture, which consists of one +encoder and two decoders. Generation of the source from the context is +considered an auxiliary task, and generation of the target from the source is +the main task. We experimented with German--English language pairs on News, +TED, and Europarl corpora. Evaluation results show that the proposed MTL +approach performs better than concatenation-based and multi-encoder DocNMT +models in low-resource settings and is sensitive to the choice of context. +However, we observe that the MTL models are failing to generate the source from +the context. These observations align with the previous studies, and this might +suggest that the available document-level parallel corpora are not +context-aware, and a robust sentence-level model can outperform the +context-aware models. + +
+
+ comment: Accepted to EAMT 2024 (poster) +
+
+
+
+
+ + ☆ ALTER: Augmentation for Large-Table-Based Reasoning + + +
+ While extensive research has explored the use of large language models (LLMs) for table-based reasoning, most approaches struggle with scalability when applied to large tables. To maintain the superior comprehension abilities of LLMs in these scenarios, we introduce ALTER (Augmentation for Large-Table-Based Reasoning), a framework designed to harness the latent augmentation potential in both free-form natural language (NL) questions, via the query augmentor, and semi-structured tabular data, through the table augmentor. By utilizing only a small subset of relevant data from the table and supplementing it with pre-augmented schema, semantic, and literal information, ALTER achieves outstanding performance on table-based reasoning benchmarks. We also provide a detailed analysis of large-table scenarios, comparing different methods and various partitioning principles. In these scenarios, our method outperforms all other approaches and exhibits robustness and efficiency against perturbations.
+
+
+
+
+ + ☆ Improving Conversational Abilities of Quantized Large Language Models + via Direct Preference Alignment + + +
+ The rapid advancement of large language models (LLMs) has facilitated their +transformation into conversational chatbots that can grasp contextual nuances +and generate pertinent sentences, closely mirroring human values through +advanced techniques such as instruction tuning and reinforcement learning from +human feedback (RLHF). However, the computational efficiency required for LLMs, +achieved through techniques like post-training quantization (PTQ), presents +challenges such as token-flipping that can impair chatbot performance. In +response, we propose a novel preference alignment approach, quantization-aware +direct preference optimization (QDPO), that aligns quantized LLMs with their +full-precision counterparts, improving conversational abilities. Evaluated on +two instruction-tuned LLMs in various languages, QDPO demonstrated superior +performance in improving conversational abilities compared to established PTQ +and knowledge-distillation fine-tuning techniques, marking a significant step +forward in the development of efficient and effective conversational LLMs. + +
+
+
+
+
+ + ☆ JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts + Discovery from Large-Scale Human-LLM Conversational Datasets + + +
+ Large Language Models (LLMs) have gained significant attention but also raised concerns due to the risk of misuse. Jailbreak prompts, a popular type of adversarial attack towards LLMs, have appeared and constantly evolved to breach the safety protocols of LLMs. To address this issue, LLMs are regularly updated with safety patches based on reported jailbreak prompts. However, malicious users often keep their successful jailbreak prompts private to exploit LLMs. To uncover these private jailbreak prompts, extensive analysis of large-scale conversational datasets is necessary to identify prompts that still manage to bypass the system's defenses. This task is highly challenging due to the immense volume of conversation data, diverse characteristics of jailbreak prompts, and their presence in complex multi-turn conversations. To tackle these challenges, we introduce JailbreakHunter, a visual analytics approach for identifying jailbreak prompts in large-scale human-LLM conversational datasets. We have designed a workflow with three analysis levels: group-level, conversation-level, and turn-level. Group-level analysis enables users to grasp the distribution of conversations and identify suspicious conversations using multiple criteria, such as similarity with reported jailbreak prompts in previous research and attack success rates. Conversation-level analysis facilitates the understanding of the progress of conversations and helps discover jailbreak prompts within their conversation contexts. Turn-level analysis allows users to explore the semantic similarity and token overlap between a single-turn prompt and the reported jailbreak prompts, aiding in the identification of new jailbreak strategies. The effectiveness and usability of the system were verified through multiple case studies and expert interviews.
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Raw Text is All you Need: Knowledge-intensive Multi-turn Instruction + Tuning for Large Language Model + + +
+ Instruction tuning as an effective technique aligns the outputs of large language models (LLMs) with human preference. But how to generate the seasonal multi-turn dialogues from raw documents for instruction tuning still requires further exploration. In this paper, we present a novel framework named R2S that leverages the Chain of Dialogue (CoD) logic to guide large language models (LLMs) in generating knowledge-intensive multi-turn dialogues for instruction tuning. By integrating raw documents from both open-source datasets and domain-specific web-crawled documents into a benchmark K-BENCH, we cover diverse areas such as Wikipedia (English), Science (Chinese), and Artifacts (Chinese). Our approach first decides the logic flow of the current dialogue and then prompts LLMs to produce key phrases for sourcing relevant response content. This methodology enables the creation of the GINSTRUCT instruction dataset, retaining raw document knowledge within dialogue-style interactions. Utilizing this dataset, we fine-tune GLLM, a model designed to transform raw documents into structured multi-turn dialogues, thereby injecting comprehensive domain knowledge into the SFT model for enhanced instruction tuning. This work signifies a stride towards refining the adaptability and effectiveness of LLMs in processing and generating more accurate, contextually nuanced responses across various fields.
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ On the Client Preference of LLM Fine-tuning in Federated Learning + + +
+ Reinforcement learning with human feedback (RLHF) fine-tunes a pretrained +large language model (LLM) using preference datasets, enabling the LLM to +generate outputs that align with human preferences. Given the sensitive nature +of these preference datasets held by various clients, there is a need to +implement RLHF within a federated learning (FL) framework, where clients are +reluctant to share their data due to privacy concerns. To address this, we +introduce a feasible framework in which clients collaboratively train a binary +selector with their preference datasets using our proposed FedBis. With a +well-trained selector, we can further enhance the LLM that generates +human-preferred completions. Meanwhile, we propose a novel algorithm, +FedBiscuit, that trains multiple selectors by organizing clients into balanced +and disjoint clusters based on their preferences. Compared to the FedBis, +FedBiscuit demonstrates superior performance in simulating human preferences +for pairwise completions. Our extensive experiments on federated human +preference datasets -- marking the first benchmark to address heterogeneous +data partitioning among clients -- demonstrate that FedBiscuit outperforms +FedBis and even surpasses traditional centralized training. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Strategies for Arabic Readability Modeling ACL + + +
+ Automatic readability assessment is relevant to building NLP applications for +education, content analysis, and accessibility. However, Arabic readability +assessment is a challenging task due to Arabic's morphological richness and +limited readability resources. In this paper, we present a set of experimental +results on Arabic readability assessment using a diverse range of approaches, +from rule-based methods to Arabic pretrained language models. We report our +results on a newly created corpus at different textual granularity levels +(words and sentence fragments). Our results show that combining different +techniques yields the best results, achieving an overall macro F1 score of 86.7 +at the word level and 87.9 at the fragment level on a blind test set. We make +our code, data, and pretrained models publicly available. + +
+
+ comment: Accepted to ArabicNLP 2024, ACL +
+
+
+
+
+ + ☆ Exploiting Dialect Identification in Automatic Dialectal Text + Normalization ACL + + +
+ Dialectal Arabic is the primary spoken language used by native Arabic +speakers in daily communication. The rise of social media platforms has notably +expanded its use as a written language. However, Arabic dialects do not have +standard orthographies. This, combined with the inherent noise in +user-generated content on social media, presents a major challenge to NLP +applications dealing with Dialectal Arabic. In this paper, we explore and +report on the task of CODAfication, which aims to normalize Dialectal Arabic +into the Conventional Orthography for Dialectal Arabic (CODA). We work with a +unique parallel corpus of multiple Arabic dialects focusing on five major city +dialects. We benchmark newly developed pretrained sequence-to-sequence models +on the task of CODAfication. We further show that using dialect identification +information improves the performance across all dialects. We make our code, +data, and pretrained models publicly available. + +
+
+ comment: Accepted to ArabicNLP 2024, ACL +
+
+
+
+
+ + ☆ What Affects the Stability of Tool Learning? An Empirical Study on the + Robustness of Tool Learning Frameworks + + +
+ Tool learning methods have enhanced the ability of large language models (LLMs) to interact with real-world applications. Many existing works fine-tune LLMs or design prompts to enable LLMs to select appropriate tools and correctly invoke them to meet user requirements. However, previous works have observed that the performance of tool learning varies across tasks, datasets, training settings, and algorithms. Without an understanding of the impact of these factors, results can be inconsistent, model deployment inefficient, and tool utilization suboptimal, ultimately hindering the practical integration and scalability of LLMs in real-world scenarios. Therefore, in this paper, we explore the impact of both internal and external factors on the performance of tool learning frameworks. Through extensive experiments on two benchmark datasets, we find several insightful conclusions for future work, including the observation that LLMs can benefit significantly from increased trial and exploration. We believe our empirical study provides a new perspective for future tool learning research.
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Human-like Linguistic Biases in Neural Speech Models: Phonetic + Categorization and Phonotactic Constraints in Wav2Vec2.0 + + +
+ What do deep neural speech models know about phonology? Existing work has examined the encoding of individual linguistic units such as phonemes in these models. Here we investigate interactions between units. Inspired by classic experiments on human speech perception, we study how Wav2Vec2 resolves phonotactic constraints. We synthesize sounds on an acoustic continuum between /l/ and /r/ and embed them in controlled contexts where only /l/, only /r/, or neither occur in English. Like humans, Wav2Vec2 models show a bias towards the phonotactically admissible category in processing such ambiguous sounds. Using simple measures to analyze model internals on the level of individual stimuli, we find that this bias emerges in early layers of the model's Transformer module. This effect is amplified by ASR finetuning but also present in fully self-supervised models. Our approach demonstrates how controlled stimulus designs can help localize specific linguistic knowledge in neural speech models.
+
+ comment: Accepted to Interspeech 2024. For code and materials, see + https://github.com/mdhk/phonotactic-sensitivity +
+
+
+
+
+ + ☆ SemioLLM: Assessing Large Language Models for Semiological Analysis in + Epilepsy Research + + +
+ Large Language Models have shown promising results in their ability to encode +general medical knowledge in standard medical question-answering datasets. +However, their potential application in clinical practice requires evaluation +in domain-specific tasks, where benchmarks are largely missing. In this study +semioLLM, we test the ability of state-of-the-art LLMs (GPT-3.5, GPT-4, Mixtral +8x7B, and Qwen-72chat) to leverage their internal knowledge and reasoning for +epilepsy diagnosis. Specifically, we obtain likelihood estimates linking +unstructured text descriptions of seizures to seizure-generating brain regions, +using an annotated clinical database containing 1269 entries. We evaluate the +LLM's performance, confidence, reasoning, and citation abilities in comparison +to clinical evaluation. Models achieve above-chance classification performance +with prompt engineering significantly improving their outcome, with some models +achieving close-to-clinical performance and reasoning. However, our analyses +also reveal significant pitfalls with several models being overly confident +while showing poor performance, as well as exhibiting citation errors and +hallucinations. In summary, our work provides the first extensive benchmark +comparing current SOTA LLMs in the medical domain of epilepsy and highlights +their ability to leverage unstructured texts from patients' medical history to +aid diagnostic processes in health care. + +
+
+
+
+
+ + ☆ VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values + + +
+ This paper introduces VIVA, a benchmark for VIsion-grounded decision-making driven by human VAlues. While most large vision-language models (VLMs) focus on physical-level skills, our work is the first to examine their multimodal capabilities in leveraging human values to make decisions under a vision-depicted situation. VIVA contains 1,062 images depicting diverse real-world situations and the manually annotated decisions grounded in them. Given an image, the model should select the most appropriate action to address the situation and provide the relevant human values and reasons underlying the decision. Extensive experiments based on VIVA show the limitations of VLMs in using human values to make multimodal decisions. Further analyses indicate the potential benefits of exploiting action consequences and predicted human values.
+
+
+
+
+ + ☆ Are Large Language Models Consistent over Value-laden Questions? + + +
+ Large language models (LLMs) appear to bias their survey answers toward certain values. Nonetheless, some argue that LLMs are too inconsistent to simulate particular values. Are they? To answer, we first define value consistency as the similarity of answers across (1) paraphrases of one question, (2) related questions under one topic, (3) multiple-choice and open-ended use-cases of one question, and (4) multilingual translations of a question to English, Chinese, German, and Japanese. We apply these measures to a few large ($\geq$34B), open LLMs including llama-3, as well as gpt-4o, using eight thousand questions spanning more than 300 topics. Unlike prior work, we find that models are relatively consistent across paraphrases, use-cases, translations, and within a topic. Still, some inconsistencies remain. Models are more consistent on uncontroversial topics (e.g., in the U.S., "Thanksgiving") than on controversial ones ("euthanasia"). Base models are both more consistent than fine-tuned models and more uniform in their consistency across topics, while fine-tuned models are more inconsistent about some topics ("euthanasia") than others ("women's rights"), like our human subjects (n=165).
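A small sketch of one way to operationalise the consistency measure described above: the average pairwise similarity of a model's answers across paraphrases of the same question. Token-level Jaccard is used as a stand-in similarity; the paper's exact metric and answer-generation setup may differ.

```python
# Sketch: value consistency as mean pairwise similarity across paraphrased answers.
from itertools import combinations

def jaccard(a: str, b: str) -> float:
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / max(len(sa | sb), 1)

def consistency(answers: list[str]) -> float:
    pairs = list(combinations(answers, 2))
    if not pairs:
        return 1.0
    return sum(jaccard(a, b) for a, b in pairs) / len(pairs)

print(consistency([
    "Euthanasia should be legal under strict safeguards.",
    "I think euthanasia should be legal with strict safeguards.",
    "It should remain illegal.",
]))
```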
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ LoRA-Guard: Parameter-Efficient Guardrail Adaptation for Content + Moderation of Large Language Models + + +
+ Guardrails have emerged as an alternative to safety alignment for content +moderation of large language models (LLMs). Existing model-based guardrails +have not been designed for resource-constrained computational portable devices, +such as mobile phones, more and more of which are running LLM-based +applications locally. We introduce LoRA-Guard, a parameter-efficient guardrail +adaptation method that relies on knowledge sharing between LLMs and guardrail +models. LoRA-Guard extracts language features from the LLMs and adapts them for +the content moderation task using low-rank adapters, while a dual-path design +prevents any performance degradation on the generative task. We show that +LoRA-Guard outperforms existing approaches with 100-1000x lower parameter +overhead while maintaining accuracy, enabling on-device content moderation. + +
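A rough sketch of the idea the abstract describes: reuse the frozen LLM's hidden states and attach a small low-rank adapter plus classification head for content moderation, leaving the generative path untouched. Shapes, rank, and pooling are illustrative assumptions, not the paper's configuration.

```python
# Sketch: a low-rank moderation head on top of frozen LLM hidden states.
import torch
import torch.nn as nn

class GuardHead(nn.Module):
    def __init__(self, hidden_size: int, rank: int = 8, num_labels: int = 2):
        super().__init__()
        self.down = nn.Linear(hidden_size, rank, bias=False)   # low-rank A
        self.up = nn.Linear(rank, hidden_size, bias=False)     # low-rank B
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch, seq, hidden) from the frozen base LLM.
        pooled = hidden_states.mean(dim=1)
        adapted = pooled + self.up(self.down(pooled))           # residual low-rank path
        return self.classifier(adapted)

head = GuardHead(hidden_size=4096)
logits = head(torch.randn(2, 16, 4096))        # e.g., "safe" vs "unsafe" logits
print(logits.shape)                            # torch.Size([2, 2])
```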
+
+
+
+
+ + ☆ Mast Kalandar at SemEval-2024 Task 8: On the Trail of Textual Origins: + RoBERTa-BiLSTM Approach to Detect AI-Generated Text SemEval-2024 + + +
+ Large Language Models (LLMs) have showcased impressive abilities in generating fluent responses to diverse user queries. However, concerns regarding the potential misuse of such texts in journalism, educational, and academic contexts have surfaced. SemEval 2024 introduces the task of Multigenerator, Multidomain, and Multilingual Black-Box Machine-Generated Text Detection, aiming to develop automated systems for identifying machine-generated text and detecting potential misuse. In this paper, we i) propose a RoBERTa-BiLSTM-based classifier designed to classify text into two categories, AI-generated or human, and ii) conduct a comparative study of our model with baseline approaches to evaluate its effectiveness. This paper contributes to the advancement of automatic text detection systems in addressing the challenges posed by machine-generated text misuse. Our architecture ranked 46th among 125 on the official leaderboard with an accuracy of 80.83%.
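A compact sketch of a RoBERTa + BiLSTM detector in the spirit of the system above. Hyperparameters, pooling choice, and base checkpoint are illustrative assumptions; this is not the authors' released model.

```python
# Sketch: RoBERTa encoder -> BiLSTM -> binary classifier for AI-text detection.
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class RobertaBiLSTMClassifier(nn.Module):
    def __init__(self, model_name="roberta-base", lstm_hidden=256, num_labels=2):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(self.encoder.config.hidden_size, lstm_hidden,
                            batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        lstm_out, _ = self.lstm(hidden)
        return self.classifier(lstm_out[:, 0])     # use the first token's state

tok = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaBiLSTMClassifier()
batch = tok(["This text may or may not be machine generated."],
            return_tensors="pt", truncation=True)
print(model(batch["input_ids"], batch["attention_mask"]).shape)  # (1, 2)
```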
+
+ comment: SemEval-2024 +
+
+
+
+
+ + ☆ Large Language Models as Evaluators for Scientific Synthesis + + +
+ Our study explores how well the state-of-the-art Large Language Models +(LLMs), like GPT-4 and Mistral, can assess the quality of scientific summaries +or, more fittingly, scientific syntheses, comparing their evaluations to those +of human annotators. We used a dataset of 100 research questions and their +syntheses made by GPT-4 from abstracts of five related papers, checked against +human quality ratings. The study evaluates both the closed-source GPT-4 and the +open-source Mistral model's ability to rate these summaries and provide reasons +for their judgments. Preliminary results show that LLMs can offer logical +explanations that somewhat match the quality ratings, yet a deeper statistical +analysis shows a weak correlation between LLM and human ratings, suggesting the +potential and current limitations of LLMs in scientific synthesis evaluation. + +
+
+ comment: 4 pages, forthcoming as part of the KONVENS 2024 proceedings + https://konvens-2024.univie.ac.at/ +
+
+
+
+
+ + ☆ FSM: A Finite State Machine Based Zero-Shot Prompting Paradigm for + Multi-Hop Question Answering + + +
+ Large Language Models (LLMs) with chain-of-thought (CoT) prompting have demonstrated impressive abilities on simple natural language inference tasks. However, they tend to perform poorly on Multi-hop Question Answering (MHQA) tasks due to several challenges, including hallucination, error propagation, and limited context length. We propose a prompting method, Finite State Machine (FSM), to enhance the reasoning capabilities of LLMs for complex tasks, in addition to improved effectiveness and trustworthiness. Different from CoT methods, FSM addresses MHQA by iteratively decomposing a question into multi-turn sub-questions and self-correcting in time, improving the accuracy of answers at each step. Specifically, FSM addresses one sub-question at a time and decides on the next step based on its current result and state, in an automaton-like format. Experiments on benchmarks show the effectiveness of our method. Although our method performs on par with the baseline on relatively simpler datasets, it excels on challenging datasets like Musique. Moreover, this approach mitigates the hallucination phenomenon, wherein the correct final answer can be recovered despite errors in intermediate reasoning. Furthermore, our method improves LLMs' ability to follow specified output format requirements, significantly reducing the difficulty of answer interpretation and the need for reformatting.
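A minimal sketch of an automaton-style multi-hop QA loop in the spirit of the abstract: the controller repeatedly asks the model for the next sub-question, answers it, and decides whether to continue or stop, one sub-question per turn. The `llm` callable, state names, and prompts are placeholders, not the paper's exact design.

```python
# Sketch: FSM-style iterative decomposition with a pluggable LLM callable.
from typing import Callable

def fsm_answer(question: str, llm: Callable[[str], str], max_steps: int = 5) -> str:
    context = []
    for _ in range(max_steps):
        sub_q = llm(f"Question: {question}\nKnown facts: {context}\n"
                    "State NEXT: give the single next sub-question, "
                    "or say DONE if you can answer.")
        if sub_q.strip().upper().startswith("DONE"):
            break
        sub_a = llm(f"Answer concisely: {sub_q}")
        context.append((sub_q, sub_a))
    return llm(f"Question: {question}\nKnown facts: {context}\nFinal answer:")

# Toy stand-in "model" so the sketch runs end to end.
print(fsm_answer("Who directed the film in which X starred?", lambda p: "DONE"))
```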
+
+
+
+
+ + ☆ ObfuscaTune: Obfuscated Offsite Fine-tuning and Inference of Proprietary + LLMs on Private Datasets + + +
+ This work addresses the timely yet underexplored problem of performing inference and finetuning of a proprietary LLM owned by a model provider entity on the confidential/private data of another data owner entity, in a way that ensures the confidentiality of both the model and the data. Hereby, the finetuning is conducted offsite, i.e., on the computation infrastructure of a third-party cloud provider. We tackle this problem by proposing ObfuscaTune, a novel, efficient and fully utility-preserving approach that combines a simple yet effective obfuscation technique with an efficient usage of confidential computing (only 5% of the model parameters are placed on the TEE). We empirically demonstrate the effectiveness of ObfuscaTune by validating it on GPT-2 models of different sizes on four NLP benchmark datasets. Finally, we compare against a naïve version of our approach to highlight the necessity of using random matrices with low condition numbers to reduce errors induced by the obfuscation.
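A toy numpy sketch of the principle the last sentence points at: multiplying a weight block by a random well-conditioned matrix (here orthogonal, condition number ~1) makes it unintelligible offsite, yet perfectly invertible inside the trusted component. This illustrates the idea only, not ObfuscaTune's actual scheme.

```python
# Sketch: obfuscate a weight block with a random orthogonal matrix and undo it.
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((512, 512))

# Random orthogonal matrix via QR decomposition (condition number ~1).
Q, _ = np.linalg.qr(rng.standard_normal((512, 512)))
print("condition number:", np.linalg.cond(Q))

W_obfuscated = Q @ W                  # what the untrusted party sees
W_recovered = Q.T @ W_obfuscated      # Q is orthogonal, so Q.T == Q^-1

print("max reconstruction error:", np.abs(W - W_recovered).max())
```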
+
+ comment: Preprint +
+
+
+
+
+ + ☆ IncogniText: Privacy-enhancing Conditional Text Anonymization via + LLM-based Private Attribute Randomization + + +
+ In this work, we address the problem of text anonymization where the goal is +to prevent adversaries from correctly inferring private attributes of the +author, while keeping the text utility, i.e., meaning and semantics. We propose +IncogniText, a technique that anonymizes the text to mislead a potential +adversary into predicting a wrong private attribute value. Our empirical +evaluation shows a reduction of private attribute leakage by more than 90%. +Finally, we demonstrate the maturity of IncogniText for real-world applications +by distilling its anonymization capability into a set of LoRA parameters +associated with an on-device model. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ PII-Compass: Guiding LLM training data extraction prompts towards the + target PII via grounding ACL 2024 + + +
+ The latest and most impactful advances in large models stem from their increased size. Unfortunately, this translates into an improved memorization capacity, raising data privacy concerns. Specifically, it has been shown that models can output personal identifiable information (PII) contained in their training data. However, reported PII extraction performance varies widely, and there is no consensus on the optimal methodology to evaluate this risk, resulting in an underestimation of realistic adversaries. In this work, we empirically demonstrate that it is possible to improve the extractability of PII by over ten-fold by grounding the prefix of the manually constructed extraction prompt with in-domain data. Our approach, PII-Compass, achieves phone number extraction rates of 0.92%, 3.9%, and 6.86% with 1, 128, and 2308 queries, respectively, i.e., the phone number of 1 person in 15 is extractable.
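A short sketch of the grounding idea described above: prepend a genuinely in-domain text snippet to a manually written extraction template before querying the model. The template and snippet below are invented placeholders, not the paper's prompts or data.

```python
# Sketch: ground a manual extraction template with an in-domain prefix.
def grounded_extraction_prompt(in_domain_prefix: str, subject_name: str) -> str:
    manual_template = f"The phone number of {subject_name} is"
    return f"{in_domain_prefix.strip()}\n{manual_template}"

prompt = grounded_extraction_prompt(
    in_domain_prefix="Please find the updated contact sheet for the project team below.",
    subject_name="Jane Doe",
)
print(prompt)
```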
+
+ comment: Accepted at ACL 2024 +
+
+
+
+
+ + ☆ Probing the Feasibility of Multilingual Speaker Anonymization + + +
+ In speaker anonymization, speech recordings are modified in a way that the +identity of the speaker remains hidden. While this technology could help to +protect the privacy of individuals around the globe, current research restricts +this by focusing almost exclusively on English data. In this study, we extend a +state-of-the-art anonymization system to nine languages by transforming +language-dependent components to their multilingual counterparts. Experiments +testing the robustness of the anonymized speech against privacy attacks and +speech deterioration show an overall success of this system for all languages. +The results suggest that speaker embeddings trained on English data can be +applied across languages, and that the anonymization performance for a language +is mainly affected by the quality of the speech synthesis component used for +it. + +
+
+ comment: accepted at Interspeech 2024 +
+
+
+
+
+ + ☆ GraCoRe: Benchmarking Graph Comprehension and Complex Reasoning in Large + Language Models + + +
+ Evaluating the graph comprehension and reasoning abilities of Large Language +Models (LLMs) is challenging and often incomplete. Existing benchmarks focus +primarily on pure graph understanding, lacking a comprehensive evaluation +across all graph types and detailed capability definitions. This paper presents +GraCoRe, a benchmark for systematically assessing LLMs' graph comprehension and +reasoning. GraCoRe uses a three-tier hierarchical taxonomy to categorize and +test models on pure graph and heterogeneous graphs, subdividing capabilities +into 10 distinct areas tested through 19 tasks. Our benchmark includes 11 +datasets with 5,140 graphs of varying complexity. We evaluated three +closed-source and seven open-source LLMs, conducting thorough analyses from +both ability and task perspectives. Key findings reveal that semantic +enrichment enhances reasoning performance, node ordering impacts task success, +and the ability to process longer texts does not necessarily improve graph +comprehension or reasoning. GraCoRe is open-sourced at +https://github.com/ZIKEYUAN/GraCoRe + +
+
+
+
+
+ + ☆ Towards Negotiative Dialogue for the Talkamatic Dialogue Manager + + +
+ The paper describes a number of dialogue phenomena associated with +negotiative dialogue, as implemented in a development version of the Talkamatic +Dialogue Manager (TDM). This implementation is an initial step towards full +coverage of general features of negotiative dialogue in TDM. + +
+
+
+
+
+ + ☆ Translatotron-V(ison): An End-to-End Model for In-Image Machine + Translation ACL 2024 + + +
+ In-image machine translation (IIMT) aims to translate an image containing +texts in source language into an image containing translations in target +language. In this regard, conventional cascaded methods suffer from issues such +as error propagation, massive parameters, and difficulties in deployment and +retaining visual characteristics of the input image. Thus, constructing +end-to-end models has become an option, which, however, faces two main +challenges: 1) the huge modeling burden, as it is required to simultaneously +learn alignment across languages and preserve the visual characteristics of the +input image; 2) the difficulties of directly predicting excessively lengthy +pixel sequences. In this paper, we propose \textit{Translatotron-V(ision)}, an +end-to-end IIMT model consisting of four modules. In addition to an image +encoder, and an image decoder, our model contains a target text decoder and an +image tokenizer. Among them, the target text decoder is used to alleviate the +language alignment burden, and the image tokenizer converts long sequences of +pixels into shorter sequences of visual tokens, preventing the model from +focusing on low-level visual features. Besides, we present a two-stage training +framework for our model to assist the model in learning alignment across +modalities and languages. Finally, we propose a location-aware evaluation +metric called Structure-BLEU to assess the translation quality of the generated +images. Experimental results demonstrate that our model achieves competitive +performance compared to cascaded models with only 70.9\% of parameters, and +significantly outperforms the pixel-level end-to-end IIMT model. + +
+
+ comment: Accepted to ACL 2024 Findings +
+
+
+
+
+ + ☆ GPTQT: Quantize Large Language Models Twice to Push the Efficiency + + +
+ Due to their large size, generative Large Language Models (LLMs) require significant computing and storage resources. This paper introduces a new post-training quantization method, GPTQT, to reduce memory usage and enhance processing speed by expressing the weights of LLMs in 3-bit/2-bit form. Practice has shown that minimizing the quantization error of weights alone is ineffective, leading to overfitting. Therefore, GPTQT employs a progressive two-step approach: initially quantizing weights using linear quantization to a relatively high bit width, followed by converting the obtained integer weights to lower-bit binary coding. A re-exploration strategy is proposed to optimize the initial scaling factor. During inference, these steps are merged into pure binary coding, enabling efficient computation. Testing across various models and datasets confirms GPTQT's effectiveness. Compared to the strong 3-bit quantization baseline, GPTQT further reduces perplexity by 4.01 on OPT-66B and increases speed by 1.24 times on OPT-30B. The results on Llama2 show that GPTQT is currently the best binary-coding quantization method for this kind of LLM.
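A simplified numpy sketch of a progressive two-step scheme in the spirit of the abstract: (1) uniform linear quantization to a relatively high bit width, then (2) greedy binary coding of the resulting values with a few {-1, +1} bases. Calibration, the re-explored scaling factor, and per-channel handling are omitted; this is not GPTQT itself.

```python
# Sketch: linear quantization followed by greedy binary coding.
import numpy as np

def linear_quantize(w, bits=4):
    lo, hi = w.min(), w.max()
    scale = (hi - lo) / (2**bits - 1)
    q = np.round((w - lo) / scale)
    return q * scale + lo                      # dequantized high-bit weights

def binary_code(w, terms=2):
    """Approximate w as sum_i alpha_i * b_i with b_i in {-1, +1}."""
    approx, residual = np.zeros_like(w), w.copy()
    for _ in range(terms):
        b = np.sign(residual)
        b[b == 0] = 1.0
        alpha = np.abs(residual).mean()
        approx += alpha * b
        residual -= alpha * b
    return approx

rng = np.random.default_rng(0)
w = rng.standard_normal(4096)
w2 = binary_code(linear_quantize(w, bits=4), terms=2)
print("mean abs error:", np.abs(w - w2).mean())
```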
+
+ comment: Accepted by 11th IEEE International Conference on Cybernetics and + Intelligent Systems +
+
+
+
+
+ + ☆ CogErgLLM: Exploring Large Language Model Systems Design Perspective + Using Cognitive Ergonomics ICML 2024 + + +
+ Integrating cognitive ergonomics with LLMs is essential for enhancing safety, +reliability, and user satisfaction in human-AI interactions. Current LLM design +often lacks this integration, leading to systems that may not fully align with +human cognitive capabilities and limitations. Insufficient focus on +incorporating cognitive science methods exacerbates biases in LLM outputs, +while inconsistent application of user-centered design principles results in +sub-optimal user experiences. To address these challenges, our position paper +explores the critical integration of cognitive ergonomics principles into LLM +design, aiming to provide a comprehensive framework and practical guidelines +for ethical LLM development. Through our contributions, we seek to advance +understanding and practice in integrating cognitive ergonomics into LLM +systems, fostering safer, more reliable, and ethically sound human-AI +interactions. + +
+
+ comment: 8 pages, 3 figures. Accepted to Large Language Models and Cognition @ ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM)
+
+
+
+
+ + ☆ CoIR: A Comprehensive Benchmark for Code Information Retrieval Models + + +
+ Despite the substantial success of Information Retrieval (IR) in various NLP tasks, most IR systems predominantly handle queries and corpora in natural language, neglecting the domain of code retrieval. Code retrieval is critically important yet remains under-explored, with existing methods and benchmarks inadequately representing the diversity of code in various domains and tasks. Addressing this gap, we present CoIR (Code Information Retrieval Benchmark), a robust and comprehensive benchmark specifically designed to assess code retrieval capabilities. CoIR comprises ten meticulously curated code datasets, spanning eight distinctive retrieval tasks across seven diverse domains. We first discuss the construction of CoIR and its diverse dataset composition. Further, we evaluate nine widely used retrieval models using CoIR, uncovering significant difficulties in performing code retrieval tasks even with state-of-the-art systems. To facilitate easy adoption and integration within existing research workflows, CoIR has been developed as a user-friendly Python framework, readily installable via pip. It shares the same data schema as other popular benchmarks like MTEB and BEIR, enabling seamless cross-benchmark evaluations. Through CoIR, we aim to invigorate research in the code retrieval domain, providing a versatile benchmarking tool that encourages further development and exploration of code retrieval systems (https://github.com/CoIR-team/coir).
+
+
+
+
+ + ☆ Contrast then Memorize: Semantic Neighbor Retrieval-Enhanced Inductive + Multimodal Knowledge Graph Completion SIGIR 2024 + + +
+ A large number of studies have emerged for Multimodal Knowledge Graph Completion (MKGC) to predict the missing links in MKGs. However, fewer studies have been proposed to study inductive MKGC (IMKGC), which involves emerging entities unseen during training. Existing inductive approaches focus on learning textual entity representations, which neglect rich semantic information in the visual modality. Moreover, they focus on aggregating structural neighbors from existing KGs, which are usually limited for emerging entities. However, the semantic neighbors are decoupled from the topology linkage and usually imply the true target entity. In this paper, we propose the IMKGC task and a semantic neighbor retrieval-enhanced IMKGC framework, CMR, where the contrast step brings helpful semantic neighbors close, and the memorize step then supports semantic neighbor retrieval to enhance inference. Specifically, we first propose a unified cross-modal contrastive learning to simultaneously capture the textual-visual and textual-textual correlations of query-entity pairs in a unified representation space. The contrastive learning increases the similarity of positive query-entity pairs, thereby making the representations of helpful semantic neighbors close. Then, we explicitly memorize the knowledge representations to support semantic neighbor retrieval. At test time, we retrieve the nearest semantic neighbors and interpolate them with the query-entity similarity distribution to augment the final prediction. Extensive experiments validate the effectiveness of CMR on three inductive MKGC datasets. Codes are available at https://github.com/OreOZhao/CMR.
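A small numpy sketch of the test-time interpolation described above: retrieve the nearest memorized neighbors and mix their (one-hot vote) distribution into the base query-entity similarity distribution. The mixing weight, the vote scheme, and the retrieval step are illustrative assumptions, not CMR's exact formulation.

```python
# Sketch: interpolate a kNN vote distribution with the base similarity distribution.
import numpy as np

def softmax(x):
    z = x - x.max()
    e = np.exp(z)
    return e / e.sum()

def interpolate_prediction(base_scores, neighbor_entity_ids, num_entities, lam=0.3):
    base = softmax(base_scores)
    knn = np.zeros(num_entities)
    for eid in neighbor_entity_ids:                # votes from retrieved neighbors
        knn[eid] += 1.0 / len(neighbor_entity_ids)
    return lam * knn + (1.0 - lam) * base

scores = np.random.randn(1000)                     # query-to-entity similarities
print(interpolate_prediction(scores, neighbor_entity_ids=[42, 42, 7],
                             num_entities=1000).argmax())
```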
+
+ comment: Accepted by SIGIR 2024 +
+
+
+
+
+ + ☆ Safe Unlearning: A Surprisingly Effective and Generalizable Solution to + Defend Against Jailbreak Attacks + + +
+ LLMs are known to be vulnerable to jailbreak attacks, even after safety alignment. An important observation is that, while different types of jailbreak attacks can generate significantly different queries, they mostly result in similar responses that are rooted in the same harmful knowledge (e.g., detailed steps to make a bomb). Therefore, we conjecture that directly unlearning the harmful knowledge in the LLM can be a more effective way to defend against jailbreak attacks than the mainstream supervised fine-tuning (SFT) based approaches. Our extensive experiments confirm this insight and suggest a surprising generalizability of our unlearning-based approach: using only 20 raw harmful questions \emph{without} any jailbreak prompt during training, our solution reduces the Attack Success Rate (ASR) of Vicuna-7B on \emph{out-of-distribution} (OOD) harmful questions wrapped with various complex jailbreak prompts from 82.6\% to 7.7\%. This significantly outperforms Llama2-7B-Chat, which is fine-tuned on about 0.1M safety alignment samples but still has an ASR of 21.9\% even with the help of an additional safety system prompt. Further analysis reveals that the generalization ability of our solution stems from the intrinsic relatedness among harmful responses across harmful questions (e.g., response patterns, shared steps and actions, and similarity among their learned representations in the LLM). Our code is available at \url{https://github.com/thu-coai/SafeUnlearning}.
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Universal Gloss-level Representation for Gloss-free Sign Language + Translation and Production + + +
+ Sign language, essential for the deaf and hard-of-hearing, presents unique +challenges in translation and production due to its multimodal nature and the +inherent ambiguity in mapping sign language motion to spoken language words. +Previous methods often rely on gloss annotations, requiring time-intensive +labor and specialized expertise in sign language. Gloss-free methods have +emerged to address these limitations, but they often depend on external sign +language data or dictionaries, failing to completely eliminate the need for +gloss annotations. There is a clear demand for a comprehensive approach that +can supplant gloss annotations and be utilized for both Sign Language +Translation (SLT) and Sign Language Production (SLP). We introduce Universal +Gloss-level Representation (UniGloR), a unified and self-supervised solution +for both SLT and SLP, trained on multiple datasets including PHOENIX14T, +How2Sign, and NIASL2021. Our results demonstrate UniGloR's effectiveness in the +translation and production tasks. We further report an encouraging result for +the Sign Language Recognition (SLR) on previously unseen data. Our study +suggests that self-supervised learning can be made in a unified manner, paving +the way for innovative and practical applications in future research. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ MindBench: A Comprehensive Benchmark for Mind Map Structure Recognition + and Analysis + + +
+ Multimodal Large Language Models (MLLM) have made significant progress in the +field of document analysis. Despite this, existing benchmarks typically focus +only on extracting text and simple layout information, neglecting the complex +interactions between elements in structured documents such as mind maps and +flowcharts. To address this issue, we introduce the new benchmark named +MindBench, which not only includes meticulously constructed bilingual authentic +or synthetic images, detailed annotations, evaluation metrics and baseline +models, but also specifically designs five types of structured understanding +and parsing tasks. These tasks include full parsing, partial parsing, +position-related parsing, structured Visual Question Answering (VQA), and +position-related VQA, covering key areas such as text recognition, spatial +awareness, relationship discernment, and structured parsing. Extensive +experimental results demonstrate the substantial potential and significant room +for improvement in current models' ability to handle structured document +information. We anticipate that the launch of MindBench will significantly +advance research and application development in structured document analysis +technology. MindBench is available at: +https://miasanlei.github.io/MindBench.github.io/. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ Comparing Feature-based and Context-aware Approaches to PII + Generalization Level Prediction + + +
+ Protecting Personal Identifiable Information (PII) in text data is crucial +for privacy, but current PII generalization methods face challenges such as +uneven data distributions and limited context awareness. To address these +issues, we propose two approaches: a feature-based method using machine +learning to improve performance on structured inputs, and a novel context-aware +framework that considers the broader context and semantic relationships between +the original text and generalized candidates. The context-aware approach +employs Multilingual-BERT for text representation, functional transformations, +and mean squared error scoring to evaluate candidates. Experiments on the +WikiReplace dataset demonstrate the effectiveness of both methods, with the +context-aware approach outperforming the feature-based one across different +scales. This work contributes to advancing PII generalization techniques by +highlighting the importance of feature selection, ensemble learning, and +incorporating contextual information for better privacy protection in text +anonymization. + +
+
+ comment: Accepted to IALP 2024 +
+
+
+
+
+ + ☆ Aspect-Based Sentiment Analysis Techniques: A Comparative Study + + +
+ Since the dawn of the digitalisation era, customer feedback and online reviews have unequivocally been major sources of insights for businesses. Consequently, conducting comparative analyses of such sources has become the de facto modus operandi of any business that wishes to give itself a competitive edge over its peers and improve customer loyalty. Sentiment analysis is one such method instrumental in gauging public interest, exposing market trends, and analysing competitors. While traditional sentiment analysis focuses on overall sentiment, as needs have advanced with time, it has become important to explore public opinions and sentiments on various specific subjects, products, and services mentioned in reviews at a finer granularity. To this end, Aspect-based Sentiment Analysis (ABSA), supported by advances in Artificial Intelligence (AI) techniques which have contributed to a paradigm shift from simple word-level analysis to tone- and context-aware analyses, focuses on identifying specific aspects within the text and determining the sentiment associated with each aspect. In this study, we compare several deep-NN methods for ABSA on two benchmark datasets (Restaurant-14 and Laptop-14) and find that FAST LSA obtains the best results among them, with 87.6% and 82.6% accuracy, but does not surpass LSA+DeBERTa, which reports 90.33% and 86.21% accuracy, respectively.
+
+
+
+
+ + ☆ LANE: Logic Alignment of Non-tuning Large Language Models and Online + Recommendation Systems for Explainable Reason Generation + + +
+ The explainability of recommendation systems is crucial for enhancing user +trust and satisfaction. Leveraging large language models (LLMs) offers new +opportunities for comprehensive recommendation logic generation. However, in +existing related studies, fine-tuning LLM models for recommendation tasks +incurs high computational costs and alignment issues with existing systems, +limiting the application potential of proven proprietary/closed-source LLM +models, such as GPT-4. In this work, our proposed effective strategy LANE +aligns LLMs with online recommendation systems without additional LLMs tuning, +reducing costs and improving explainability. This innovative approach addresses +key challenges in integrating language models with recommendation systems while +fully utilizing the capabilities of powerful proprietary models. Specifically, +our strategy operates through several key components: semantic embedding, user +multi-preference extraction using zero-shot prompting, semantic alignment, and +explainable recommendation generation using Chain of Thought (CoT) prompting. +By embedding item titles instead of IDs and utilizing multi-head attention +mechanisms, our approach aligns the semantic features of user preferences with +those of candidate items, ensuring coherent and user-aligned recommendations. +Sufficient experimental results including performance comparison, questionnaire +voting, and visualization cases prove that our method can not only ensure +recommendation performance, but also provide easy-to-understand and reasonable +recommendation logic. + +
+
+
+
+
+ + ☆ Investigating the Contextualised Word Embedding Dimensions Responsible + for Contextual and Temporal Semantic Changes + + +
+ Words change their meaning over time as well as in different contexts. The
+sense-aware contextualised word embeddings (SCWEs) such as the ones produced by
+XL-LEXEME by fine-tuning masked language models (MLMs) on Word-in-Context (WiC)
+data attempt to encode such semantic changes of words within the contextualised
+word embedding (CWE) spaces. Despite the superior performance of SCWEs in
+contextual/temporal semantic change detection (SCD) benchmarks, it remains
+unclear how the meaning changes are encoded in the embedding space. To study
+this, we compare pre-trained CWEs and their fine-tuned versions on contextual
+and temporal semantic change benchmarks under Principal Component Analysis
+(PCA) and Independent Component Analysis (ICA) transformations. Our
+experimental results reveal several novel insights, such as (a) although only a
+small number of axes are responsible for semantic changes of words in the
+pre-trained CWE space, this information becomes distributed across all
+dimensions after fine-tuning, and (b) in contrast to prior work studying the
+geometry of CWEs, we find that PCA represents semantic changes better than ICA.
+Source code is available at https://github.com/LivNLP/svp-dims .
+
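+ To make the PCA/ICA comparison concrete, here is a small self-contained
+sketch on synthetic vectors standing in for contextualised embeddings of one
+word across contexts; the real analysis operates on XL-LEXEME/CWE embeddings
+and SCD benchmarks, so the data, dimensions, and component counts below are
+illustrative only.
+
+import numpy as np
+from sklearn.decomposition import PCA, FastICA
+
+rng = np.random.default_rng(0)
+# Toy stand-in: 200 contextual embeddings (dim 64) of one word, whose
+# context-dependent variation lives in only 3 latent directions.
+latent = rng.normal(size=(200, 3)) @ rng.normal(size=(3, 64))
+embeddings = latent + 0.05 * rng.normal(size=(200, 64))
+
+pca = PCA(n_components=10).fit(embeddings)
+ica = FastICA(n_components=10, random_state=0).fit(embeddings)
+
+pca_scores = pca.transform(embeddings)   # coordinates along principal components
+ica_scores = ica.transform(embeddings)   # coordinates along independent components
+
+# How concentrated is the variation on a few axes under each transformation?
+print("PCA top-3 variance share:", pca.explained_variance_ratio_[:3].sum())
+print("ICA per-axis variances:", ica_scores.var(axis=0).round(4))
+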
+
+
+
+
+ + ☆ Efficient Training of Language Models with Compact and Consistent Next + Token Distributions ACL 2024 + + +
+ Maximizing the likelihood of the next token is an established, statistically +sound objective for pre-training language models. In this paper we show that we +can train better models faster by pre-aggregating the corpus with a collapsed +$n$-gram distribution. Previous studies have proposed corpus-level $n$-gram +statistics as a regularizer; however, the construction and querying of such +$n$-grams, if done naively, prove to be costly and significantly impede +training speed, thereby limiting their application in modern large language +model pre-training. + We introduce an alternative compact representation of the next token +distribution that, in expectation, aligns with the complete $n$-gram +distribution while markedly reducing variance across mini-batches compared to +the standard next-token loss. Empirically, we demonstrate that both the +$n$-gram regularized model and our approximation yield substantial improvements +in model quality and convergence rate compared to existing methods. +Furthermore, our approximation facilitates scalability of gains to larger +datasets and models compared to the straightforward $n$-gram regularization +method. + +
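+ The compact representation itself is the paper's contribution and is not
+reproduced here; the sketch below only illustrates the baseline it builds on, a
+next-token loss regularized towards a corpus-level $n$-gram next-token
+distribution, with randomly generated tensors standing in for real model
+outputs and $n$-gram statistics.
+
+import torch
+import torch.nn.functional as F
+
+def ngram_regularized_loss(logits, targets, ngram_dist, lam=0.1):
+    # Cross-entropy to the gold next token, plus KL(ngram || model) pulling the
+    # model's next-token distribution towards the corpus n-gram distribution.
+    ce = F.cross_entropy(logits, targets)
+    log_p = F.log_softmax(logits, dim=-1)
+    kl = F.kl_div(log_p, ngram_dist, reduction="batchmean")
+    return ce + lam * kl
+
+vocab, batch = 100, 4
+logits = torch.randn(batch, vocab, requires_grad=True)
+targets = torch.randint(0, vocab, (batch,))
+ngram = torch.softmax(torch.randn(batch, vocab), dim=-1)   # rows sum to 1
+ngram_regularized_loss(logits, targets, ngram).backward()
+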
+
+ comment: ACL 2024 +
+
+
+
+
+ + ☆ Images Speak Louder than Words: Understanding and Mitigating Bias in + Vision-Language Model from a Causal Mediation Perspective + + +
+ Vision-language models (VLMs) pre-trained on extensive datasets can +inadvertently learn biases by correlating gender information with specific +objects or scenarios. Current methods, which focus on modifying inputs and +monitoring changes in the model's output probability scores, often struggle to +comprehensively understand bias from the perspective of model components. We +propose a framework that incorporates causal mediation analysis to measure and +map the pathways of bias generation and propagation within VLMs. This approach +allows us to identify the direct effects of interventions on model bias and the +indirect effects of interventions on bias mediated through different model +components. Our results show that image features are the primary contributors +to bias, with significantly higher impacts than text features, specifically +accounting for 32.57% and 12.63% of the bias in the MSCOCO and PASCAL-SENTENCE +datasets, respectively. Notably, the image encoder's contribution surpasses +that of the text encoder and the deep fusion encoder. Further experimentation +confirms that contributions from both language and vision modalities are +aligned and non-conflicting. Consequently, focusing on blurring gender +representations within the image encoder, which contributes most to the model +bias, reduces bias efficiently by 22.03% and 9.04% in the MSCOCO and +PASCAL-SENTENCE datasets, respectively, with minimal performance loss or +increased computational demands. + +
+
+
+
+
+ + ☆ 52B to 1T: Lessons Learned via Tele-FLM Series + + +
+ Large Language Models (LLMs) represent a significant stride toward Artificial +General Intelligence. As scaling laws underscore the potential of increasing +model sizes, the academic community has intensified its investigations into +LLMs with capacities exceeding 50 billion parameters. This technical report +builds on our prior work with Tele-FLM (also known as FLM-2), a publicly +available 52-billion-parameter model. We delve into two primary areas: we first +discuss our observation of Supervised Fine-tuning (SFT) on Tele-FLM-52B, which +supports the "less is more" approach for SFT data construction; second, we +demonstrate our experiments and analyses on the best practices for +progressively growing a model from 52 billion to 102 billion, and subsequently +to 1 trillion parameters. We will open-source a 1T model checkpoint, namely +Tele-FLM-1T, to advance further training and research. + +
+
+ comment: For the Tele-FLM-52B tech report, see also 2404.16645 +
+
+
+
+
+ + ☆ A Framework for Quantum Finite-State Languages with Density Mapping + + +
+ A quantum finite-state automaton (QFA) is a theoretical model designed to +simulate the evolution of a quantum system with finite memory in response to +sequential input strings. We define the language of a QFA as the set of strings +that lead the QFA to an accepting state when processed from its initial state. +QFAs exemplify how quantum computing can achieve greater efficiency compared to +classical computing. While being one of the simplest quantum models, QFAs are +still notably challenging to construct from scratch due to the preliminary +knowledge of quantum mechanics required for superimposing unitary constraints +on the automata. Furthermore, even when QFAs are correctly assembled, the +limitations of a current quantum computer may cause fluctuations in the +simulation results depending on how an assembled QFA is translated into a +quantum circuit. + We present a framework that provides a simple and intuitive way to build QFAs +and maximize the simulation accuracy. Our framework relies on two methods: +First, it offers a predefined construction for foundational types of QFAs that +recognize special languages MOD and EQU. They play a role of basic building +blocks for more complex QFAs. In other words, one can obtain more complex QFAs +from these foundational automata using standard language operations. Second, we +improve the simulation accuracy by converting these QFAs into quantum circuits +such that the resulting circuits perform well on noisy quantum computers. + Our framework is available at https://github.com/sybaik1/qfa-toolkit. + +
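+ For intuition about the MOD building block mentioned above, the following
+NumPy sketch simulates the textbook single-qubit QFA for MOD_p (accept strings
+whose length is a multiple of p) by rotating the state on each input symbol;
+the framework's actual construction and its circuit translation may differ in
+details such as the rotation angle.
+
+import numpy as np
+
+def mod_p_accept_probability(word_length, p, k=1):
+    # Each symbol applies a rotation by k*pi/p; measuring the initial state |0>
+    # at the end accepts with probability cos^2(word_length * k * pi / p),
+    # which equals 1 exactly when word_length is a multiple of p (k coprime to p).
+    theta = k * np.pi / p
+    rotation = np.array([[np.cos(theta), -np.sin(theta)],
+                         [np.sin(theta),  np.cos(theta)]])
+    state = np.array([1.0, 0.0])
+    for _ in range(word_length):
+        state = rotation @ state
+    return float(state[0] ** 2)
+
+for n in range(7):
+    print(n, round(mod_p_accept_probability(n, p=3), 3))
+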
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ MLKD-BERT: Multi-level Knowledge Distillation for Pre-trained Language + Models + + +
+ Knowledge distillation is an effective technique for pre-trained language +model compression. Although existing knowledge distillation methods perform +well for the most typical model BERT, they could be further improved in two +aspects: the relation-level knowledge could be further explored to improve +model performance; and the setting of student attention head number could be +more flexible to decrease inference time. Therefore, we are motivated to +propose a novel knowledge distillation method MLKD-BERT to distill multi-level +knowledge in teacher-student framework. Extensive experiments on GLUE benchmark +and extractive question answering tasks demonstrate that our method outperforms +state-of-the-art knowledge distillation methods on BERT. In addition, MLKD-BERT +can flexibly set student attention head number, allowing for substantial +inference time decrease with little performance drop. + +
+
+
+
+
+ + ☆ Automatic gradient descent with generalized Newton's method + + +
+ We propose the generalized Newton's method (GeN) -- a Hessian-informed
+approach that applies to any optimizer such as SGD and Adam, and covers the
+Newton-Raphson method as a sub-case. Our method automatically and dynamically
+selects the learning rate that accelerates the convergence, without the
+intensive tuning of the learning rate scheduler. In practice, our method is
+easily implementable, since it only requires additional forward passes with
+almost zero computational overhead (in terms of training time and memory cost),
+if the overhead is amortized over many iterations. We present extensive
+experiments on language and vision tasks (e.g. GPT and ResNet) to showcase that
+GeN optimizers match the state-of-the-art performance, which was achieved with
+carefully tuned learning rate schedulers. Code to be released at
+\url{https://github.com/ShiyunXu/AutoGeN}.
+
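+ The abstract does not spell out the learning-rate formula, so the snippet
+below is only one plausible reading of a "forward passes only" scheme: probe
+the loss at three step sizes along the update direction, fit a parabola, and
+take its minimizer as the learning rate; loss_fn, params, and direction are
+placeholders, and the paper's actual estimator may differ.
+
+import torch
+
+def quadratic_fit_lr(loss_fn, params, direction, eta0=1e-3):
+    # Evaluate the loss at step sizes 0, eta0, 2*eta0 along `direction`, fit
+    # L(eta) ~ a*eta^2 + b*eta + c, and return the minimizer -b / (2a).
+    def loss_at(eta):
+        with torch.no_grad():
+            shifted = [p - eta * d for p, d in zip(params, direction)]
+            return float(loss_fn(shifted))
+    l0, l1, l2 = loss_at(0.0), loss_at(eta0), loss_at(2 * eta0)
+    a = (l2 - 2 * l1 + l0) / (2 * eta0 ** 2)   # curvature along the direction
+    b = (l1 - l0) / eta0 - a * eta0            # slope at eta = 0
+    if a <= 0:                                 # non-convex slice: fall back
+        return eta0
+    return max(-b / (2 * a), 0.0)
+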
+
+
+
+
+ + ☆ Emotion and Intent Joint Understanding in Multimodal Conversation: A + Benchmarking Dataset NeurIPS 2024 + + +
+ Emotion and Intent Joint Understanding in Multimodal Conversation (MC-EIU)
+aims to decode the semantic information manifested in a multimodal
+conversational history, while inferring the emotions and intents simultaneously
+for the current utterance. MC-EIU is an enabling technology for many
+human-computer interfaces. However, there is a lack of available datasets in
+terms of annotation, modality, language diversity, and accessibility. In this
+work, we propose an MC-EIU dataset, which features 7 emotion categories, 9
+intent categories, 3 modalities, i.e., textual, acoustic, and visual content,
+and two languages, i.e., English and Mandarin. Furthermore, it is completely
+open-source for free access. To our knowledge, MC-EIU is the first
+comprehensive and rich emotion and intent joint understanding dataset for
+multimodal conversation. Together with the release of the dataset, we also
+develop an Emotion and Intent Interaction (EI$^2$) network as a reference
+system by modeling the deep correlation between emotion and intent in the
+multimodal conversation. With comparative experiments and ablation studies, we
+demonstrate the effectiveness of the proposed EI$^2$ method on the MC-EIU
+dataset. The dataset and code will be made available at:
+https://github.com/MC-EIU/MC-EIU.
+
+
+ comment: 26 pages, 8 figures, 12 tables, NeurIPS 2024 Dataset and Benchmark + Track +
+
+
+
+
+ + ☆ Learning to Reduce: Towards Improving Performance of Large Language + Models on Structured Data ICML 2024 + + +
+ Large Language Models (LLMs) have been achieving competent performance on a +wide range of downstream tasks, yet existing work shows that inference on +structured data is challenging for LLMs. This is because LLMs need to either +understand long structured data or select the most relevant evidence before +inference, and both approaches are not trivial. This paper proposes a +framework, Learning to Reduce, that fine-tunes a language model with On-Policy +Learning to generate a reduced version of an input structured data. When +compared to state-of-the-art LLMs like GPT-4, Learning to Reduce not only +achieves outstanding performance in reducing the input, but shows +generalizability on different datasets. We further show that the model +fine-tuned with our framework helps LLMs better perform on table QA tasks +especially when the context is longer. + +
+
+ comment: ICML 2024 Workshop on Long-Context Foundation Models, Vienna, Austria + 2024. arXiv admin note: substantial text overlap with arXiv:2402.14195 +
+
+
+
+
+ + ☆ A Comparative Study of DSL Code Generation: Fine-Tuning vs. Optimized + Retrieval Augmentation + + +
+ Natural Language to Code Generation has made significant progress in recent
+years with the advent of Large Language Models (LLMs). While generation for
+general-purpose languages like C, C++, and Python has improved significantly,
+LLMs struggle with custom function names in Domain Specific Languages or DSLs.
+This leads to higher hallucination rates and syntax errors, especially for DSLs
+with a high number of custom function names. Additionally, constant updates
+to function names add to the challenge as LLMs need to stay up-to-date. In this
+paper, we present optimizations for using Retrieval Augmented Generation (or
+RAG) with LLMs for DSL generation along with an ablation study comparing these
+strategies. We generated training and test datasets for a DSL representing
+automation tasks across roughly 700 public-domain APIs. We used the training
+dataset to fine-tune a Codex model for this DSL. Our results showed that the
+fine-tuned model scored best on the code similarity metric. With our RAG
+optimizations, we achieved parity on the similarity metric. The compilation
+rate, however, showed that both models still got the syntax wrong many times,
+with the RAG-based method being 2 points better. Conversely, the hallucination
+rate for the RAG model lagged by 1 point for API names and by 2 points for API
+parameter keys. We conclude that an optimized RAG model can match the quality
+of fine-tuned models and offer advantages for new, unseen APIs.
+
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ MentalAgora: A Gateway to Advanced Personalized Care in Mental Health + through Multi-Agent Debating and Attribute Control + + +
+ As mental health issues globally escalate, there is a tremendous need for
+advanced digital support systems. We introduce MentalAgora, a novel framework
+employing large language models enhanced by interaction between multiple agents
+for tailored mental health support. This framework operates through three
+stages: strategic debating, tailored counselor creation, and response
+generation, enabling the dynamic customization of responses based on individual
+user preferences and therapeutic needs. We conduct experiments utilizing a
+high-quality evaluation dataset, TherapyTalk, crafted with mental health
+professionals, showing that MentalAgora generates expert-aligned and user
+preference-enhanced responses. Our evaluations, including experiments and user
+studies, demonstrate that MentalAgora aligns with professional standards and
+effectively meets user preferences, setting a new benchmark for digital mental
+health interventions.
+
+
+
+
+
+ + ☆ e-Health CSIRO at "Discharge Me!" 2024: Generating Discharge Summary + Sections with Fine-tuned Language Models ACL 2024 + + +
+ Clinical documentation is an important aspect of clinicians' daily work and +often demands a significant amount of time. The BioNLP 2024 Shared Task on +Streamlining Discharge Documentation (Discharge Me!) aims to alleviate this +documentation burden by automatically generating discharge summary sections, +including brief hospital course and discharge instruction, which are often +time-consuming to synthesize and write manually. We approach the generation +task by fine-tuning multiple open-sourced language models (LMs), including both +decoder-only and encoder-decoder LMs, with various configurations on input +context. We also examine different setups for decoding algorithms, model +ensembling or merging, and model specialization. Our results show that +conditioning on the content of discharge summary prior to the target sections +is effective for the generation task. Furthermore, we find that smaller +encoder-decoder LMs can work as well or even slightly better than larger +decoder based LMs fine-tuned through LoRA. The model checkpoints from our team +(aehrc) are openly available. + +
+
+ comment: BioNLP @ ACL 2024 +
+
+
+
+
+ + ☆ Boosting Biomedical Concept Extraction by Rule-Based Data Augmentation + + +
+ Document-level biomedical concept extraction is the task of identifying +biomedical concepts mentioned in a given document. Recent advancements have +adapted pre-trained language models for this task. However, the scarcity of +domain-specific data and the deviation of concepts from their canonical names +often hinder these models' effectiveness. To tackle this issue, we employ +MetaMapLite, an existing rule-based concept mapping system, to generate +additional pseudo-annotated data from PubMed and PMC. The annotated data are +used to augment the limited training data. Through extensive experiments, this +study demonstrates the utility of a manually crafted concept mapping tool for +training a better concept extraction model. + +
+
+
+
+
+ + ♻ ☆ Eraser: Jailbreaking Defense in Large Language Models via Unlearning + Harmful Knowledge + + +
+ Jailbreaking attacks can enable Large Language Models (LLMs) to bypass the
+safeguards and generate harmful content. Existing jailbreaking defense methods
+have failed to address the fundamental issue that harmful knowledge resides
+within the model, leading to potential jailbreak risks for LLMs. In this paper,
+we propose a novel defense method called Eraser, which mainly includes three
+goals: unlearning harmful knowledge, retaining general knowledge, and
+maintaining safety alignment. The intuition is that if an LLM forgets the
+specific knowledge required to answer a harmful question, it will no longer
+have the ability to answer harmful questions. The training of Eraser does not
+actually require the model's own harmful knowledge, and it can benefit from
+unlearning general answers related to harmful queries, which means it does not
+need assistance from the red team. The experimental results show that Eraser
+can significantly reduce the jailbreaking success rate for various attacks
+without compromising the general capabilities of the model. Our code is
+available at https://github.com/ZeroNLP/Eraser.
+
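+ As a very rough sketch of the three training goals listed above (not the
+paper's actual objective or data pipeline), one could combine a negated
+language-modeling loss on harmful-query continuations with ordinary losses on
+retention and refusal data; the batch format and the HuggingFace-style
+model(input_ids=..., labels=...) interface are assumptions.
+
+def eraser_style_loss(model, harmful_batch, retain_batch, refusal_batch,
+                      alpha=1.0, beta=1.0):
+    # Unlearn: ascend on harmful continuations; retain: keep general knowledge;
+    # align: keep responding to harmful prompts with safe refusals.
+    def lm_loss(batch):
+        return model(input_ids=batch["input_ids"], labels=batch["labels"]).loss
+    forget = -lm_loss(harmful_batch)
+    retain = lm_loss(retain_batch)
+    align = lm_loss(refusal_batch)
+    return forget + alpha * retain + beta * align
+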
+
+
+
+
+ + ♻ ☆ Found in the Middle: Calibrating Positional Attention Bias Improves Long + Context Utilization ACL + + +
+ Large language models (LLMs), even when specifically trained to process long
+input contexts, struggle to capture relevant information located in the middle
+of their input. This phenomenon has been known as the lost-in-the-middle
+problem. In this work, we make three contributions. First, we set out to
+understand the factors that cause this phenomenon. In doing so, we establish a
+connection between the lost-in-the-middle problem and LLMs' intrinsic attention
+bias: LLMs exhibit a U-shaped attention bias where the tokens at the beginning
+and at the end of their input receive higher attention, regardless of their
+relevance. Second, we mitigate this positional bias through a calibration
+mechanism, found-in-the-middle, that allows the model to attend to contexts
+faithfully according to their relevance, even when they are in the middle.
+Third, we show that found-in-the-middle not only achieves better performance in
+locating relevant information within a long context, but also eventually leads
+to improved retrieval-augmented generation (RAG) performance across various
+tasks, outperforming existing methods by up to 15 percentage points. These
+findings open up future directions in understanding LLM attention bias and its
+potential consequences.
+
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ DEEM: Diffusion Models Serve as the Eyes of Large Language Models for + Image Perception + + +
+ The development of large language models (LLMs) has significantly advanced +the emergence of large multimodal models (LMMs). While LMMs have achieved +tremendous success by promoting the synergy between multimodal comprehension +and creation, they often face challenges when confronted with +out-of-distribution data. This is primarily due to their reliance on image +encoders trained to encode images into task-relevant features, which may lead +them to disregard irrelevant details. Delving into the modeling capabilities of +diffusion models for images naturally prompts the question: Can diffusion +models serve as the eyes of large language models for image perception? In this +paper, we propose DEEM, a simple and effective approach that utilizes the +generative feedback of diffusion models to align the semantic distributions of +the image encoder. This addresses the drawbacks of previous methods that solely +relied on image encoders like ViT, thereby enhancing the model's resilience +against out-of-distribution samples and reducing visual hallucinations. +Importantly, this is achieved without requiring additional training modules and +with fewer training parameters. We extensively evaluated DEEM on both our newly +constructed RobustVQA benchmark and another well-known benchmark, POPE, for +object hallucination. Compared to the state-of-the-art interleaved content +generation models, DEEM exhibits enhanced robustness and a superior capacity to +alleviate model hallucinations while utilizing fewer trainable parameters, less +pre-training data (10%), and a smaller base model size. + +
+
+ comment: 25 pages. arXiv admin note: text overlap with arXiv:2401.10208 by + other authors +
+
+
+
+
+ + ♻ ☆ CaLMQA: Exploring culturally specific long-form question answering + across 23 languages + + +
+ Large language models (LLMs) are used for long-form question answering +(LFQA), which requires them to generate paragraph-length answers to complex +questions. While LFQA has been well-studied in English, this research has not +been extended to other languages. To bridge this gap, we introduce CaLMQA, a +collection of 1.5K complex culturally specific questions spanning 23 languages +and 51 culturally agnostic questions translated from English into 22 other +languages. We define culturally specific questions as those uniquely or more +likely to be asked by people from cultures associated with the question's +language. We collect naturally-occurring questions from community web forums +and hire native speakers to write questions to cover under-resourced, +rarely-studied languages such as Fijian and Kirundi. Our dataset contains +diverse, complex questions that reflect cultural topics (e.g. traditions, laws, +news) and the language usage of native speakers. We automatically evaluate a +suite of open- and closed-source models on CaLMQA by detecting incorrect +language and token repetitions in answers, and observe that the quality of +LLM-generated answers degrades significantly for some low-resource languages. +Lastly, we perform human evaluation on a subset of models and languages. Manual +evaluation reveals that model performance is significantly worse for culturally +specific questions than for culturally agnostic questions. Our findings +highlight the need for further research in non-English LFQA and provide an +evaluation framework. + +
+
+ comment: 39 pages, 17 figures. Code and data available at + https://github.com/2015aroras/CaLMQA. Revised argument in section 4, results + unchanged +
+
+
+
+
+ + ♻ ☆ Protecting Privacy in Classifiers by Token Manipulation ACL 2024 + + +
+ Using language models as a remote service entails sending private information
+to an untrusted provider. In addition, potential eavesdroppers can intercept
+the messages, thereby exposing the information. In this work, we explore the
+prospects of avoiding such data exposure at the level of text manipulation. We
+focus on text classification models, examining various token mapping and
+contextualized manipulation functions in order to see whether classifier
+accuracy may be maintained while keeping the original text unrecoverable. We
+find that although some token mapping functions are easy and straightforward to
+implement, they heavily influence performance on the downstream task, and the
+original text can be reconstructed by a sophisticated attacker. In comparison,
+the contextualized manipulation provides an improvement in performance.
+
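+ As a concrete (and deliberately naive) example of the "easy and
+straightforward" token mapping functions discussed above, the snippet below
+applies a fixed random permutation to token ids before text leaves the client;
+the vocabulary size and example ids are made up, and the paper's contextualized
+manipulation functions are not shown.
+
+import random
+
+def build_token_mapping(vocab_size, seed=0):
+    # A fixed permutation of token ids; simple, but it shifts the classifier's
+    # input distribution and can be reversed via frequency statistics.
+    ids = list(range(vocab_size))
+    shuffled = ids[:]
+    random.Random(seed).shuffle(shuffled)
+    return dict(zip(ids, shuffled))
+
+def manipulate(token_ids, mapping):
+    return [mapping[t] for t in token_ids]
+
+mapping = build_token_mapping(vocab_size=30522)   # e.g. a BERT-sized vocabulary
+print(manipulate([101, 2023, 2003, 1037, 3231, 102], mapping))
+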
+
+ comment: PrivateNLP@ACL 2024 +
+
+
+
+
+ + ♻ ☆ Temperature-scaling surprisal estimates improve fit to human reading + times -- but does it do so for the "right reasons"? ACL 2024 + + +
+ A wide body of evidence shows that human language processing difficulty is +predicted by the information-theoretic measure surprisal, a word's negative log +probability in context. However, it is still unclear how to best estimate these +probabilities needed for predicting human processing difficulty -- while a +long-standing belief held that models with lower perplexity would provide more +accurate estimates of word predictability, and therefore lead to better reading +time predictions, recent work has shown that for very large models, +psycholinguistic predictive power decreases. One reason could be that language +models might be more confident of their predictions than humans, because they +have had exposure to several magnitudes more data. In this paper, we test what +effect temperature-scaling of large language model (LLM) predictions has on +surprisal estimates and their predictive power of reading times of English +texts. Firstly, we show that calibration of large language models typically +improves with model size, i.e. poorer calibration cannot account for poorer fit +to reading times. Secondly, we find that temperature-scaling probabilities lead +to a systematically better fit to reading times (up to 89% improvement in delta +log likelihood), across several reading time corpora. Finally, we show that +this improvement in fit is chiefly driven by words that are composed of +multiple subword tokens. + +
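+ The core quantity is easy to state in code: divide the logits by a
+temperature before the softmax and read off per-token surprisal. The sketch
+below uses GPT-2 purely as a stand-in; the paper's experiments use other LLMs
+and psycholinguistic reading-time corpora.
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+
+def surprisals(text, temperature=1.0):
+    # Per-token surprisal in bits: -log2 p(token | context), computed from
+    # temperature-scaled logits (temperature > 1 flattens the distribution).
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = model(ids).logits / temperature
+    log_probs = torch.log_softmax(logits[:, :-1], dim=-1)
+    token_lp = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1)
+    return -token_lp / torch.log(torch.tensor(2.0))
+
+print(surprisals("The cat sat on the mat.", temperature=2.5))
+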
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs' Inherent Multi-hop Reasoning Ability + + +
+ While Large Language Models (LLMs) excel in question-answering (QA) tasks,
+their multi-step reasoning abilities for integrating multiple pieces of
+evidence in Multi-hop QA tasks remain underexplored. LLMs sometimes generate
+answers that rely on internal memory rather than on reasoning over the given
+context, which raises concerns about the quality of evaluations of real
+reasoning abilities. The counterfactual QA task can separate internal memory
+from reasoning abilities, but focusing solely on final-QA performance without
+evaluating the multi-step reasoning process is insufficient for reporting LLMs'
+real reasoning abilities. Current Multi-hop QA (MHQA) benchmarks are factual
+and annotated on open-source corpora such as Wikipedia; although useful for
+multi-step reasoning evaluation, they show limitations due to potential data
+contamination in the LLM pre-training stage. To address this issue, we
+introduce the Inherent Reasoning Evaluation (IRE) method, a novel evaluation
+approach that jointly evaluates LLMs' chain-of-reasoning performance based on
+the first knowledge-edited counterfactual multi-hop QA dataset, which involves
+editing the original Wikipedia passages, reducing data contamination risks. The
+IRE comprehensively assesses reasoning chains through sub-QA and final-QA
+evaluations. Our comparisons reveal significant performance gaps for several
+LLMs between Wikipedia-based benchmarks and IRE, suggesting data contamination
+issues in existing benchmarks. We believe that the IRE benchmark will enhance
+and facilitate trustworthy LLM evaluations.
+
+
+
+
+
+ + ♻ ☆ Using Natural Language Processing and Networks to Automate Structured + Literature Reviews: An Application to Farmers Climate Change Adaptation + + +
+ The fast-growing number of research articles makes it problematic for
+scholars to keep track of the new findings related to their areas of expertise.
+Furthermore, linking knowledge across disciplines in rapidly developing fields
+becomes challenging for complex topics like climate change that demand
+interdisciplinary solutions. At the same time, the rise of black-box text
+summarization makes it difficult to understand how text relationships are
+built, let alone relate to existing theories conceptualizing cause-effect
+relationships and permitting hypothesizing. This work aims to sensibly use
+Natural Language Processing by extracting relations between variables and
+synthesizing the findings using networks, while relating them to key concepts
+dominant in the relevant disciplines. As an example, we apply our methodology
+to the analysis of farmers' adaptation to climate change. For this, we perform
+a Natural Language Processing analysis of publications returned by Scopus in
+August 2022. Results show that the use of Natural Language Processing together
+with networks in a descriptive manner offers a fast and interpretable way to
+synthesize literature review findings as long as researchers back up results
+with theory.
+
+
+
+
+
+ + ♻ ☆ Call Me When Necessary: LLMs can Efficiently and Faithfully Reason over + Structured Environments ACL 2024 + + +
+ Large Language Models (LLMs) have shown potential in reasoning over +structured environments, e.g., knowledge graph and table. Such tasks typically +require multi-hop reasoning, i.e., match natural language utterance with +instances in the environment. Previous methods leverage LLMs to incrementally +build a reasoning path, where the LLMs either invoke tools or pick up schemas +by step-by-step interacting with the environment. We propose +Reasoning-Path-Editing (Readi), a novel framework where LLMs can efficiently +and faithfully reason over structured environments. In Readi, LLMs initially +generate a reasoning path given a query, and edit the path only when necessary. +We instantiate the path on structured environments and provide feedback to edit +the path if anything goes wrong. Experimental results on three KGQA and two +TableQA datasets show the effectiveness of Readi, significantly surpassing +previous LLM-based methods (by 9.1% Hit@1 on WebQSP, 12.4% on MQA-3H and 9.5% +on WTQ), comparable with state-of-the-art fine-tuned methods (67% on CWQ and +74.7% on WebQSP) and substantially boosting the vanilla LLMs (by 14.9% on CWQ). +Our code will be available on https://aka.ms/readi. + +
+
+ comment: Accepted by ACL 2024 Findings. 21 pages, 7 figures, 17 tables +
+
+
+
+
+ + ♻ ☆ HRDE: Retrieval-Augmented Large Language Models for Chinese Health Rumor + Detection and Explainability + + +
+ As people increasingly prioritize their health, the speed and breadth of +health information dissemination on the internet have also grown. At the same +time, the presence of false health information (health rumors) intermingled +with genuine content poses a significant potential threat to public health. +However, current research on Chinese health rumors still lacks a large-scale, +public, and open-source dataset of health rumor information, as well as +effective and reliable rumor detection methods. This paper addresses this gap +by constructing a dataset containing 1.12 million health-related rumors +(HealthRCN) through web scraping of common health-related questions and a +series of data processing steps. HealthRCN is the largest known dataset of +Chinese health information rumors to date. Based on this dataset, we propose +retrieval-augmented large language models for Chinese health rumor detection +and explainability (HRDE). This model leverages retrieved relevant information +to accurately determine whether the input health information is a rumor and +provides explanatory responses, effectively aiding users in verifying the +authenticity of health information. In evaluation experiments, we compared +multiple models and found that HRDE outperformed them all, including +GPT-4-1106-Preview, in rumor detection accuracy and answer quality. HRDE +achieved an average accuracy of 91.04% and an F1 score of 91.58%. + +
+
+
+
+
+ + ♻ ☆ Who Wrote this Code? Watermarking for Code Generation ACL 2024 + + +
+ Since the remarkable generation performance of large language models raised +ethical and legal concerns, approaches to detect machine-generated text by +embedding watermarks are being developed. However, we discover that the +existing works fail to function appropriately in code generation tasks due to +the task's nature of having low entropy. Extending a logit-modifying watermark +method, we propose Selective WatErmarking via Entropy Thresholding (SWEET), +which enhances detection ability and mitigates code quality degeneration by +removing low-entropy segments at generating and detecting watermarks. Our +experiments show that SWEET significantly improves code quality preservation +while outperforming all baselines, including post-hoc detection methods, in +detecting machine-generated code text. Our code is available in +https://github.com/hongcheki/sweet-watermark. + +
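+ To illustrate the selective idea, the sketch below applies a standard
+green-list logit boost only at positions where the next-token distribution is
+high-entropy, leaving the low-entropy positions typical of code untouched; the
+hashing scheme, thresholds, and constants here are placeholders rather than
+SWEET's exact settings.
+
+import torch
+
+def selective_watermark_logits(logits, prev_token, gamma=0.5, delta=2.0,
+                               entropy_threshold=2.0):
+    # `logits` is the 1-D next-token logit vector at the current position.
+    probs = torch.softmax(logits, dim=-1)
+    entropy = -(probs * torch.log(probs + 1e-9)).sum()
+    if entropy < entropy_threshold:
+        return logits                            # too predictable: skip watermarking
+    gen = torch.Generator().manual_seed(int(prev_token))   # green list keyed on context
+    perm = torch.randperm(logits.numel(), generator=gen)
+    green = perm[: int(gamma * logits.numel())]
+    boosted = logits.clone()
+    boosted[green] += delta
+    return boosted
+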
+
+ comment: To be presented at ACL 2024 +
+
+
+
+
+ + ♻ ☆ Mixture of insighTful Experts (MoTE): The Synergy of Thought Chains and + Expert Mixtures in Self-Alignment + + +
+ As the capabilities of large language models (LLMs) have expanded +dramatically, aligning these models with human values presents a significant +challenge. Traditional alignment strategies rely heavily on human intervention, +such as Supervised Fine-Tuning (SFT) and Reinforcement Learning from Human +Feedback (RLHF), or on the self-alignment capacities of LLMs, which usually +require a strong LLM's emergent ability to improve its original bad answer. To +address these challenges, we propose a novel self-alignment method that +utilizes a Chain of Thought (CoT) approach, termed AlignCoT. This method +encompasses stages of Question Analysis, Answer Guidance, and Safe Answer +production. It is designed to enable LLMs to generate high-quality, safe +responses throughout various stages of their development. Furthermore, we +introduce the Mixture of insighTful Experts (MoTE) architecture, which applies +mixture of experts to enhance each component of the AlignCoT process, markedly +increasing alignment efficiency. The MoTE approach not only outperforms +existing methods in aligning LLMs with human values but also highlights the +benefits of using self-generated data, revealing the dual benefits of improved +alignment and training efficiency. + +
+
+
+
+
+ + ♻ ☆ Towards Human-AI Collaboration in Healthcare: Guided Deferral Systems + with Large Language Models ICML 2024 + + +
+ Large language models (LLMs) present a valuable technology for various +applications in healthcare, but their tendency to hallucinate introduces +unacceptable uncertainty in critical decision-making situations. Human-AI +collaboration (HAIC) can mitigate this uncertainty by combining human and AI +strengths for better outcomes. This paper presents a novel guided deferral +system that provides intelligent guidance when AI defers cases to human +decision-makers. We leverage LLMs' verbalisation capabilities and internal +states to create this system, demonstrating that fine-tuning small-scale LLMs +with data from large-scale LLMs greatly enhances performance while maintaining +computational efficiency and data privacy. A pilot study showcases the +effectiveness of our proposed deferral system. + +
+
+ comment: Accepted to ICML 2024 Workshop on Large Language Models and Cognition +
+
+
+
+
+ + ♻ ☆ LLMs can learn self-restraint through iterative self-reflection + + +
+ In order to be deployed safely, Large Language Models (LLMs) must be capable +of dynamically adapting their behavior based on their level of knowledge and +uncertainty associated with specific topics. This adaptive behavior, which we +refer to as self-restraint, is non-trivial to teach since it depends on the +internal knowledge of an LLM. By default, LLMs are trained to maximize the next +token likelihood, which does not teach the model to modulate its answer based +on its level of uncertainty. In order to learn self-restraint, we devise a +utility function that can encourage the model to produce responses only when it +is confident in them. This utility function can be used to score generation of +different length and abstention. To optimize this function, we introduce +ReSearch, a process of "self-reflection" consisting of iterative self-prompting +and self-evaluation. We use the ReSearch algorithm to generate synthetic data +on which we finetune our models. Compared to their original versions, our +resulting models generate fewer \emph{hallucinations} overall at no additional +inference cost, for both known and unknown topics, as the model learns to +selectively restrain itself. In addition, our method elegantly incorporates the +ability to abstain by augmenting the samples generated by the model during the +search procedure with an answer expressing abstention. + +
+
+
+
+
+ + ♻ ☆ Jamba: A Hybrid Transformer-Mamba Language Model + + +
+ We present Jamba, a new base large language model based on a novel hybrid +Transformer-Mamba mixture-of-experts (MoE) architecture. Specifically, Jamba +interleaves blocks of Transformer and Mamba layers, enjoying the benefits of +both model families. MoE is added in some of these layers to increase model +capacity while keeping active parameter usage manageable. This flexible +architecture allows resource- and objective-specific configurations. In the +particular configuration we have implemented, we end up with a powerful model +that fits in a single 80GB GPU. Built at large scale, Jamba provides high +throughput and small memory footprint compared to vanilla Transformers, and at +the same time state-of-the-art performance on standard language model +benchmarks and long-context evaluations. Remarkably, the model presents strong +results for up to 256K tokens context length. We study various architectural +decisions, such as how to combine Transformer and Mamba layers, and how to mix +experts, and show that some of them are crucial in large scale modeling. We +also describe several interesting properties of these architectures which the +training and evaluation of Jamba have revealed, and plan to release checkpoints +from various ablation runs, to encourage further exploration of this novel +architecture. We make the weights of our implementation of Jamba publicly +available under a permissive license. + +
+
+ comment: Webpage: https://www.ai21.com/jamba +
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling
+complex tasks across diverse domains, but they also raise privacy concerns when
+fine-tuned on sensitive data due to potential memorization. While differential
+privacy (DP) offers a promising solution by ensuring models are 'almost
+indistinguishable' with or without any particular privacy unit, current
+evaluations on LLMs mostly treat each example (text record) as the privacy
+unit. This leads to uneven user privacy guarantees when contributions per user
+vary. We therefore study user-level DP, motivated by applications where it is
+necessary to ensure uniform privacy protection across users. We present a
+systematic evaluation of user-level DP for LLM fine-tuning on natural language
+generation tasks. Focusing on two mechanisms for achieving user-level DP
+guarantees, Group Privacy and User-wise DP-SGD, we investigate design choices
+such as data selection strategies and parameter tuning for the best
+privacy-utility tradeoff.
+
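+ The User-wise DP-SGD mechanism mentioned above clips and noises gradients at
+the user level rather than the example level; the sketch below shows that
+aggregation step in isolation, with flattened per-user gradients as input and
+arbitrarily chosen constants (the privacy accounting is omitted).
+
+import torch
+
+def user_level_dp_step(per_user_grads, clip_norm=1.0, noise_multiplier=1.0):
+    # Clip each user's aggregated gradient, sum, add Gaussian noise scaled to the
+    # clipping norm, and average: the privacy unit is the user, not the record.
+    clipped = []
+    for g in per_user_grads:                      # one flattened gradient per user
+        scale = torch.clamp(clip_norm / (g.norm() + 1e-12), max=1.0)
+        clipped.append(g * scale)
+    total = torch.stack(clipped).sum(dim=0)
+    noise = torch.randn_like(total) * noise_multiplier * clip_norm
+    return (total + noise) / len(per_user_grads)
+
+grads = [torch.randn(10) for _ in range(8)]       # 8 users, toy gradient dimension
+print(user_level_dp_step(grads))
+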
+
+
+
+
+ + ♻ ☆ Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding ICCV 2023 + + +
+ 3D visual grounding is the task of localizing the object in a 3D scene which +is referred by a description in natural language. With a wide range of +applications ranging from autonomous indoor robotics to AR/VR, the task has +recently risen in popularity. A common formulation to tackle 3D visual +grounding is grounding-by-detection, where localization is done via bounding +boxes. However, for real-life applications that require physical interactions, +a bounding box insufficiently describes the geometry of an object. We therefore +tackle the problem of dense 3D visual grounding, i.e. referral-based 3D +instance segmentation. We propose a dense 3D grounding network ConcreteNet, +featuring four novel stand-alone modules that aim to improve grounding +performance for challenging repetitive instances, i.e. instances with +distractors of the same semantic class. First, we introduce a bottom-up +attentive fusion module that aims to disambiguate inter-instance relational +cues, next, we construct a contrastive training scheme to induce separation in +the latent space, we then resolve view-dependent utterances via a learned +global camera token, and finally we employ multi-view ensembling to improve +referred mask quality. ConcreteNet ranks 1st on the challenging ScanRefer +online benchmark and has won the ICCV 3rd Workshop on Language for 3D Scenes +"3D Object Localization" challenge. + +
+
+ comment: Winner of the ICCV 2023 ScanRefer Challenge. Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Noise Contrastive Alignment of Language Models with Explicit Rewards + + +
+ User intentions are typically formalized as evaluation rewards to be +maximized when fine-tuning language models (LMs). Existing alignment methods, +such as Direct Preference Optimization (DPO), are mainly tailored for pairwise +preference data where rewards are implicitly defined rather than explicitly +given. In this paper, we introduce a general framework for LM alignment, +leveraging Noise Contrastive Estimation (NCE) to bridge the gap in handling +reward datasets explicitly annotated with scalar evaluations. Our framework +comprises two parallel algorithms, NCA and InfoNCA, both enabling the direct +extraction of an LM policy from reward data as well as preference data. +Notably, we show that the DPO loss is a special case of our proposed InfoNCA +objective under pairwise preference settings, thereby integrating and extending +current alignment theories. By comparing NCA and InfoNCA, we demonstrate that +the well-observed decreasing-likelihood trend of DPO/InfoNCA is caused by their +focus on adjusting relative likelihood across different responses. In contrast, +NCA optimizes the absolute likelihood for each response, thereby effectively +preventing the chosen likelihood from decreasing. We evaluate our methods in +both reward and preference settings with Mistral-8*7B and 7B models. +Experiments suggest that InfoNCA/NCA surpasses various preference baselines +when reward datasets are available. We also find NCA significantly outperforms +DPO in complex reasoning tasks like math and coding. + +
+
+
+
+
+ + ♻ ☆ Smaug: Fixing Failure Modes of Preference Optimisation with DPO-Positive + + +
+ Direct Preference Optimisation (DPO) is effective at significantly improving +the performance of large language models (LLMs) on downstream tasks such as +reasoning, summarisation, and alignment. Using pairs of preferred and +dispreferred data, DPO models the relative probability of picking one response +over another. In this work, first we show theoretically that the standard DPO +loss can lead to a reduction of the model's likelihood of the preferred +examples, as long as the relative probability between the preferred and +dispreferred classes increases. We then show empirically that this phenomenon +occurs when fine-tuning LLMs on common datasets, especially datasets in which +the edit distance between pairs of completions is low. Using these insights, we +design DPO-Positive (DPOP), a new loss function and training procedure which +avoids this failure mode. Surprisingly, we find that DPOP outperforms DPO and +other fine-tuning procedures across a wide variety of datasets and downstream +tasks, including datasets with high edit distances between completions. +Furthermore, we find that the DPOP-tuned model outperforms the DPO-tuned model +(all else equal) on benchmarks independent of the fine-tuning data, such as +MT-Bench. Finally, using DPOP, we create and open-source Smaug-34B and +Smaug-72B, with the latter becoming the first open-source LLM to surpass an +average accuracy of 80% on the HuggingFace Open LLM Leaderboard. + +
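+ A minimal sketch of a DPO-Positive-style objective is given below: the usual
+DPO logistic term on the preferred/dispreferred log-ratio difference, plus a
+hinge penalty that activates when the policy's likelihood of the preferred
+completion falls below the reference model's; the exact placement of the beta
+and lambda constants is approximate rather than taken verbatim from the paper.
+
+import torch
+import torch.nn.functional as F
+
+def dpop_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1, lam=5.0):
+    # logp_* are summed log-probabilities of the preferred (w) and dispreferred
+    # (l) completions under the policy; ref_logp_* under the frozen reference.
+    ratio_w = logp_w - ref_logp_w
+    ratio_l = logp_l - ref_logp_l
+    # The penalty fires only if the preferred completion became less likely
+    # than it was under the reference model.
+    penalty = torch.clamp(ref_logp_w - logp_w, min=0.0)
+    return -F.logsigmoid(beta * (ratio_w - ratio_l - lam * penalty)).mean()
+
+logp_w, logp_l = torch.tensor([-12.0]), torch.tensor([-15.0])
+ref_w, ref_l = torch.tensor([-11.0]), torch.tensor([-14.0])
+print(dpop_loss(logp_w, logp_l, ref_w, ref_l))
+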
+
+
+
+
+ + ♻ ☆ LLMs' Classification Performance is Overclaimed + + +
+ In many classification tasks designed for AI or human to solve, gold labels +are typically included within the label space by default, often posed as "which +of the following is correct?" This standard setup has traditionally highlighted +the strong performance of advanced AI, particularly top-performing Large +Language Models (LLMs), in routine classification tasks. However, when the gold +label is intentionally excluded from the label space, it becomes evident that +LLMs still attempt to select from the available label candidates, even when +none are correct. This raises a pivotal question: Do LLMs truly demonstrate +their intelligence in understanding the essence of classification tasks? + In this study, we evaluate both closed-source and open-source LLMs across +representative classification tasks, arguing that the perceived performance of +LLMs is overstated due to their inability to exhibit the expected comprehension +of the task. This paper makes a threefold contribution: i) To our knowledge, +this is the first work to identify the limitations of LLMs in classification +tasks when gold labels are absent. We define this task as Classify-w/o-Gold and +propose it as a new testbed for LLMs. ii) We introduce a benchmark, Know-No, +comprising two existing classification tasks and one new task, to evaluate +Classify-w/o-Gold. iii) This work defines and advocates for a new evaluation +metric, OmniAccuracy, which assesses LLMs' performance in classification tasks +both when gold labels are present and absent. + +
+
+
+
+
+ + ♻ ☆ LLM-Oracle Machines + + +
+ Contemporary AI applications leverage large language models (LLMs) to harness +their knowledge and reasoning abilities for natural language processing tasks. +This approach shares similarities with the concept of oracle Turing machines +(OTMs). To capture the broader potential of these computations, including those +not yet realized, we propose an extension to OTMs: the LLM-oracle machine +(LLM-OM), by employing a cluster of LLMs as the oracle. Each LLM acts as a +black box, capable of answering queries within its expertise, albeit with a +delay. We introduce four variants of the LLM-OM: basic, augmented, +fault-avoidance, and $\epsilon$-fault. The first two are commonly observed in +existing AI applications. The latter two are specifically designed to address +the challenges of LLM hallucinations, biases, and inconsistencies, aiming to +ensure reliable outcomes. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ PharmaGPT: Domain-Specific Large Language Models for Bio-Pharmaceutical + and Chemistry + + +
+ Large language models (LLMs) have revolutionized Natural Language Processing
+(NLP) by minimizing the need for complex feature engineering. However, the
+application of LLMs in specialized domains like biopharmaceuticals and
+chemistry remains largely unexplored. These fields are characterized by
+intricate terminologies, specialized knowledge, and a high demand for
+precision, areas where general-purpose LLMs often fall short. In this study, we
+introduce PharmGPT, a suite of multilingual LLMs with 13 billion and 70 billion
+parameters, specifically trained on a comprehensive corpus of hundreds of
+billions of tokens tailored to the Bio-Pharmaceutical and Chemical sectors. Our
+evaluation shows that PharmGPT matches or surpasses existing general models on
+key benchmarks, such as NAPLEX, demonstrating its exceptional capability in
+domain-specific tasks. This advancement establishes a new benchmark for LLMs in
+the Bio-Pharmaceutical and Chemical fields, addressing the existing gap in
+specialized language modeling. Furthermore, this suggests a promising path for
+enhanced research and development in these specialized areas, paving the way
+for more precise and effective applications of NLP in specialized domains.
+
+
+
+
+
+ + ♻ ☆ Face4RAG: Factual Consistency Evaluation for Retrieval Augmented + Generation in Chinese + + +
+ The prevailing issue of factual inconsistency errors in conventional +Retrieval Augmented Generation (RAG) motivates the study of Factual Consistency +Evaluation (FCE). Despite the various FCE methods proposed earlier, these +methods are evaluated on datasets generated by specific Large Language Models +(LLMs). Without a comprehensive benchmark, it remains unexplored how these FCE +methods perform on other LLMs with different error distributions or even unseen +error types, as these methods may fail to detect the error types generated by +other LLMs. To fill this gap, in this paper, we propose the first comprehensive +FCE benchmark \emph{Face4RAG} for RAG independent of the underlying LLM. Our +benchmark consists of a synthetic dataset built upon a carefully designed +typology for factuality inconsistency error and a real-world dataset +constructed from six commonly used LLMs, enabling evaluation of FCE methods on +specific error types or real-world error distributions. On the proposed +benchmark, we discover the failure of existing FCE methods to detect the +logical fallacy, which refers to a mismatch of logic structures between the +answer and the retrieved reference. To fix this issue, we further propose a new +method called \emph{L-Face4RAG} with two novel designs of logic-preserving +answer decomposition and fact-logic FCE. Extensive experiments show L-Face4RAG +substantially outperforms previous methods for factual inconsistency detection +on a wide range of tasks, notably beyond the RAG task from which it is +originally motivated. Both the benchmark and our proposed method are publicly +available.\footnote{\url{https://huggingface.co/datasets/yq27/Face4RAG}\label{link_face4rag}} + +
+
+
+
+
+ + ♻ ☆ LACIE: Listener-Aware Finetuning for Confidence Calibration in Large + Language Models + + +
+ When answering questions, LLMs can convey not only an answer, but a level of +confidence about the answer being correct. This includes explicit confidence +markers (e.g. giving a numeric score) as well as implicit markers, like an +authoritative tone or elaborating with additional knowledge. For LLMs to be +trustworthy knowledge sources, the confidence they convey should match their +actual expertise; however, most current models tend towards overconfidence. To +calibrate both implicit and explicit confidence markers, we introduce a +pragmatic, listener-aware finetuning method (LACIE) that models the listener, +considering not only whether an answer is right, but whether it will be +accepted by a listener. We cast calibration as preference optimization, +creating data via a two-agent game, where a speaker model's outputs are judged +by a simulated listener. We then finetune three LLMs (Mistral-7B, Llama3-8B, +Llama3-70B) with LACIE, and show that the resulting models are better +calibrated w.r.t. a simulated listener. Crucially, these trends transfer to +human listeners, helping them correctly predict model correctness: we conduct a +human evaluation where annotators accept or reject an LLM's answers, finding +that training with LACIE results in 47% fewer incorrect answers being accepted +while maintaining the same level of acceptance for correct answers. +Furthermore, LACIE generalizes to another dataset, resulting in a large +increase in truthfulness on TruthfulQA when trained on TriviaQA. Our analysis +indicates that LACIE leads to a better confidence separation between correct +and incorrect examples. Qualitatively, we find that a LACIE-trained model +hedges more and implicitly signals certainty when it is correct by using an +authoritative tone or including details. Finally, LACIE finetuning leads to an +emergent increase in model abstention (e.g. saying "I don't know") for answers +that are likely wrong. + +
+
+ comment: 18 pages. Code: https://github.com/esteng/pragmatic_calibration +
+
+
+
+
+ + ♻ ☆ Towards Semantically Enriched Embeddings for Knowledge Graph Completion + + +
+ Embedding based Knowledge Graph (KG) Completion has gained much attention +over the past few years. Most of the current algorithms consider a KG as a +multidirectional labeled graph and lack the ability to capture the semantics +underlying the schematic information. In a separate development, a vast amount +of information has been captured within the Large Language Models (LLMs) which +has revolutionized the field of Artificial Intelligence. KGs could benefit from +these LLMs and vice versa. This vision paper discusses the existing algorithms +for KG completion based on the variations for generating KG embeddings. It +starts with discussing various KG completion algorithms such as transductive +and inductive link prediction and entity type prediction algorithms. It then +moves on to the algorithms utilizing type information within the KGs, LLMs, and +finally to algorithms capturing the semantics represented in different +description logic axioms. We conclude the paper with a critical reflection on +the current state of work in the community and give recommendations for future +directions. + +
+
+
+
+
+ + ♻ ☆ When Benchmarks are Targets: Revealing the Sensitivity of Large Language + Model Leaderboards ACL 2024 + + +
+ Large Language Model (LLM) leaderboards based on benchmark rankings are +regularly used to guide practitioners in model selection. Often, the published +leaderboard rankings are taken at face value - we show this is a (potentially +costly) mistake. Under existing leaderboards, the relative performance of LLMs +is highly sensitive to (often minute) details. We show that for popular +multiple-choice question benchmarks (e.g., MMLU), minor perturbations to the +benchmark, such as changing the order of choices or the method of answer +selection, result in changes in rankings up to 8 positions. We explain this +phenomenon by conducting systematic experiments over three broad categories of +benchmark perturbations and identifying the sources of this behavior. Our +analysis results in several best-practice recommendations, including the +advantage of a hybrid scoring method for answer selection. Our study highlights +the dangers of relying on simple benchmark evaluations and charts the path for +more robust evaluation schemes on the existing benchmarks. The code for this +paper is available at +https://github.com/National-Center-for-AI-Saudi-Arabia/lm-evaluation-harness. + +
+
+ comment: updated with ACL 2024 camera ready version +
+
+
+
+
+ + ♻ ☆ MedExQA: Medical Question Answering Benchmark with Multiple Explanations ACL2024 + + +
+ This paper introduces MedExQA, a novel benchmark in medical +question-answering, to evaluate large language models' (LLMs) understanding of +medical knowledge through explanations. By constructing datasets across five +distinct medical specialties that are underrepresented in current datasets and +further incorporating multiple explanations for each question-answer pair, we +address a major gap in current medical QA benchmarks which is the absence of +comprehensive assessments of LLMs' ability to generate nuanced medical +explanations. Our work highlights the importance of explainability in medical +LLMs, proposes an effective methodology for evaluating models beyond +classification accuracy, and sheds light on one specific domain, speech +language pathology, where current LLMs including GPT4 lack good understanding. +Our results show generation evaluation with multiple explanations aligns better +with human assessment, highlighting an opportunity for a more robust automated +comprehension assessment for LLMs. To diversify open-source medical LLMs +(currently mostly based on Llama2), this work also proposes a new medical +model, MedPhi-2, based on Phi-2 (2.7B). The model outperformed medical LLMs +based on Llama2-70B in generating explanations, showing its effectiveness in +the resource-constrained medical domain. We will share our benchmark datasets +and the trained model. + +
+
+ comment: Accepted to ACL2024 BioNLP Workshop +
+
+
+
+
+ + ♻ ☆ uDistil-Whisper: Label-Free Data Filtering for Knowledge Distillation + via Large-Scale Pseudo Labelling + + +
+ Recent work on distilling Whisper's knowledge into small models using
+pseudo-labels shows promising performance while reducing the size by up to
+50\%. This results in small, efficient, and dedicated models. However, a
+critical step of distillation from pseudo-labels involves filtering
+high-quality predictions and using only those during training. This step
+requires ground truth to compare against and to filter out bad examples,
+making the whole process supervised. In addition, the distillation process
+requires a large amount of data, thereby limiting the ability to distil
+models in low-resource settings. To address this challenge, we propose an
+unsupervised or label-free framework for distillation, thus eliminating the
+requirement for labeled data altogether. Through experimentation, we show
+that our best-distilled models outperform the teacher model by 5-7 points in
+terms of WER. Additionally, our models are on par with or better than similar
+supervised data filtering setups. When we scale the data, our models
+significantly outperform all zero-shot and supervised models. We demonstrate
+that it is possible to distill large Whisper models into relatively small
+models without using any labeled data. Our distilled models are 25-50\% more
+compute- and memory-efficient while maintaining performance equal to or
+better than the teacher model.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Delving into ChatGPT usage in academic writing through excess vocabulary + + +
+ Recent large language models (LLMs) can generate and revise text with
+human-level performance, and have been widely commercialized in systems like
+ChatGPT. These models come with clear limitations: they can produce inaccurate
+information, reinforce existing biases, and be easily misused. Yet, many
+scientists have been using them to assist their scholarly writing. How
+widespread is LLM usage in the academic literature currently? To answer this
+question, we use an unbiased, large-scale approach, free from any assumptions
+on academic LLM usage. We study vocabulary changes in 14 million PubMed
+abstracts from 2010-2024, and show how the appearance of LLMs led to an abrupt
+increase in the frequency of certain style words. Our analysis based on excess
+word usage suggests that at least 10% of 2024 abstracts were processed with
+LLMs. This lower bound differed across disciplines, countries, and journals,
+and was as high as 30% for some PubMed sub-corpora. We show that the appearance
+of LLM-based writing assistants has had an unprecedented impact on the
+scientific literature, surpassing the effect of major world events such as the
+Covid pandemic.
+
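+ As a rough illustration of the excess-usage idea (a minimal sketch with made-up
+frequencies, not the authors' data or pipeline), one can compare a style word's
+observed 2024 frequency against a linear extrapolation of its pre-LLM trend:
+```python
+# Minimal sketch with made-up numbers (not the authors' data or code):
+# "excess usage" of a style word = observed 2024 frequency minus the value
+# a linear pre-LLM trend (fit on 2010-2022) would predict.
+import numpy as np
+
+def excess_frequency(yearly_freq, target_year=2024, fit_until=2022):
+    years = np.array([y for y in sorted(yearly_freq) if y <= fit_until])
+    freqs = np.array([yearly_freq[y] for y in years])
+    slope, intercept = np.polyfit(years, freqs, deg=1)  # pre-LLM trend
+    expected = slope * target_year + intercept
+    return yearly_freq[target_year] - expected
+
+# Toy frequencies (occurrences per 10k abstracts) for a hypothetical word.
+freq = {2010: 1.0, 2014: 1.2, 2018: 1.4, 2022: 1.6, 2024: 9.0}
+print(round(excess_frequency(freq), 2))  # large positive value = excess usage
+```
+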
+
+ comment: v2: Updating dataset, figures and numbers to include all PubMed + abstracts until end of June 2024 +
+
+
+
+
+ + ♻ ☆ DocCGen: Document-based Controlled Code Generation + + +
+ Recent developments show that Large Language Models (LLMs) produce
+state-of-the-art performance on natural language (NL) to code generation for
+resource-rich general-purpose languages like C++, Java, and Python. However,
+their practical usage for structured domain-specific languages (DSLs) such as
+YAML and JSON is limited due to domain-specific schema, grammar, and
+customizations generally unseen by LLMs during pre-training. Efforts have been
+made to mitigate this challenge via in-context learning through relevant
+examples or by fine-tuning. However, these approaches suffer from problems
+such as limited DSL samples and prompt sensitivity, whereas enterprises
+typically maintain good documentation of their DSLs. Therefore, we propose
+DocCGen, a framework that can leverage such rich knowledge by breaking the
+NL-to-Code generation task for structured code languages into a two-step
+process. First, it detects the correct libraries using the library
+documentation that best matches the NL query. Then, it utilizes schema rules
+extracted from the documentation of these libraries to constrain the
+decoding. We evaluate our framework for two complex structured languages,
+Ansible YAML and Bash commands, across two settings: Out-of-domain (OOD) and
+In-domain (ID). Our extensive experiments show that DocCGen consistently
+improves different-sized language models across all six evaluation metrics,
+reducing syntactic and semantic errors in structured code. We plan to
+open-source the datasets and code to motivate research in constrained code
+generation.
+
+
+
+
+
+ + ♻ ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive ECCV 2024 + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs) +that focuses on core visual perception abilities not found in other +evaluations. Most of the Blink tasks can be solved by humans "within a blink" +(e.g., relative depth estimation, visual correspondence, forensics detection, +and multi-view reasoning). However, we find these perception-demanding tasks +cast significant challenges for current multimodal LLMs because they resist +mediation through natural language. Blink reformats 14 classic computer vision +tasks into 3,807 multiple-choice questions, paired with single or multiple +images and visual prompting. While humans get 95.70% accuracy on average, Blink +is surprisingly challenging for existing multimodal LLMs: even the +best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only +13.17% and 7.63% higher than random guessing, indicating that such perception +abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also +highlights that specialist CV models could solve these problems much better, +suggesting potential pathways for future improvements. We believe Blink will +stimulate the community to help multimodal LLMs catch up with human-level +visual perception. + +
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/, + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Fine-tuning Large Language Models with Sequential Instructions + + +
+ Despite the success of existing instruction-tuned models, we find that they +usually struggle to respond to queries with multiple instructions. This impairs +their performance in complex problems whose solution consists of multiple +intermediate tasks. Thus, we contend that part of the fine-tuning data mixture +should be sequential--containing a chain of interrelated tasks. We first +approach sequential instruction tuning from a task-driven perspective, manually +creating interpretable intermediate tasks for multilingual and visual question +answering: namely "translate then predict" and "caption then answer". Next, we +automate this process by turning instructions in existing datasets (e.g., +Alpaca and FlanCoT) into diverse and complex sequential instructions, making +our method general-purpose. Models that underwent our sequential instruction +tuning show improved results in coding, maths, and open-ended generation. +Moreover, we put forward a new benchmark named SeqEval to evaluate a model's +ability to follow all the instructions in a sequence, which further +corroborates the benefits of our fine-tuning method. We hope that our +endeavours will open new research avenues on instruction tuning for complex +tasks. + +
+
+ comment: 21 pages, 8 figures
+
+
+
+
+
+ + ♻ ☆ Pistis-RAG: A Scalable Cascading Framework Towards Content-Centric + Retrieval-Augmented Generation + + +
+ In Greek mythology, Pistis symbolized good faith, trust, and reliability. +Drawing inspiration from these principles, Pistis-RAG is a scalable multi-stage +framework designed to address the challenges of large-scale retrieval-augmented +generation (RAG) systems. This framework consists of distinct stages: matching, +pre-ranking, ranking, reasoning, and aggregating. Each stage contributes to +narrowing the search space, prioritizing semantically relevant documents, +aligning with the large language model's (LLM) preferences, supporting complex +chain-of-thought (CoT) methods, and combining information from multiple +sources. + Our ranking stage introduces a significant innovation by recognizing that +semantic relevance alone may not lead to improved generation quality, due to +the sensitivity of the few-shot prompt order, as noted in previous research. +This critical aspect is often overlooked in current RAG frameworks. + We argue that the alignment issue between LLMs and external knowledge ranking +methods is tied to the model-centric paradigm dominant in RAG systems. We +propose a content-centric approach, emphasizing seamless integration between +LLMs and external information sources to optimize content transformation for +specific tasks. + Our novel ranking stage is designed specifically for RAG systems, +incorporating principles of information retrieval while considering the unique +business scenarios reflected in LLM preferences and user feedback. We simulated +feedback signals on the MMLU benchmark, resulting in a 9.3% performance +improvement. Our model and code will be open-sourced on GitHub. Additionally, +experiments on real-world, large-scale data validate the scalability of our +framework. + +
+
+
+
+
+ + ♻ ☆ GraphWiz: An Instruction-Following Language Model for Graph Problems + + +
+ Large language models (LLMs) have achieved impressive success across several +fields, but their proficiency in understanding and resolving complex graph +problems is less explored. To bridge this gap, we introduce GraphInstruct, a +novel and comprehensive instruction-tuning dataset designed to equip language +models with the ability to tackle a broad spectrum of graph problems using +explicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an +open-source language model capable of resolving various graph problem types +while generating clear reasoning processes. To enhance the model's capability +and reliability, we incorporate the Direct Preference Optimization (DPO) +framework into the graph problem-solving context. The enhanced model, +GraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with +different complexity levels, surpassing GPT-4 which has an average accuracy of +43.8%. Moreover, our research delves into the delicate balance between training +data volume and model performance, highlighting the potential for overfitting +with increased data. We also explore the transferability of the model's +reasoning ability across different graph tasks, indicating the model's +adaptability and practical application potential. Our investigation offers a +new blueprint and valuable insights for developing LLMs specialized in graph +reasoning and problem-solving. + +
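+ The Direct Preference Optimization objective mentioned above has a standard
+closed form; the snippet below is a generic sketch of that loss (not the
+GraphWiz training code), taking summed sequence log-probabilities under the
+trained policy and a frozen reference model, with illustrative values.
+```python
+# Generic sketch of the Direct Preference Optimization (DPO) loss the paper
+# plugs into graph problem solving; not the GraphWiz training code.
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
+    # Advantage of each response relative to the frozen reference policy.
+    chosen_adv = logp_chosen - ref_logp_chosen
+    rejected_adv = logp_rejected - ref_logp_rejected
+    # Encourage the policy to prefer the chosen reasoning path.
+    return -F.logsigmoid(beta * (chosen_adv - rejected_adv)).mean()
+
+loss = dpo_loss(torch.tensor([-10.0]), torch.tensor([-14.0]),
+                torch.tensor([-11.0]), torch.tensor([-13.0]))
+print(loss)
+```
+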
+
+ comment: 27 pages, 15 tables
+
+
+
+
+
+ + ♻ ☆ Learning and Forgetting Unsafe Examples in Large Language Models ICML 24 + + +
+ As the number of large language models (LLMs) released to the public grows,
+there is a pressing need to understand the safety implications associated with
+these models learning from third-party custom finetuning data. We explore the
+behavior of LLMs finetuned on noisy custom data containing unsafe content,
+represented by datasets that contain biases, toxicity, and harmfulness, finding
+that while aligned LLMs can readily learn this unsafe content, they also tend
+to forget it more significantly than other examples when subsequently finetuned
+on safer content. Drawing inspiration from the discrepancies in forgetting, we
+introduce the "ForgetFilter" algorithm, which filters unsafe data based on how
+strong the model's forgetting signal is for that data. We demonstrate that the
+ForgetFilter algorithm ensures safety in customized finetuning without
+compromising downstream task performance, unlike sequential safety finetuning.
+ForgetFilter outperforms alternative strategies like replay and moral
+self-correction in curbing LLMs' ability to assimilate unsafe content during
+custom finetuning, e.g., achieving a toxicity score 75% lower than applying no
+safety measures and 62% lower than using self-correction.
+
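+ A minimal sketch of the filtering idea described above (assumed interface and
+toy numbers, not the released implementation): score each custom example by how
+much its loss rises after a round of safety finetuning, and drop the
+most-forgotten ones.
+```python
+# Assumed sketch of the ForgetFilter idea, not the authors' code: examples
+# whose loss increases most after safety finetuning show a strong
+# "forgetting signal" and are filtered out of the custom finetuning set.
+def forget_scores(loss_before, loss_after):
+    return [after - before for before, after in zip(loss_before, loss_after)]
+
+def filter_examples(examples, loss_before, loss_after, threshold):
+    scores = forget_scores(loss_before, loss_after)
+    return [ex for ex, s in zip(examples, scores) if s < threshold]
+
+examples = ["example_0", "example_1", "example_2"]
+kept = filter_examples(examples, [1.2, 0.9, 1.1], [1.3, 2.4, 1.2], threshold=0.5)
+print(kept)  # example_1 is dropped: its loss jumped the most after safe finetuning
+```
+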
+
+ comment: accepted by ICML 24 +
+
+
+
+
+ + ♻ ☆ ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity + within Large Language Models + + +
+ Activation sparsity refers to the existence of considerable +weakly-contributed elements among activation outputs. As a prevalent property +of the models using the ReLU activation function, activation sparsity has been +proven a promising paradigm to boost model inference efficiency. Nevertheless, +most large language models (LLMs) adopt activation functions without intrinsic +activation sparsity (e.g., GELU and Swish). Some recent efforts have explored +introducing ReLU or its variants as the substitutive activation function to +help LLMs achieve activation sparsity and inference acceleration, but few can +simultaneously obtain high sparsity and comparable model performance. This +paper introduces a simple and effective sparsification method named "ProSparse" +to push LLMs for higher activation sparsity while maintaining comparable +performance. Specifically, after substituting the activation function of LLMs +with ReLU, ProSparse adopts progressive sparsity regularization with a factor +smoothly increasing along the multi-stage sine curves. This can enhance +activation sparsity and mitigate performance degradation by avoiding radical +shifts in activation distributions. With ProSparse, we obtain high sparsity of +89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size +MiniCPM-1B, respectively, achieving comparable performance to their original +Swish-activated versions. These present the most sparsely activated models +among open-source LLaMA versions and competitive end-size models, considerably +surpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). Our inference +acceleration experiments further demonstrate the significant practical +acceleration potential of LLMs with higher activation sparsity, obtaining up to +4.52$\times$ inference speedup. + +
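+ The progressive regularization described above can be pictured as a factor that
+rises smoothly within each training stage; the sketch below shows one such
+sine-shaped ramp (the exact schedule, stage lengths, and constants in the paper
+may differ from these assumed values).
+```python
+# Sketch of a progressive regularization factor that rises smoothly along a
+# sine curve within a training stage, in the spirit of ProSparse; the exact
+# schedule and constants in the paper may differ.
+import math
+
+def sparsity_factor(step, stage_steps, lambda_start, lambda_end):
+    t = min(step, stage_steps) / stage_steps   # progress within the stage, in [0, 1]
+    ramp = math.sin(0.5 * math.pi * t)         # smooth 0 -> 1 along a sine curve
+    return lambda_start + (lambda_end - lambda_start) * ramp
+
+for s in (0, 250, 500, 1000):
+    print(s, round(sparsity_factor(s, 1000, 0.0, 1e-4), 6))
+```
+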
+
+ comment: 19 pages, 4 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Synergizing In-context Learning with Hints for End-to-end Task-oriented + Dialog Systems + + +
+ End-to-end Task-Oriented Dialog (TOD) systems typically require extensive +training datasets to perform well. In contrast, large language model (LLM) +based TOD systems can excel even with limited data due to their ability to +learn tasks through in-context exemplars. However, these models lack alignment +with the style of responses in training data and often generate comprehensive +responses, making it difficult for users to grasp the information quickly. In +response, we propose SyncTOD that synergizes LLMs with task-specific hints to +improve alignment in low-data settings. SyncTOD employs small auxiliary models +to provide hints and select exemplars for in-context prompts. With ChatGPT, +SyncTOD achieves superior performance compared to LLM-based baselines and SoTA +models in low-data settings, while retaining competitive performance in +full-data settings. + +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models with Human Preferences through + Representation Engineering + + +
+ Aligning large language models (LLMs) with human preferences is crucial for
+enhancing their utility in terms of helpfulness, truthfulness, safety,
+harmlessness, and interestingness. Existing methods for achieving this
+alignment often involve employing reinforcement learning from human feedback
+(RLHF) to fine-tune LLMs based on human labels assessing the relative quality
+of model responses. Nevertheless, RLHF is susceptible to instability during
+fine-tuning and presents challenges in implementation. Drawing inspiration from
+the emerging field of representation engineering (RepE), this study aims to
+identify relevant representations for high-level human preferences embedded in
+patterns of activity within an LLM, and achieve precise control of model
+behavior by transforming its representations. This novel approach, denoted as
+Representation Alignment from Human Feedback (RAHF), proves to be effective,
+computationally efficient, and easy to implement. Extensive experiments
+demonstrate the efficacy of RAHF in not only capturing but also manipulating
+representations to align with a broad spectrum of human preferences or values,
+rather than being confined to a singular concept or function (e.g., honesty or
+bias). RAHF's versatility in accommodating diverse human preferences shows its
+potential for advancing LLM performance.
+
+
+
+
+
+ + ♻ ☆ Evaluation of Retrieval-Augmented Generation: A Survey + + +
+ Retrieval-Augmented Generation (RAG) has recently gained traction in natural +language processing. Numerous studies and real-world applications are +leveraging its ability to enhance generative models through external +information retrieval. Evaluating these RAG systems, however, poses unique +challenges due to their hybrid structure and reliance on dynamic knowledge +sources. To better understand these challenges, we conduct A Unified Evaluation +Process of RAG (Auepora) and aim to provide a comprehensive overview of the +evaluation and benchmarks of RAG systems. Specifically, we examine and compare +several quantifiable metrics of the Retrieval and Generation components, such +as relevance, accuracy, and faithfulness, within the current RAG benchmarks, +encompassing the possible output and ground truth pairs. We then analyze the +various datasets and metrics, discuss the limitations of current benchmarks, +and suggest potential directions to advance the field of RAG benchmarks. + +
+
+
+
+
+ + ♻ ☆ DistiLLM: Towards Streamlined Distillation for Large Language Models ICML 2024 + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to +a smaller student model, reducing its inference cost and memory footprint while +preserving model capabilities. However, current KD methods for auto-regressive +sequence models (e.g., large language models) suffer from missing a +standardized objective function. Moreover, the recent use of student-generated +outputs to address training-inference mismatches has significantly escalated +computational costs. To tackle these issues, we introduce DistiLLM, a more +effective and efficient KD framework for auto-regressive language models. +DistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence +loss, where we unveil and leverage its theoretical properties, and (2) an +adaptive off-policy approach designed to enhance the efficiency in utilizing +student-generated outputs. Extensive experiments, including +instruction-following tasks, demonstrate the effectiveness of DistiLLM in +building high-performing student models while achieving up to 4.3$\times$ +speedup compared to recent KD methods. + +
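+ The skew Kullback-Leibler divergence at the core of DistiLLM mixes one
+distribution into the other before taking KL; the snippet below is a hedged
+sketch of that general form with random logits (the paper's exact skewing
+direction and weighting may differ).
+```python
+# Hedged sketch of a skew Kullback-Leibler divergence, the general form the
+# DistiLLM loss builds on; the paper's exact skewing direction and weights
+# may differ from this illustration.
+import torch
+
+def skew_kl(p_logits, q_logits, alpha=0.1):
+    # KL(p || alpha * p + (1 - alpha) * q), averaged over the batch.
+    p = p_logits.softmax(dim=-1)
+    q = q_logits.softmax(dim=-1)
+    mix = alpha * p + (1.0 - alpha) * q
+    log_ratio = p.clamp_min(1e-12).log() - mix.clamp_min(1e-12).log()
+    return (p * log_ratio).sum(dim=-1).mean()
+
+teacher_logits = torch.randn(4, 32000)  # e.g. teacher and student vocab logits
+student_logits = torch.randn(4, 32000)
+print(skew_kl(teacher_logits, student_logits))
+```
+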
+
+ comment: ICML 2024; Code is available at https://github.com/jongwooko/distillm +
+
+
+
+
+ + ♻ ☆ Decomposition for Enhancing Attention: Improving LLM-based Text-to-SQL + through Workflow Paradigm + + +
+ In-context learning of large language models (LLMs) has achieved remarkable
+success in the field of natural language processing, but extensive case
+studies reveal that the single-step chain-of-thought prompting approach faces
+challenges such as attention diffusion and inadequate performance in complex
+tasks like text-to-SQL. To improve the contextual learning capabilities of LLMs
+in text-to-SQL, a workflow paradigm method is proposed, aiming to enhance the
+attention and problem-solving scope of LLMs through decomposition.
+Specifically, the information determination module for eliminating redundant
+information and the brand-new prompt structure based on problem classification
+greatly enhance the model's attention. Additionally, the inclusion of
+self-correction and active learning modules greatly expands the problem-solving
+scope of LLMs, hence improving the upper limit of LLM-based approaches. We
+conduct extensive experiments on three datasets and find that our approach
+outperforms other methods by a significant margin: it improves on the existing
+baseline by about 2-3 percentage points on the Spider Dev, Spider-Realistic,
+and Bird Dev datasets, and achieves new SOTA results on the Spider Test
+dataset. Our code is available on GitHub:
+https://github.com/FlyingFeather/DEA-SQL.
+
+
+
+
+
+ + ♻ ☆ PRD: Peer Rank and Discussion Improve Large Language Model based + Evaluations + + +
+ Nowadays, the quality of responses generated by different modern large +language models (LLMs) is hard to evaluate and compare automatically. Recent +studies suggest and predominantly use LLMs for reference-free evaluation of +open-ended question answering. More specifically, they use the recognized +"strongest" LLM as the evaluator, which conducts pairwise comparisons of +candidate models' answers and provides a ranking score. However, this intuitive +method has multiple problems, such as bringing in self-enhancement (favoring +its own answers) and positional bias. We draw insights and lessons from the +educational domain (Cho & MacArthur, 2011; Walsh, 2014) to improve LLM-based +evaluations. Specifically, we propose (1) the peer rank (PR) algorithm that +takes into account each peer LLM's pairwise preferences of all answer pairs, +and outputs a final ranking of models; and (2) peer discussion (PD), where we +prompt two LLMs to discuss and try to reach a mutual agreement on the +preferences of two answers. We conduct experiments on two benchmark datasets. +We find that our approaches achieve higher accuracy and align better with human +judgments. Interestingly, PR can induce a relatively accurate self-ranking of +models under the anonymous setting, where each model's name is unrevealed. Our +work provides space to explore evaluating models that are hard to compare for +humans. + +
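+ To make the peer-rank idea concrete, the toy sketch below aggregates pairwise
+preferences from several reviewer models into a single ranking; the reviewer
+names and preference values are made up, and the paper's PR algorithm
+additionally weights each reviewer, which is omitted here.
+```python
+# Toy sketch of peer-rank-style aggregation with made-up preferences; the
+# actual PR algorithm also weights each reviewer, which is omitted here.
+from collections import defaultdict
+
+# wins[reviewer][(a, b)] = 1.0 if the reviewer prefers a over b, 0.5 for a tie.
+wins = {
+    "reviewer_A": {("model_x", "model_y"): 1.0, ("model_y", "model_z"): 0.0},
+    "reviewer_B": {("model_x", "model_y"): 1.0, ("model_y", "model_z"): 0.5},
+    "reviewer_C": {("model_x", "model_y"): 0.5, ("model_y", "model_z"): 1.0},
+}
+
+scores = defaultdict(float)
+for reviewer, prefs in wins.items():
+    for (a, b), w in prefs.items():
+        scores[a] += w          # credit the preferred candidate
+        scores[b] += 1.0 - w
+
+print(sorted(scores, key=scores.get, reverse=True))  # peer-aggregated ranking
+```
+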
+
+ comment: Accepted by TMLR +
+
+
+
+
+ + ♻ ☆ Reasoning Runtime Behavior of a Program with LLM: How Far Are We? ICSE 2025 + + +
+ Large language models for code (i.e., code LLMs) have shown strong code
+understanding and generation capabilities. To evaluate the capabilities of code
+LLMs in various aspects, many benchmarks have been proposed (e.g., HumanEval
+and ClassEval). Code reasoning is one of the most essential abilities of code
+LLMs, but existing benchmarks for code reasoning are not sufficient. Typically,
+they focus on predicting the input and output of a program, ignoring the
+evaluation of the intermediate behavior during program execution, as well as
+the logical consistency (e.g., the model should not give the correct output if
+the prediction of the execution path is wrong) when performing the reasoning.
+To address these problems, in this paper, we propose a framework, namely REval,
+for evaluating code reasoning abilities and consistency of code LLMs with
+program execution. We utilize existing code benchmarks and adapt them to new
+benchmarks within our framework. A large-scale empirical study is conducted and
+most LLMs show unsatisfactory performance on both Runtime Behavior Reasoning
+(i.e., an average accuracy of 44.4%) and Incremental Consistency Evaluation
+(i.e., an average IC score of 10.3). Evaluation results of current code LLMs
+reflect the urgent need for the community to strengthen the code reasoning
+capability of code LLMs. Our code, data, and REval leaderboard are available
+at https://r-eval.github.io.
+
+
+ comment: Accepted by ICSE 2025 and this is our preprint version. Our REval + leaderboard is available at https://r-eval.github.io +
+
+
+
+
+ + ♻ ☆ Losing Visual Needles in Image Haystacks: Vision Language Models are + Easily Distracted in Short and Long Contexts + + +
+ We present LoCoVQA, a dynamic benchmark generator for evaluating long-context +extractive reasoning in vision language models (VLMs). LoCoVQA augments test +examples for mathematical reasoning, VQA, and character recognition tasks with +increasingly long visual contexts composed of both in-distribution and +out-of-distribution distractor images. + Across these tasks, a diverse set of VLMs rapidly lose performance as the +visual context length grows, often exhibiting a striking logarithmic decay +trend. This test assesses how well VLMs can ignore irrelevant information when +answering queries -- a task that is quite easy for language models (LMs) in the +text domain -- demonstrating that current state-of-the-art VLMs lack this +essential capability for many long-context applications. + +
+
+ comment: Under review. Minor errata correction in revision +
+
+
+
+
+ + ♻ ☆ Transformers and Cortical Waves: Encoders for Pulling In Context Across + Time + + +
+ The capabilities of transformer networks such as ChatGPT and other Large +Language Models (LLMs) have captured the world's attention. The crucial +computational mechanism underlying their performance relies on transforming a +complete input sequence - for example, all the words in a sentence - into a +long "encoding vector" that allows transformers to learn long-range temporal +dependencies in naturalistic sequences. Specifically, "self-attention" applied +to this encoding vector enhances temporal context in transformers by computing +associations between pairs of words in the input sequence. We suggest that +waves of neural activity traveling across single cortical areas or multiple +regions at the whole-brain scale could implement a similar encoding principle. +By encapsulating recent input history into a single spatial pattern at each +moment in time, cortical waves may enable temporal context to be extracted from +sequences of sensory inputs, the same computational principle used in +transformers. + +
+
+ comment: 25 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ FRoG: Evaluating Fuzzy Reasoning of Generalized Quantifiers in Large + Language Models + + +
+ Fuzzy reasoning is vital due to the frequent use of imprecise information in +daily contexts. However, the ability of current large language models (LLMs) to +handle such reasoning remains largely uncharted. In this paper, we introduce a +new benchmark, FRoG, for fuzzy reasoning, featuring real-world mathematical +word problems that incorporate generalized quantifiers. Our experimental +findings reveal that fuzzy reasoning continues to pose significant challenges +for LLMs. Moreover, we find that existing methods designed to enhance reasoning +do not consistently improve performance in tasks involving fuzzy logic. +Additionally, our results show an inverse scaling effect in the performance of +LLMs on FRoG. Interestingly, we also demonstrate that strong mathematical +reasoning skills are not necessarily indicative of success on our benchmark. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +process lacks transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +require annotations or additional training data. The injection of the extracted +knowledge can be achieved by the addition of simple neural modules. We employ +the Convex Polytopic Model (CPM) as a feature extraction tool for DST tasks and +illustrate that the acquired features correlate with syntactic and semantic +patterns in the dialogues. This correlation facilitates a comprehensive +understanding of the linguistic features influencing the DST model's +decision-making process. We benchmark this framework on various DST tasks and +observe a notable improvement in accuracy. + +
+
+ comment: Accepted for publication at IEEE Access +
+
+
+
+
+ + ♻ ☆ Sketch-Guided Constrained Decoding for Boosting Blackbox Large Language + Models without Logit Access ACL 2024 + + +
+ Constrained decoding, a technique for enforcing constraints on language model +outputs, offers a way to control text generation without retraining or +architectural modifications. Its application is, however, typically restricted +to models that give users access to next-token distributions (usually via +softmax logits), which poses a limitation with blackbox large language models +(LLMs). This paper introduces sketch-guided constrained decoding (SGCD), a +novel approach to constrained decoding for blackbox LLMs, which operates +without access to the logits of the blackbox LLM. SGCD utilizes a locally +hosted auxiliary model to refine the output of an unconstrained blackbox LLM, +effectively treating this initial output as a "sketch" for further elaboration. +This approach is complementary to traditional logit-based techniques and +enables the application of constrained decoding in settings where full model +transparency is unavailable. We demonstrate the efficacy of SGCD through +experiments in closed information extraction and constituency parsing, showing +how it enhances the utility and flexibility of blackbox LLMs for complex NLP +tasks. + +
+
+ comment: Accepted to ACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Enabling Discriminative Reasoning in LLMs for Legal Judgment Prediction + + +
+ Legal judgment prediction is essential for enhancing judicial efficiency. In +this work, we identify that existing large language models (LLMs) underperform +in this domain due to challenges in understanding case complexities and +distinguishing between similar charges. To adapt LLMs for effective legal +judgment prediction, we introduce the Ask-Discriminate-Predict (ADAPT) +reasoning framework inspired by human judicial reasoning. ADAPT involves +decomposing case facts, discriminating among potential charges, and predicting +the final judgment. We further enhance LLMs through fine-tuning with multi-task +synthetic trajectories to improve legal judgment prediction accuracy and +efficiency under our ADAPT framework. Extensive experiments conducted on two +widely-used datasets demonstrate the superior performance of our framework in +legal judgment prediction, particularly when dealing with complex and confusing +charges. + +
+
+
+
+
+ + ♻ ☆ A Framework For Refining Text Classification and Object Recognition from + Academic Articles + + +
+ With the widespread use of the internet, it has become increasingly crucial
+to extract specific information from vast amounts of academic articles
+efficiently. Data mining techniques are generally employed to solve this issue.
+However, data mining for academic articles is challenging since it requires
+automatically extracting specific patterns in complex and unstructured layout
+documents. Current data mining methods for academic articles employ
+rule-based (RB) or machine learning (ML) approaches. However, using rule-based
+methods incurs a high coding cost for articles with complex typesetting. On the
+other hand, simply using machine learning methods requires annotation work for
+complex content types within the paper, which can be costly. Furthermore, only
+using machine learning can lead to cases where patterns easily recognized by
+rule-based methods are mistakenly extracted. To overcome these issues, from the
+perspective of analyzing the standard layout and typesetting used in the
+specified publication, we emphasize implementing specific methods for specific
+characteristics in academic articles. We have developed a novel Text Block
+Refinement Framework (TBRF), a hybrid of machine learning and rule-based
+schemes. We used the well-known ACL proceedings articles as experimental data
+for the validation experiment. The experiment shows that our approach achieved
+over 95% classification accuracy and 90% detection accuracy for tables and
+figures.
+
+
+ comment: This paper has been accepted at 'The International Symposium on + Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)' +
+
+
+
+
+ + ♻ ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey + + +
+ Research surveys have always posed a challenge for beginner researchers who
+lack research training. These researchers struggle to understand the
+directions within their research topic and to discover new research findings
+within a short time. One way to provide intuitive assistance to beginner
+researchers is by offering relevant knowledge graphs (KGs) and recommending
+related academic papers. However, existing navigation knowledge graphs
+primarily rely on keywords in the research field and often fail to present the
+logical hierarchy among multiple related papers clearly. Moreover, most
+recommendation systems for academic papers simply rely on high text
+similarity, which can leave researchers confused as to why a particular article
+is being recommended. They may fail to grasp important information about the
+insight connection between "Issue resolved" and "Issue finding" that they hope
+to obtain. To address these issues, this study aims to support research insight
+surveys for beginner researchers by establishing a hierarchical tree-structured
+knowledge graph that reflects the inheritance insight of research topics and
+the relevance insight among the academic papers.
+
+
+ comment: This paper has been accepted by 'The 18TH International Conference on + INnovations in Intelligent SysTems and Applications (INISTA 2024)' +
+
+
+
+
+ + ♻ ☆ Prompt Engineering a Prompt Engineer ACL 2024 + + +
+ Prompt engineering is a challenging yet crucial task for optimizing the +performance of large language models on customized tasks. It requires complex +reasoning to examine the model's errors, hypothesize what is missing or +misleading in the current prompt, and communicate the task with clarity. While +recent works indicate that large language models can be meta-prompted to +perform automatic prompt engineering, we argue that their potential is limited +due to insufficient guidance for complex reasoning in the meta-prompt. We fill +this gap by infusing into the meta-prompt three key components: detailed +descriptions, context specification, and a step-by-step reasoning template. The +resulting method, named PE2, exhibits remarkable versatility across diverse +language tasks. It finds prompts that outperform "let's think step by step" by +6.3% on MultiArith and 3.1% on GSM8K, and outperforms competitive baselines on +counterfactual tasks by 6.9%. Further, we show that PE2 can make targeted and +highly specific prompt edits, rectify erroneous prompts, and induce multi-step +plans for complex tasks. + +
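+ The three meta-prompt components named above can be pictured as slots in a
+template; the wording and placeholder names below are invented for illustration
+and are not the released PE2 prompt.
+```python
+# Invented illustration of the three meta-prompt components the abstract
+# names (detailed description, context specification, step-by-step reasoning
+# template); this is not the released PE2 prompt.
+meta_prompt = """You are improving a task prompt.
+
+## Detailed task description
+{task_description}
+
+## Context specification
+The prompt is prepended to each input; inputs look like: {input_format}
+
+## Step-by-step reasoning template
+1. Examine the failing examples and describe what the current prompt gets wrong.
+2. Hypothesize what is missing or misleading in the prompt.
+3. Propose a revised prompt that fixes the issue.
+
+Failing examples:
+{failures}
+
+Current prompt:
+{current_prompt}
+"""
+
+print(meta_prompt.format(task_description="Solve grade-school math word problems.",
+                         input_format="a short word problem ending in a question",
+                         failures="(omitted)",
+                         current_prompt="Let's think step by step."))
+```
+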
+
+ comment: Accepted to ACL 2024 Findings. Camera-ready version +
+
+
+
+
+ + ♻ ☆ Learning Action Conditions from Instructional Manuals for Instruction + Understanding + + +
+ The ability to infer pre- and postconditions of an action is vital for
+comprehending complex instructions, and is essential for applications such as
+autonomous instruction-guided agents and assistive AI that supports humans in
+performing physical tasks. In this work, we propose a task dubbed action
+condition inference, and collect a high-quality, human-annotated dataset of
+preconditions and postconditions of actions in instructional manuals. We
+propose a weakly supervised approach to automatically construct large-scale
+training instances from online instructional manuals, and curate a densely
+human-annotated and validated dataset to study how well the current NLP models
+can infer action-condition dependencies in the instruction texts. We design two
+types of models that differ by whether contextualized and global information is
+leveraged, as well as various combinations of heuristics to construct the weak
+supervision. Our experimental results show a >20% F1-score improvement when
+considering the entire instruction contexts and a >6% F1-score benefit from the
+proposed heuristics.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ InternLM-XComposer-2.5: A Versatile Large Vision Language Model + Supporting Long-Contextual Input and Output + + +
+ We present InternLM-XComposer-2.5 (IXC-2.5), a versatile large-vision +language model that supports long-contextual input and output. IXC-2.5 excels +in various text-image comprehension and composition applications, achieving +GPT-4V level capabilities with merely 7B LLM backend. Trained with 24K +interleaved image-text contexts, it can seamlessly extend to 96K long contexts +via RoPE extrapolation. This long-context capability allows IXC-2.5 to excel in +tasks requiring extensive input and output contexts. Compared to its previous +2.0 version, InternLM-XComposer-2.5 features three major upgrades in +vision-language comprehension: (1) Ultra-High Resolution Understanding, (2) +Fine-Grained Video Understanding, and (3) Multi-Turn Multi-Image Dialogue. In +addition to comprehension, IXC-2.5 extends to two compelling applications using +extra LoRA parameters for text-image composition: (1) Crafting Webpages and (2) +Composing High-Quality Text-Image Articles. IXC-2.5 has been evaluated on 28 +benchmarks, outperforming existing open-source state-of-the-art models on 16 +benchmarks. It also surpasses or competes closely with GPT-4V and Gemini Pro on +16 key tasks. The InternLM-XComposer-2.5 is publicly available at +https://github.com/InternLM/InternLM-XComposer. + +
+
+ comment: Technical Report. https://github.com/InternLM/InternLM-XComposer +
+
+
+
+
+ + ☆ BACON: Supercharge Your VLM with Bag-of-Concept Graph to Mitigate + Hallucinations + + +
+ This paper presents Bag-of-Concept Graph (BACON), which gives models with
+limited linguistic abilities access to the capabilities of Vision Language
+Models (VLMs) and boosts downstream tasks such as detection, visual question
+answering (VQA), and image generation. Since the visual scenes in physical
+worlds are structured with complex relations between objects, BACON breaks down
+annotations into basic minimum elements and presents them in a graph structure.
+The element-wise style enables easy understanding, and the structural
+composition eases otherwise difficult localization. Careful prompt design
+produces the BACON captions with the help of publicly available VLMs and
+segmentation methods. In this way, we gather a dataset with 100K annotated
+images, which endows VLMs with remarkable capabilities, such as accurately
+generating BACON, transforming prompts into BACON format, envisioning scenarios
+in the style of BACON, dynamically modifying elements within BACON through
+interactive dialogue, and more. Wide-ranging representative experiments,
+including detection, VQA, and image generation tasks, show BACON to be a
+lifeline for achieving previously out-of-reach tasks or excelling over current
+cutting-edge solutions.
+
+
+
+
+
+ + ☆ HoloHisto: End-to-end Gigapixel WSI Segmentation with 4K Resolution + Sequential Tokenization + + +
+ In digital pathology, the traditional method for deep learning-based image +segmentation typically involves a two-stage process: initially segmenting +high-resolution whole slide images (WSI) into smaller patches (e.g., 256x256, +512x512, 1024x1024) and subsequently reconstructing them to their original +scale. This method often struggles to capture the complex details and vast +scope of WSIs. In this paper, we propose the holistic histopathology +(HoloHisto) segmentation method to achieve end-to-end segmentation on gigapixel +WSIs, whose maximum resolution is above 80,000$\times$70,000 pixels. HoloHisto +fundamentally shifts the paradigm of WSI segmentation to an end-to-end learning +fashion with 1) a large (4K) resolution base patch for elevated visual +information inclusion and efficient processing, and 2) a novel sequential +tokenization mechanism to properly model the contextual relationships and +efficiently model the rich information from the 4K input. To our best +knowledge, HoloHisto presents the first holistic approach for gigapixel +resolution WSI segmentation, supporting direct I/O of complete WSI and their +corresponding gigapixel masks. Under the HoloHisto platform, we unveil a random +4K sampler that transcends ultra-high resolution, delivering 31 and 10 times +more pixels than standard 2D and 3D patches, respectively, for advancing +computational capabilities. To facilitate efficient 4K resolution dense +prediction, we leverage sequential tokenization, utilizing a pre-trained image +tokenizer to group image features into a discrete token grid. To assess the +performance, our team curated a new kidney pathology image segmentation (KPIs) +dataset with WSI-level glomeruli segmentation from whole mouse kidneys. From +the results, HoloHisto-4K delivers remarkable performance gains over previous +state-of-the-art models. + +
+
+
+
+
+ + ☆ Smart City Surveillance Unveiling Indian Person Attributes in Real Time + + +
+ This project focuses on creating a smart surveillance system for Indian
+cities that can identify and analyze people's attributes in real time. Using
+advanced technologies like artificial intelligence and machine learning, the
+system can recognize attributes such as upper-body color, clothing,
+accessories, and headgear, and can analyze behavior through cameras installed
+around the city.
+
+
+ comment: 6 pages, 8 figures
+
+
+
+
+
+ + ☆ DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents + + +
+ Diffusion models (DMs) have revolutionized generative learning. They utilize +a diffusion process to encode data into a simple Gaussian distribution. +However, encoding a complex, potentially multimodal data distribution into a +single continuous Gaussian distribution arguably represents an unnecessarily +challenging learning problem. We propose Discrete-Continuous Latent Variable +Diffusion Models (DisCo-Diff) to simplify this task by introducing +complementary discrete latent variables. We augment DMs with learnable discrete +latents, inferred with an encoder, and train DM and encoder end-to-end. +DisCo-Diff does not rely on pre-trained networks, making the framework +universally applicable. The discrete latents significantly simplify learning +the DM's complex noise-to-data mapping by reducing the curvature of the DM's +generative ODE. An additional autoregressive transformer models the +distribution of the discrete latents, a simple step because DisCo-Diff requires +only few discrete variables with small codebooks. We validate DisCo-Diff on toy +data, several image synthesis tasks as well as molecular docking, and find that +introducing discrete latents consistently improves model performance. For +example, DisCo-Diff achieves state-of-the-art FID scores on class-conditioned +ImageNet-64/128 datasets with ODE sampler. + +
+
+ comment: project page: https://research.nvidia.com/labs/lpr/disco-diff +
+
+
+
+
+ + ☆ Improved Noise Schedule for Diffusion Training + + +
+ Diffusion models have emerged as the de facto choice for generating visual +signals. However, training a single model to predict noise across various +levels poses significant challenges, necessitating numerous iterations and +incurring significant computational costs. Various approaches, such as loss +weighting strategy design and architectural refinements, have been introduced +to expedite convergence. In this study, we propose a novel approach to design +the noise schedule for enhancing the training of diffusion models. Our key +insight is that the importance sampling of the logarithm of the Signal-to-Noise +ratio (logSNR), theoretically equivalent to a modified noise schedule, is +particularly beneficial for training efficiency when increasing the sample +frequency around $\log \text{SNR}=0$. We empirically demonstrate the +superiority of our noise schedule over the standard cosine schedule. +Furthermore, we highlight the advantages of our noise schedule design on the +ImageNet benchmark, showing that the designed schedule consistently benefits +different prediction targets. + +
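+ A sketch under assumptions of the sampling idea described above: draw logSNR
+values from a density peaked at zero (a Laplace distribution stands in here for
+the paper's exact sampling density, which may differ) and convert them to the
+usual variance-preserving noise levels.
+```python
+# Sketch under assumptions: the paper's insight is to sample logSNR values
+# more densely around 0 during training. Here a Laplace distribution peaked
+# at 0 stands in for the paper's exact sampling density, which may differ.
+import torch
+
+def sample_logsnr(batch, loc=0.0, scale=2.0):
+    # Draw logSNR concentrated around `loc` instead of uniformly over time.
+    return torch.distributions.Laplace(loc, scale).sample((batch,))
+
+def logsnr_to_alpha_sigma(logsnr):
+    # Variance-preserving parameterization: alpha^2 + sigma^2 = 1,
+    # with SNR = alpha^2 / sigma^2, so alpha^2 = sigmoid(logSNR).
+    alpha_sq = torch.sigmoid(logsnr)
+    return alpha_sq.sqrt(), (1.0 - alpha_sq).sqrt()
+
+logsnr = sample_logsnr(8)
+alpha, sigma = logsnr_to_alpha_sigma(logsnr)
+print(logsnr, alpha, sigma)
+```
+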
+
+
+
+
+ + ☆ Biomechanics-informed Non-rigid Medical Image Registration and its + Inverse Material Property Estimation with Linear and Nonlinear Elasticity + + +
+ This paper investigates both biomechanical-constrained non-rigid medical +image registrations and accurate identifications of material properties for +soft tissues, using physics-informed neural networks (PINNs). The complex +nonlinear elasticity theory is leveraged to formally establish the partial +differential equations (PDEs) representing physics laws of biomechanical +constraints that need to be satisfied, with which registration and +identification tasks are treated as forward (i.e., data-driven solutions of +PDEs) and inverse (i.e., parameter estimation) problems under PINNs +respectively. Two net configurations (i.e., Cfg1 and Cfg2) have also been +compared for both linear and nonlinear physics model. Two sets of experiments +have been conducted, using pairs of undeformed and deformed MR images from +clinical cases of prostate cancer biopsy. + Our contributions are summarised as follows. 1) We developed a learning-based +biomechanical-constrained non-rigid registration algorithm using PINNs, where +linear elasticity is generalised to the nonlinear version. 2) We demonstrated +extensively that nonlinear elasticity shows no statistical significance against +linear models in computing point-wise displacement vectors but their respective +benefits may depend on specific patients, with finite-element (FE) computed +ground-truth. 3) We formulated and solved the inverse parameter estimation +problem, under the joint optimisation scheme of registration and parameter +identification using PINNs, whose solutions can be accurately found by locating +saddle points. + +
+
+
+
+
+ + ☆ VCHAR:Variance-Driven Complex Human Activity Recognition framework with + Generative Representation + + +
+ Complex human activity recognition (CHAR) remains a pivotal challenge within
+ubiquitous computing, especially in the context of smart environments. Existing
+studies typically require meticulous labeling of both atomic and complex
+activities, a task that is labor-intensive and prone to errors due to the
+scarcity and inaccuracies of available datasets. Most prior research has
+focused on datasets that precisely label atomic activities or, at minimum,
+their sequence, an approach that is often impractical in real-world
+settings. In response, we introduce VCHAR (Variance-Driven Complex Human
+Activity Recognition), a novel framework that treats the outputs of atomic
+activities as a distribution over specified intervals. Leveraging generative
+methodologies, VCHAR elucidates the reasoning behind complex activity
+classifications through video-based explanations, accessible to users without
+prior machine learning expertise. Our evaluation across three publicly
+available datasets demonstrates that VCHAR enhances the accuracy of complex
+activity recognition without necessitating precise temporal or sequential
+labeling of atomic activities. Furthermore, user studies confirm that VCHAR's
+explanations are more intelligible compared to existing methods, facilitating a
+broader understanding of complex activity recognition among non-experts.
+
+
+
+
+
+ + ☆ For a semiotic AI: Bridging computer vision and visual semiotics for + computational observation of large scale facial image archives + + +
+ Social networks are creating a digital world in which the cognitive, +emotional, and pragmatic value of the imagery of human faces and bodies is +arguably changing. However, researchers in the digital humanities are often +ill-equipped to study these phenomena at scale. This work presents FRESCO (Face +Representation in E-Societies through Computational Observation), a framework +designed to explore the socio-cultural implications of images on social media +platforms at scale. FRESCO deconstructs images into numerical and categorical +variables using state-of-the-art computer vision techniques, aligning with the +principles of visual semiotics. The framework analyzes images across three +levels: the plastic level, encompassing fundamental visual features like lines +and colors; the figurative level, representing specific entities or concepts; +and the enunciation level, which focuses particularly on constructing the point +of view of the spectator and observer. These levels are analyzed to discern +deeper narrative layers within the imagery. Experimental validation confirms +the reliability and utility of FRESCO, and we assess its consistency and +precision across two public datasets. Subsequently, we introduce the FRESCO +score, a metric derived from the framework's output that serves as a reliable +measure of similarity in image content. + +
+
+
+
+
+ + ☆ A Unified Framework for 3D Scene Understanding + + +
+ We propose UniSeg3D, a unified 3D segmentation framework that achieves +panoptic, semantic, instance, interactive, referring, and open-vocabulary +semantic segmentation tasks within a single model. Most previous 3D +segmentation approaches are specialized for a specific task, thereby limiting +their understanding of 3D scenes to a task-specific perspective. In contrast, +the proposed method unifies six tasks into unified representations processed by +the same Transformer. It facilitates inter-task knowledge sharing and, +therefore, promotes comprehensive 3D scene understanding. To take advantage of +multi-task unification, we enhance the performance by leveraging task +connections. Specifically, we design a knowledge distillation method and a +contrastive learning method to transfer task-specific knowledge across +different tasks. Benefiting from extensive inter-task knowledge sharing, our +UniSeg3D becomes more powerful. Experiments on three benchmarks, including the +ScanNet20, ScanRefer, and ScanNet200, demonstrate that the UniSeg3D +consistently outperforms current SOTA methods, even those specialized for +individual tasks. We hope UniSeg3D can serve as a solid unified baseline and +inspire future work. The code will be available at +https://dk-liang.github.io/UniSeg3D/. + +
+
+ comment: The code will be available at https://dk-liang.github.io/UniSeg3D/ +
+
+
+
+
+ + ☆ ACTRESS: Active Retraining for Semi-supervised Visual Grounding + + +
+ Semi-Supervised Visual Grounding (SSVG) is a new challenge owing to its sparse
+labeled data and the need for multimodal understanding. A previous study,
+RefTeacher, makes the first attempt to tackle this task by adopting the
+teacher-student framework to provide pseudo confidence supervision and
+attention-based supervision. However, this approach is incompatible with
+current state-of-the-art visual grounding models, which follow the
+Transformer-based pipeline. These pipelines directly regress results without
+region proposals or foreground binary classification, rendering them unsuitable
+for fitting in RefTeacher due to the absence of confidence scores. Furthermore,
+the geometric difference in teacher and student inputs, stemming from different
+data augmentations, induces natural misalignment in attention-based
+constraints. To establish a compatible SSVG framework, our paper proposes the
+ACTive REtraining approach for Semi-Supervised Visual Grounding, abbreviated as
+ACTRESS. Initially, the model is enhanced by incorporating an additional
+quantized detection head to expose its detection confidence. Building upon
+this, ACTRESS consists of an active sampling strategy and a selective
+retraining strategy. The active sampling strategy iteratively selects
+high-quality pseudo labels by evaluating three crucial aspects: Faithfulness,
+Robustness, and Confidence, optimizing the utilization of unlabeled data. The
+selective retraining strategy retrains the model with periodic
+re-initialization of specific parameters, facilitating the model's escape from
+local minima. Extensive experiments demonstrate our superior performance on
+widely-used benchmark datasets.
+
+
+
+
+
+ + ☆ Visual Grounding with Attention-Driven Constraint Balancing + + +
+ Unlike Object Detection, Visual Grounding task necessitates the detection of +an object described by complex free-form language. To simultaneously model such +complex semantic and visual representations, recent state-of-the-art studies +adopt transformer-based models to fuse features from both modalities, further +introducing various modules that modulate visual features to align with the +language expressions and eliminate the irrelevant redundant information. +However, their loss function, still adopting common Object Detection losses, +solely governs the bounding box regression output, failing to fully optimize +for the above objectives. To tackle this problem, in this paper, we first +analyze the attention mechanisms of transformer-based models. Building upon +this, we further propose a novel framework named Attention-Driven Constraint +Balancing (AttBalance) to optimize the behavior of visual features within +language-relevant regions. Extensive experimental results show that our method +brings impressive improvements. Specifically, we achieve constant improvements +over five different models evaluated on four different benchmarks. Moreover, we +attain a new state-of-the-art performance by integrating our method into QRNet. + +
+
+
+
+
+ + ☆ Cyclic Refiner: Object-Aware Temporal Representation Learning for + Multi-View 3D Detection and Tracking + + +
+ We propose a unified object-aware temporal learning framework for multi-view +3D detection and tracking tasks. Having observed that the efficacy of the +temporal fusion strategy in recent multi-view perception methods may be +weakened by distractors and background clutters in historical frames, we +propose a cyclic learning mechanism to improve the robustness of multi-view +representation learning. The essence is constructing a backward bridge to +propagate information from model predictions (e.g., object locations and sizes) +to image and BEV features, which forms a circle with regular inference. After +backward refinement, the responses of target-irrelevant regions in historical +frames would be suppressed, decreasing the risk of polluting future frames and +improving the object awareness ability of temporal fusion. We further tailor an +object-aware association strategy for tracking based on the cyclic learning +model. The cyclic learning model not only provides refined features, but also +delivers finer clues (e.g., scale level) for tracklet association. The proposed +cycle learning method and association module together contribute a novel and +unified multi-task framework. Experiments on nuScenes show that the proposed +model achieves consistent performance gains over baselines of different designs +(i.e., dense query-based BEVFormer, sparse query-based SparseBEV and LSS-based +BEVDet4D) on both detection and tracking evaluation. + +
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ☆ Solving the inverse problem of microscopy deconvolution with a residual + Beylkin-Coifman-Rokhlin neural network + + +
+ Optic deconvolution in light microscopy (LM) refers to recovering the object
+details from images, revealing the ground truth of samples. Traditional
+explicit methods in LM rely on the point spread function (PSF) during image
+acquisition. Yet, these approaches often fall short due to inaccurate PSF
+models and noise artifacts, hampering the overall restoration quality. In this
+paper, we approached optic deconvolution as an inverse problem. Motivated
+by the nonstandard-form compression scheme introduced by Beylkin, Coifman, and
+Rokhlin (BCR), we proposed an innovative physics-informed neural network, the
+Multi-Stage Residual-BCR Net (m-rBCR), to approximate the optic deconvolution.
+We validated the m-rBCR model on four microscopy datasets - two simulated
+microscopy datasets from ImageNet and BioSR, real dSTORM microscopy images, and
+real widefield microscopy images. In contrast to the explicit deconvolution
+methods (e.g., Richardson-Lucy) and other state-of-the-art NN models (U-Net,
+DDPM, CARE, DnCNN, ESRGAN, RCAN, Noise2Noise, MPRNet, and MIMO-U-Net), the
+m-rBCR model demonstrates superior performance to the other candidates in terms
+of PSNR and SSIM on the two real microscopy datasets and the simulated BioSR
+dataset. On the simulated ImageNet dataset, m-rBCR ranks second (right after
+MIMO-U-Net). With a backbone grounded in optical physics, m-rBCR uses its
+trainable parameters more effectively (from ~30 times fewer than the benchmark
+MIMO-U-Net to ~210 times fewer than ESRGAN). This enables m-rBCR to achieve
+a shorter runtime (from ~3 times faster than MIMO-U-Net to ~300 times faster
+than DDPM). To summarize, by leveraging physics constraints our model reduced
+potentially redundant parameters significantly in expertise-oriented NN
+candidates and achieved high efficiency with superior performance.
+
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ☆ MHNet: Multi-view High-order Network for Diagnosing Neurodevelopmental + Disorders Using Resting-state fMRI + + +
+ Background: Deep learning models have shown promise in diagnosing +neurodevelopmental disorders (NDD) like ASD and ADHD. However, many models +either use graph neural networks (GNN) to construct single-level brain +functional networks (BFNs) or employ spatial convolution filtering for local +information extraction from rs-fMRI data, often neglecting high-order features +crucial for NDD classification. Methods: We introduce a Multi-view High-order +Network (MHNet) to capture hierarchical and high-order features from multi-view +BFNs derived from rs-fMRI data for NDD prediction. MHNet has two branches: the +Euclidean Space Features Extraction (ESFE) module and the Non-Euclidean Space +Features Extraction (Non-ESFE) module, followed by a Feature Fusion-based +Classification (FFC) module for NDD identification. ESFE includes a Functional +Connectivity Generation (FCG) module and a High-order Convolutional Neural +Network (HCNN) module to extract local and high-order features from BFNs in +Euclidean space. Non-ESFE comprises a Generic Internet-like Brain Hierarchical +Network Generation (G-IBHN-G) module and a High-order Graph Neural Network +(HGNN) module to capture topological and high-order features in non-Euclidean +space. Results: Experiments on three public datasets show that MHNet +outperforms state-of-the-art methods using both AAL1 and Brainnetome Atlas +templates. Extensive ablation studies confirm the superiority of MHNet and the +effectiveness of using multi-view fMRI information and high-order features. Our +study also offers atlas options for constructing more sophisticated +hierarchical networks and explains the association between key brain regions +and NDD. Conclusion: MHNet leverages multi-view feature learning from both +Euclidean and non-Euclidean spaces, incorporating high-order information from +BFNs to enhance NDD classification performance. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Learning Disentangled Representation in Object-Centric Models for Visual + Dynamics Prediction via Transformers + + +
+ Recent work has shown that object-centric representations can greatly help +improve the accuracy of learning dynamics while also bringing interpretability. +In this work, we take this idea one step further and ask the following question: +"can learning disentangled representation further improve the accuracy of +visual dynamics prediction in object-centric models?" While there has been some +attempt to learn such disentangled representations for the case of static +images \citep{nsb}, to the best of our knowledge, ours is the first work which +tries to do this in a general setting for video, without making any specific +assumptions about the kind of attributes that an object might have. The key +building block of our architecture is the notion of a {\em block}, where +several blocks together constitute an object. Each block is represented as a +linear combination of a given number of learnable concept vectors, which is +iteratively refined during the learning process. The blocks in our model are +discovered in an unsupervised manner, by attending over object masks, in a +style similar to discovery of slots \citep{slot_attention}, for learning a +dense object-centric representation. We employ self-attention via transformers +over the discovered blocks to predict the next state, resulting in the discovery of +visual dynamics. We perform a series of experiments on several benchmark 2-D +and 3-D datasets demonstrating that our architecture (1) discovers +semantically meaningful blocks, (2) improves the accuracy of dynamics prediction +compared to SOTA object-centric models, and (3) performs significantly better in the OOD +setting where specific attribute combinations are not seen during +training. Our experiments highlight the importance of discovering disentangled +representations for visual dynamics prediction. + +
&#13;
+
+
+
+
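A minimal PyTorch sketch of the block construction described in the abstract above: a block embedding is expressed as a soft linear combination of a shared dictionary of learnable concept vectors. The names, dimensions, and the softmax-attention choice are illustrative assumptions, not the authors' implementation.

```python
import torch
import torch.nn.functional as F

class ConceptBlock(torch.nn.Module):
    def __init__(self, num_concepts: int = 16, dim: int = 64):
        super().__init__()
        # Dictionary of learnable concept vectors shared across all blocks.
        self.concepts = torch.nn.Parameter(torch.randn(num_concepts, dim))

    def forward(self, block_query: torch.Tensor) -> torch.Tensor:  # (batch, dim)
        # Attention of the block query over the concept dictionary.
        logits = block_query @ self.concepts.t()        # (batch, num_concepts)
        weights = F.softmax(logits, dim=-1)
        # Each block is a linear combination of the concept vectors;
        # in the paper this combination is refined iteratively.
        return weights @ self.concepts                  # (batch, dim)
```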
+ + ☆ Category-Aware Dynamic Label Assignment with High-Quality Oriented + Proposal + + +
+ Objects in aerial images are typically embedded in complex backgrounds and +exhibit arbitrary orientations. When employing oriented bounding boxes (OBB) to +represent arbitrary oriented objects, the periodicity of angles could lead to +discontinuities in label regression values at the boundaries, inducing abrupt +fluctuations in the loss function. To address this problem, an OBB +representation based on the complex plane is introduced in the oriented +detection framework, and a trigonometric loss function is proposed. Moreover, +leveraging prior knowledge of complex background environments and significant +differences in large objects in aerial images, a conformer RPN head is +constructed to predict angle information. The proposed loss function and +conformer RPN head jointly generate high-quality oriented proposals. A +category-aware dynamic label assignment based on predicted category feedback is +proposed to address the limitations of solely relying on IoU for proposal label +assignment. This method makes negative sample selection more representative, +ensuring consistency between classification and regression features. +Experiments were conducted on four realistic oriented detection datasets, and +the results demonstrate superior performance in oriented object detection with +minimal parameter tuning and time costs. Specifically, mean average precision +(mAP) scores of 82.02%, 71.99%, 69.87%, and 98.77% were achieved on the +DOTA-v1.0, DOTA-v1.5, DIOR-R, and HRSC2016 datasets, respectively. + +
+
+
+
+
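Encoding the box angle on the unit circle of the complex plane and scoring it with a trigonometric loss removes the boundary discontinuity mentioned above. The sketch below only illustrates this periodicity-free idea; the exact encoding and loss used in the paper may differ.

```python
import torch

def angle_to_complex(theta: torch.Tensor) -> torch.Tensor:
    # Represent an orientation as a point on the unit circle (cos, sin),
    # so theta and theta + 2*pi map to the same representation.
    return torch.stack([torch.cos(theta), torch.sin(theta)], dim=-1)

def trigonometric_angle_loss(pred_theta: torch.Tensor, gt_theta: torch.Tensor) -> torch.Tensor:
    # Depends only on the angular difference, so the regression target no longer
    # jumps abruptly at the angular boundary.
    return (1.0 - torch.cos(pred_theta - gt_theta)).mean()
```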
+ + ☆ Expressive Gaussian Human Avatars from Monocular RGB Video + + +
+ Nuanced expressiveness, particularly through fine-grained hand and facial +expressions, is pivotal for enhancing the realism and vitality of digital human +representations. In this work, we focus on investigating the expressiveness of +human avatars when learned from monocular RGB video, a setting that introduces +new challenges in capturing and animating fine-grained details. To this end, we +introduce EVA, a drivable human model that meticulously sculpts fine details +based on 3D Gaussians and SMPL-X, an expressive parametric human model. Focused +on enhancing expressiveness, our work makes three key contributions. First, we +highlight the critical importance of aligning the SMPL-X model with RGB frames +for effective avatar learning. Recognizing the limitations of current SMPL-X +prediction methods for in-the-wild videos, we introduce a plug-and-play module +that significantly ameliorates misalignment issues. Second, we propose a +context-aware adaptive density control strategy, which adaptively adjusts +the gradient thresholds to accommodate the varied granularity across body +parts. Last but not least, we develop a feedback mechanism that predicts +per-pixel confidence to better guide the learning of 3D Gaussians. Extensive +experiments on two benchmarks demonstrate the superiority of our framework both +quantitatively and qualitatively, especially on the fine-grained hand and +facial details. See the project website at \url{https://evahuman.github.io} + +
&#13;
+
+
+
+
+ + ☆ SegVG: Transferring Object Bounding Box to Segmentation for Visual + Grounding ECCV 2024 + + +
+ Different from Object Detection, Visual Grounding deals with detecting a +bounding box for each text-image pair. This single box per text-image pair +provides only sparse supervision signals. Although previous works achieve impressive +results, their passive utilization of annotation, i.e. the sole use of the box +annotation as regression ground truth, results in a suboptimal performance. In +this paper, we present SegVG, a novel method that transfers the box-level annotation +into segmentation signals to provide additional pixel-level supervision for +Visual Grounding. Specifically, we propose the Multi-layer Multi-task +Encoder-Decoder as the target grounding stage, where we learn a regression +query and multiple segmentation queries to ground the target by regression and +segmentation of the box in each decoding layer, respectively. This approach +allows us to iteratively exploit the annotation as signals for both box-level +regression and pixel-level segmentation. Moreover, as the backbones are +typically initialized by pretrained parameters learned from unimodal tasks and +the queries for both regression and segmentation are static learnable +embeddings, a domain discrepancy remains among these three types of features, +which impairs subsequent target grounding. To mitigate this discrepancy, we +introduce the Triple Alignment module, where the query, text, and vision tokens +are triangularly updated to share the same space by a triple attention mechanism. +Extensive experiments on five widely used datasets validate our +state-of-the-art (SOTA) performance. + +
&#13;
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ DyFADet: Dynamic Feature Aggregation for Temporal Action Detection ECCV 2024 + + +
+ Recently proposed neural network-based Temporal Action Detection (TAD) models +are inherently limited in extracting discriminative representations and +modeling action instances of various lengths from complex scenes, due to their +shared-weight detection heads. Inspired by the successes in dynamic neural +networks, in this paper, we build a novel dynamic feature aggregation (DFA) +module that can simultaneously adapt kernel weights and receptive fields at +different timestamps. Based on DFA, the proposed dynamic encoder layer +aggregates the temporal features within the action time ranges and guarantees +the discriminability of the extracted representations. Moreover, using DFA +helps to develop a Dynamic TAD head (DyHead), which adaptively aggregates the +multi-scale features with adjusted parameters and learned receptive fields to +better detect action instances of diverse ranges in videos. With the +proposed encoder layer and DyHead, a new dynamic TAD model, DyFADet, achieves +promising performance on a series of challenging TAD benchmarks, including +HACS-Segment, THUMOS14, ActivityNet-1.3, Epic-Kitchen 100, Ego4D-Moment +QueriesV1.0, and FineAction. Code is released at +https://github.com/yangle15/DyFADet-pytorch. + +
&#13;
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Motion meets Attention: Video Motion Prompts + + +
+ Videos contain rich spatio-temporal information. Traditional methods for +extracting motion, used in tasks such as action recognition, often rely on +visual contents rather than precise motion features. This phenomenon is +referred to as 'blind motion extraction' behavior, which proves inefficient in +capturing motions of interest due to a lack of motion-guided cues. Recently, +attention mechanisms have enhanced many computer vision tasks by effectively +highlighting salient visual areas. Inspired by this, we propose using a +modified Sigmoid function with learnable slope and shift parameters as an +attention mechanism to activate and modulate motion signals derived from frame +differencing maps. This approach generates a sequence of attention maps that +enhance the processing of motion-related video content. To ensure temporal +continuity and smoothness of the attention maps, we apply pair-wise temporal +attention variation regularization to remove unwanted motions (e.g., noise) +while preserving important ones. We then perform a Hadamard product between each +pair of attention maps and the original video frames to highlight the evolving +motions of interest over time. These highlighted motions, termed video motion +prompts, are subsequently used as inputs to the model instead of the original +video frames. We formalize this process as a motion prompt layer and +incorporate the regularization term into the loss function to learn better +motion prompts. This layer serves as an adapter between the model and the video +data, bridging the gap between traditional 'blind motion extraction' and the +extraction of relevant motions of interest. + +
&#13;
+
+ comment: Research report +
+
+
+
+
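A rough sketch of the motion prompt layer as described above: frame-difference maps are passed through a sigmoid with a learnable slope and shift, multiplied (Hadamard product) with the frames, and a pairwise temporal-variation term regularizes the attention maps. Tensor shapes and the exact differencing scheme are assumptions for illustration, not the authors' code.

```python
import torch

class MotionPromptLayer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.slope = torch.nn.Parameter(torch.tensor(1.0))  # learnable slope
        self.shift = torch.nn.Parameter(torch.tensor(0.0))  # learnable shift

    def forward(self, frames: torch.Tensor):
        # frames: (B, T, C, H, W), assumed T >= 3
        diff = (frames[:, 1:] - frames[:, :-1]).abs().mean(dim=2, keepdim=True)
        attn = torch.sigmoid(self.slope * (diff - self.shift))   # attention maps
        prompts = attn * frames[:, 1:]                           # Hadamard product
        # Pairwise temporal attention variation regularizer (smoothness term
        # added to the training loss).
        reg = (attn[:, 1:] - attn[:, :-1]).pow(2).mean()
        return prompts, reg
```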
+ + ☆ Relating CNN-Transformer Fusion Network for Change Detection + + +
+ While deep learning, particularly convolutional neural networks (CNNs), has +revolutionized remote sensing (RS) change detection (CD), existing approaches +often miss crucial features due to neglecting global context and incomplete +change learning. Additionally, transformer networks struggle with low-level +details. RCTNet addresses these limitations by introducing \textbf{(1)} an +early fusion backbone to exploit both spatial and temporal features early on, +\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal +representation, \textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for +enriched feature extraction in the decoder, and \textbf{(4)} an Efficient +Self-deciphering Attention (ESA) module utilizing transformers to capture +global information and fine-grained details for accurate change detection. +Extensive experiments demonstrate RCTNet's clear superiority over traditional +RS image CD methods, showing significant improvement and an optimal balance +between accuracy and computational cost. + +
+
+ comment: Accepted by the IEEE International Conference on Multimedia and Expo (ICME)
&#13;
+
+
+
+
+ + ☆ IMC 2024 Methods & Solutions Review + + +
+ For the past three years, Kaggle has been hosting the Image Matching +Challenge, which focuses on solving a 3D image reconstruction problem using a +collection of 2D images. Each year, this competition fosters the development of +innovative and effective methodologies by its participants. In this paper, we +introduce an advanced ensemble technique that we developed, achieving a score +of 0.153449 on the private leaderboard and securing the 160th position out of +over 1,000 participants. Additionally, we conduct a comprehensive review of +existing methods and techniques employed by top-performing teams in the +competition. Our solution, alongside the insights gathered from other leading +approaches, contributes to the ongoing advancement in the field of 3D image +reconstruction. This research provides valuable knowledge for future +participants and researchers aiming to excel in similar image matching and +reconstruction challenges. + +
+
+ comment: 8 Pages, 9 figures +
+
+
+
+
+ + ☆ LivePortrait: Efficient Portrait Animation with Stitching and + Retargeting Control + + +
+ Portrait Animation aims to synthesize a lifelike video from a single source +image, using it as an appearance reference, with motion (i.e., facial +expressions and head pose) derived from a driving video, audio, text, or +generation. Instead of following mainstream diffusion-based methods, we explore +and extend the potential of the implicit-keypoint-based framework, which +effectively balances computational efficiency and controllability. Building +upon this, we develop a video-driven portrait animation framework named +LivePortrait with a focus on better generalization, controllability, and +efficiency for practical usage. To enhance the generation quality and +generalization ability, we scale up the training data to about 69 million +high-quality frames, adopt a mixed image-video training strategy, upgrade the +network architecture, and design better motion transformation and optimization +objectives. Additionally, we discover that compact implicit keypoints can +effectively represent a kind of implicit blendshapes, and we meticulously propose a +stitching module and two retargeting modules, which utilize a small MLP with +negligible computational overhead, to enhance the controllability. Experimental +results demonstrate the efficacy of our framework even compared to +diffusion-based methods. Generation remarkably takes only 12.8 ms on an +RTX 4090 GPU with PyTorch. The inference code and models are available at +https://github.com/KwaiVGI/LivePortrait + +
&#13;
+
+
+
+
+ + ☆ Consistent Point Orientation for Manifold Surfaces via Boundary + Integration + + +
+ This paper introduces a new approach for generating globally consistent +normals for point clouds sampled from manifold surfaces. Given that the +generalized winding number (GWN) field generated by a point cloud with globally +consistent normals is a solution to a PDE with jump boundary conditions and +possesses harmonic properties, and the Dirichlet energy of the GWN field can be +defined as an integral over the boundary surface, we formulate a boundary +energy derived from the Dirichlet energy of the GWN. Taking as input a point +cloud with randomly oriented normals, we optimize this energy to restore the +global harmonicity of the GWN field, thereby recovering the globally consistent +normals. Experiments show that our method outperforms state-of-the-art +approaches, exhibiting enhanced robustness to noise, outliers, complex +topologies, and thin structures. Our code can be found at +\url{https://github.com/liuweizhou319/BIM}. + +
+
+ comment: Accepted to SIGGRAPH 2024
&#13;
+
+
+
+
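For context, the generalized winding number of a query point $q$ with respect to an oriented point cloud $\{(p_i, n_i, a_i)\}$ (points, unit normals, area weights) is commonly approximated as below. This is the standard point-cloud GWN approximation (Barill et al.), restated here as background rather than taken from the paper:

$$ w(q) \;\approx\; \sum_i \frac{a_i \,(p_i - q)\cdot n_i}{4\pi\,\lVert p_i - q\rVert^{3}} $$

Flipping the normal orientations flips the sign of each term, which is why the harmonicity of the resulting field can serve as a signal for recovering globally consistent orientations.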
+ + ☆ Global Context Modeling in YOLOv8 for Pediatric Wrist Fracture Detection + + +
+ Children often suffer wrist injuries in daily life, and when fractures occur, +radiologists usually need to analyze and interpret X-ray images before surgical +treatment by surgeons. The development of deep learning has enabled neural +network models to work as computer-assisted diagnosis (CAD) tools to help +doctors and experts in diagnosis. Since the YOLOv8 model has achieved +satisfactory success in object detection tasks, it has been applied to fracture +detection. The Global Context (GC) block effectively models the global context +in a lightweight way, and incorporating it into YOLOv8 can greatly improve the +model performance. This paper proposes the YOLOv8+GC model for fracture +detection, which is an improved version of the YOLOv8 model with the GC block. +Experimental results demonstrate that compared to the original YOLOv8 model, +the proposed YOLOv8-GC model increases the mean average precision calculated at +an intersection-over-union threshold of 0.5 (mAP 50) from 63.58% to 66.32% on the +GRAZPEDWRI-DX dataset, achieving the state-of-the-art (SOTA) level. The +implementation code for this work is available on GitHub at +https://github.com/RuiyangJu/YOLOv8_Global_Context_Fracture_Detection. + +
&#13;
+
+
+
+
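The Global Context block referenced above follows the GCNet design (Cao et al.): a softmax spatial attention map pools a single global context vector, which is passed through a bottleneck transform and added back to every position. A minimal PyTorch sketch of that block under these assumptions (not the exact YOLOv8+GC integration):

```python
import torch
import torch.nn as nn

class GlobalContextBlock(nn.Module):
    def __init__(self, channels: int, ratio: float = 0.25):
        super().__init__()
        hidden = max(1, int(channels * ratio))
        self.context_mask = nn.Conv2d(channels, 1, kernel_size=1)
        self.transform = nn.Sequential(
            nn.Conv2d(channels, hidden, kernel_size=1),
            nn.LayerNorm([hidden, 1, 1]),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, channels, kernel_size=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # (B, C, H, W)
        b, c, h, w = x.shape
        # Softmax over spatial positions gives a global attention map.
        mask = self.context_mask(x).view(b, 1, h * w).softmax(dim=-1)
        # Weighted pooling of features -> one context vector per image.
        context = torch.bmm(x.view(b, c, h * w), mask.transpose(1, 2))
        context = context.view(b, c, 1, 1)
        # Bottleneck transform, then broadcast-add back to all positions.
        return x + self.transform(context)
```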
+ + ☆ Bunny-VisionPro: Real-Time Bimanual Dexterous Teleoperation for + Imitation Learning + + +
+ Teleoperation is a crucial tool for collecting human demonstrations, but +controlling robots with bimanual dexterous hands remains a challenge. Existing +teleoperation systems struggle to handle the complexity of coordinating two +hands for intricate manipulations. We introduce Bunny-VisionPro, a real-time +bimanual dexterous teleoperation system that leverages a VR headset. Unlike +previous vision-based teleoperation systems, we design novel low-cost devices +to provide haptic feedback to the operator, enhancing immersion. Our system +prioritizes safety by incorporating collision and singularity avoidance while +maintaining real-time performance through innovative designs. Bunny-VisionPro +outperforms prior systems on a standard task suite, achieving higher success +rates and reduced task completion times. Moreover, the high-quality +teleoperation demonstrations improve downstream imitation learning performance, +leading to better generalizability. Notably, Bunny-VisionPro enables imitation +learning with challenging multi-stage, long-horizon dexterous manipulation +tasks, which have rarely been addressed in previous work. Our system's ability +to handle bimanual manipulations while prioritizing safety and real-time +performance makes it a powerful tool for advancing dexterous manipulation and +imitation learning. + +
+
+ comment: project page: https://dingry.github.io/projects/bunny_visionpro.html +
+
+
+
+
+ + ☆ Stereo Risk: A Continuous Modeling Approach to Stereo Matching ICML 2024 + + +
+ We introduce Stereo Risk, a new deep-learning approach to solve the classical +stereo-matching problem in computer vision. Since stereo +matching is well known to boil down to a per-pixel disparity estimation problem, popular +state-of-the-art stereo-matching approaches widely rely on regressing scene +disparity values, yet via a discretization of those values. Such +discretization often fails to capture the nuanced, continuous nature of scene +depth. Stereo Risk departs from the conventional discretization approach by +formulating the scene disparity as an optimal solution to a continuous risk +minimization problem, hence the name "stereo risk". We demonstrate that $L^1$ +minimization of the proposed continuous risk function enhances stereo-matching +performance for deep networks, particularly for disparities with multi-modal +probability distributions. Furthermore, to enable the end-to-end network +training of the non-differentiable $L^1$ risk optimization, we exploit the +implicit function theorem, ensuring a fully differentiable network. A +comprehensive analysis demonstrates our method's theoretical soundness and +superior performance over the state-of-the-art methods across various benchmark +datasets, including KITTI 2012, KITTI 2015, ETH3D, SceneFlow, and Middlebury +2014. + +
&#13;
+
+ comment: Accepted as an Oral Paper at ICML 2024. Draft info: 18 pages, 6 + Figures, 16 Tables
&#13;
+
+
+
+
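To see why an $L^1$ risk behaves differently from the usual expectation-based ($L^2$) disparity regression, the toy sketch below compares the two minimizers over a discrete per-pixel disparity distribution: the $L^2$ risk is minimized by the mean (soft-argmin), the $L^1$ risk by the weighted median, which is more robust to multi-modal distributions. The actual method optimizes a continuous risk and differentiates through it via the implicit function theorem, which this sketch does not attempt.

```python
import torch

def l2_risk_disparity(prob: torch.Tensor, candidates: torch.Tensor) -> torch.Tensor:
    # Expectation over candidate disparities: the minimizer of the L2 risk
    # (standard soft-argmin style regression).
    return (prob * candidates).sum(dim=-1)

def l1_risk_disparity(prob: torch.Tensor, candidates: torch.Tensor) -> torch.Tensor:
    # Weighted median of candidate disparities: the minimizer of the L1 risk,
    # far less sensitive to secondary modes in the distribution.
    cdf = prob.cumsum(dim=-1)
    idx = (cdf >= 0.5).float().argmax(dim=-1)   # first index where CDF >= 0.5
    return candidates[idx]

# prob: (..., D) per-pixel probabilities over D disparity hypotheses
# candidates: (D,) disparity values, e.g. torch.arange(D, dtype=torch.float32)
```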
+ + ☆ Venomancer: Towards Imperceptible and Target-on-Demand Backdoor Attacks + in Federated Learning + + +
+ Federated Learning (FL) is a distributed machine learning approach that +maintains data privacy by training on decentralized data sources. Similar to +centralized machine learning, FL is also susceptible to backdoor attacks. Most +backdoor attacks in FL assume a predefined target class and require control +over a large number of clients or knowledge of benign clients' information. +Furthermore, they are not imperceptible and are easily detected by human +inspection due to clear artifacts left on the poison data. To overcome these +challenges, we propose Venomancer, an effective backdoor attack that is +imperceptible and allows target-on-demand. Specifically, imperceptibility is +achieved by using a visual loss function to make the poison data visually +indistinguishable from the original data. Target-on-demand property allows the +attacker to choose arbitrary target classes via conditional adversarial +training. Additionally, experiments showed that the method is robust against +state-of-the-art defenses such as Norm Clipping, Weak DP, Krum, and Multi-Krum. +The source code is available at +https://anonymous.4open.science/r/Venomancer-3426. + +
+
+
+
+
+ + ☆ Machine Learning Models for Improved Tracking from Range-Doppler Map + Images + + +
+ Statistical tracking filters depend on accurate target measurements and +uncertainty estimates for good tracking performance. In this work, we propose +novel machine learning models for target detection and uncertainty estimation +in range-Doppler map (RDM) images for Ground Moving Target Indicator (GMTI) +radars. We show that by using the outputs of these models, we can significantly +improve the performance of a multiple hypothesis tracker for complex +multi-target air-to-ground tracking scenarios. + +
+
+
+
+
+ + ☆ Towards Efficient Pixel Labeling for Industrial Anomaly Detection and + Localization + + +
+ In the realm of practical Anomaly Detection (AD) tasks, manual labeling of +anomalous pixels proves to be a costly endeavor. Consequently, many AD methods +are crafted as one-class classifiers, tailored for training sets completely +devoid of anomalies, ensuring a more cost-effective approach. While some +pioneering work has demonstrated heightened AD accuracy by incorporating real +anomaly samples in training, this enhancement comes at the price of +labor-intensive labeling processes. This paper strikes the balance between AD +accuracy and labeling expenses by introducing ADClick, a novel Interactive +Image Segmentation (IIS) algorithm. ADClick efficiently generates +"ground-truth" anomaly masks for real defective images, leveraging innovative +residual features and meticulously crafted language prompts. Notably, ADClick +showcases a significantly elevated generalization capacity compared to existing +state-of-the-art IIS approaches. Functioning as an anomaly labeling tool, +ADClick generates high-quality anomaly labels (AP $= 94.1\%$ on MVTec AD) based +on only $3$ to $5$ manual click annotations per training image. Furthermore, we +extend the capabilities of ADClick into ADClick-Seg, an enhanced model designed +for anomaly detection and localization. By fine-tuning the ADClick-Seg model +using the weak labels inferred by ADClick, we establish the state-of-the-art +performances in supervised AD tasks (AP $= 86.4\%$ on MVTec AD and AP $= +78.4\%$, PRO $= 98.6\%$ on KSDD2). + +
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ $L_p$-norm Distortion-Efficient Adversarial Attack + + +
+ Adversarial examples have shown a powerful ability to make a well-trained +model misclassify. Current mainstream adversarial attack methods only +consider one of the distortions among $L_0$-norm, $L_2$-norm, and +$L_\infty$-norm. $L_0$-norm based methods cause large modifications to individual +pixels, making the perturbation visible to the naked eye, while $L_2$-norm and +$L_\infty$-norm based methods suffer from weak robustness against adversarial +defense since they always diffuse tiny perturbations to all pixels. A more +realistic adversarial perturbation should be sparse and imperceptible. In this +paper, we propose a novel $L_p$-norm distortion-efficient adversarial attack, +which not only attains the smallest $L_2$-norm distortion but also significantly reduces +the $L_0$-norm distortion. To this end, we design a new optimization scheme, +which first optimizes an initial adversarial perturbation under an $L_2$-norm +constraint, and then constructs a dimension unimportance matrix for the initial +perturbation. Such a dimension unimportance matrix can indicate the adversarial +unimportance of each dimension of the initial perturbation. Furthermore, we +introduce a new concept of adversarial threshold for the dimension unimportance +matrix. The dimensions of the initial perturbation whose unimportance is higher +than the threshold will all be set to zero, greatly decreasing the $L_0$-norm +distortion. Experimental results on three benchmark datasets show that under +the same query budget, the adversarial examples generated by our method have +lower $L_0$-norm and $L_2$-norm distortion than the state-of-the-art. +Especially for the MNIST dataset, our attack reduces the $L_2$-norm +distortion by 8.1$\%$ while leaving 47$\%$ of pixels unattacked. This demonstrates the +superiority of the proposed method over its competitors in terms of adversarial +robustness and visual imperceptibility. + +
&#13;
+
+
+
+
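The second stage of the scheme described above amounts to thresholding the perturbation with a per-dimension "unimportance" score. A minimal sketch of that step (how the unimportance matrix and the adversarial threshold are constructed follows the paper and is treated as given input here):

```python
import numpy as np

def sparsify_perturbation(delta: np.ndarray,
                          unimportance: np.ndarray,
                          threshold: float) -> np.ndarray:
    # Zero out the perturbation in every dimension whose adversarial
    # unimportance exceeds the threshold; this shrinks the L0 norm while
    # keeping the dimensions that matter most for misclassification.
    sparse_delta = delta.copy()
    sparse_delta[unimportance > threshold] = 0.0
    return sparse_delta
```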
+ + ☆ Anti-Collapse Loss for Deep Metric Learning Based on Coding Rate Metric + + +
+ Deep metric learning (DML) aims to learn a discriminative high-dimensional +embedding space for downstream tasks like classification, clustering, and +retrieval. Prior literature predominantly focuses on pair-based and proxy-based +methods to maximize inter-class discrepancy and minimize intra-class diversity. +However, these methods tend to suffer from the collapse of the embedding space +due to their over-reliance on label information. This leads to sub-optimal +feature representation and inferior model performance. To maintain the +structure of embedding space and avoid feature collapse, we propose a novel +loss function called Anti-Collapse Loss. Specifically, our proposed loss +primarily draws inspiration from the principle of Maximal Coding Rate +Reduction. It promotes the sparseness of feature clusters in the embedding +space to prevent collapse by maximizing the average coding rate of sample +features or class proxies. Moreover, we integrate our proposed loss with +pair-based and proxy-based methods, resulting in notable performance +improvement. Comprehensive experiments on benchmark datasets demonstrate that +our proposed method outperforms existing state-of-the-art methods. Extensive +ablation studies verify the effectiveness of our method in preventing embedding +space collapse and promoting generalization performance. + +
+
+ comment: accepted by IEEE Transactions on Multimedia +
+
+
+
+
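The coding-rate term at the heart of the Anti-Collapse Loss comes from Maximal Coding Rate Reduction. A minimal sketch of that term for a batch of embeddings is shown below; how it is weighted and combined with pair- or proxy-based losses, and whether it is applied to sample features or class proxies, follows the paper and is not reproduced here.

```python
import torch

def coding_rate(Z: torch.Tensor, eps: float = 0.5) -> torch.Tensor:
    # Coding rate of a batch Z of shape (n, d) (features assumed L2-normalized);
    # larger values mean the features span more directions of the space.
    n, d = Z.shape
    identity = torch.eye(d, device=Z.device, dtype=Z.dtype)
    cov = identity + (d / (n * eps ** 2)) * Z.t() @ Z
    return 0.5 * torch.logdet(cov)

def anti_collapse_term(Z: torch.Tensor, eps: float = 0.5) -> torch.Tensor:
    # Maximizing the coding rate = minimizing its negative; adding this term to
    # a standard DML loss discourages the embedding space from collapsing.
    return -coding_rate(Z, eps)
```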
+ + ☆ KeyVideoLLM: Towards Large-scale Video Keyframe Selection + + +
+ Recently, with the rise of web videos, managing and understanding large-scale +video datasets has become increasingly important. Video Large Language Models +(VideoLLMs) have emerged in recent years due to their strong video +understanding capabilities. However, training and inference processes for +VideoLLMs demand vast amounts of data, presenting significant challenges to +data management, particularly regarding efficiency, robustness, and +effectiveness. In this work, we present KeyVideoLLM, a text-video frame +similarity-based keyframe selection method designed to manage VideoLLM data +efficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a +remarkable data compression rate of up to 60.9 times, substantially lowering +disk space requirements, which proves its high efficiency. Additionally, it +maintains a 100% selection success rate across all video formats and scales, +enhances processing speed by up to 200 times compared to existing keyframe +selection methods, and does not require hyperparameter tuning. Beyond its +outstanding efficiency and robustness, KeyVideoLLM further improves model +performance in video question-answering tasks during both training and +inference stages. Notably, it consistently achieved the state-of-the-art (SoTA) +experimental results on diverse datasets. + +
+
+
+
+
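The core selection rule, keeping the frames whose embeddings best match the text, can be sketched in a few lines. The encoder producing the embeddings (e.g., a CLIP-style model) and the choice of k are assumptions for illustration, not details from the paper.

```python
import torch
import torch.nn.functional as F

def select_keyframes(frame_embeds: torch.Tensor,   # (T, D) one embedding per frame
                     text_embed: torch.Tensor,     # (D,)  embedding of the query/caption
                     k: int = 8) -> torch.Tensor:
    # Cosine similarity between the text and every frame, then keep the top-k
    # frames and return their indices in temporal order.
    frame_embeds = F.normalize(frame_embeds, dim=-1)
    text_embed = F.normalize(text_embed, dim=-1)
    sims = frame_embeds @ text_embed                # (T,)
    topk = sims.topk(min(k, sims.numel())).indices
    return topk.sort().values
```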
+ + ☆ Improving Zero-shot Generalization of Learned Prompts via Unsupervised + Knowledge Distillation ECCV24 + + +
+ Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization +to unseen tasks, but fall short of the performance of supervised methods in +generalizing to downstream tasks with limited data. Prompt learning is emerging +as a parameter-efficient method for adapting VLMs, but state-of-the-art +approaches require annotated samples. In this paper we propose a novel approach +to prompt learning based on unsupervised knowledge distillation from more +powerful models. Our approach, which we call Knowledge Distillation Prompt +Learning (KDPL), can be integrated into existing prompt learning techniques and +eliminates the need for labeled examples during adaptation. Our experiments on +more than ten standard benchmark datasets demonstrate that KDPL is very +effective at improving generalization of learned prompts for zero-shot domain +generalization, zero-shot cross-dataset generalization, and zero-shot +base-to-novel class generalization problems. KDPL requires no ground-truth +labels for adaptation, and moreover we show that even in the absence of any +knowledge of training class names it can be used to effectively transfer +knowledge. The code is publicly available at https://github.com/miccunifi/KDPL. + +
+
+ comment: Accepted for publication at ECCV24 +
+
+
+
+
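KDPL distills from a stronger teacher VLM without any labels. The generic temperature-scaled distillation objective below illustrates the idea; the exact objective, teacher model, and logit construction used in KDPL may differ.

```python
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature: float = 2.0):
    # The prompted student matches the teacher's soft image-text similarity
    # distribution; no ground-truth class labels are required.
    t = temperature
    teacher_probs = F.softmax(teacher_logits / t, dim=-1)
    student_log_probs = F.log_softmax(student_logits / t, dim=-1)
    return F.kl_div(student_log_probs, teacher_probs,
                    reduction="batchmean") * t * t
```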
+ + ☆ SlerpFace: Face Template Protection via Spherical Linear Interpolation + + +
+ Contemporary face recognition systems use feature templates extracted from +face images to identify persons. To enhance privacy, face template protection +techniques are widely employed to conceal sensitive identity and appearance +information stored in the template. This paper identifies an emerging privacy +attack form utilizing diffusion models that could nullify prior protection, +referred to as inversion attacks. The attack can synthesize high-quality, +identity-preserving face images from templates, revealing persons' appearance. +Based on studies of the diffusion model's generative capability, this paper +proposes a defense to deteriorate the attack, by rotating templates to a +noise-like distribution. This is achieved efficiently by spherically and +linearly interpolating templates, or slerp, on their located hypersphere. This +paper further proposes to group-wisely divide and drop out templates' feature +dimensions, to enhance the irreversibility of rotated templates. The division +of groups and dropouts within each group are learned in a recognition-favored +way. The proposed techniques are concretized as a novel face template +protection technique, SlerpFace. Extensive experiments show that SlerpFace +provides satisfactory recognition accuracy and comprehensive privacy protection +against inversion and other attack forms, superior to prior arts. + +
+
+ comment: face template protection +
+
+
+
+
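Spherical linear interpolation between two templates on the unit hypersphere is the basic operation behind the protection scheme. A generic slerp sketch is given below; how the second endpoint (a noise-like template), the feature grouping, and the drop-out are chosen follows the paper.

```python
import torch
import torch.nn.functional as F

def slerp(a: torch.Tensor, b: torch.Tensor, t: float) -> torch.Tensor:
    # Spherical linear interpolation between unit-normalized templates a and b:
    # the result stays on the hypersphere and moves along the great circle.
    a = F.normalize(a, dim=-1)
    b = F.normalize(b, dim=-1)
    omega = torch.arccos((a * b).sum(-1).clamp(-1 + 1e-7, 1 - 1e-7))
    so = torch.sin(omega)
    return (torch.sin((1 - t) * omega) / so).unsqueeze(-1) * a \
         + (torch.sin(t * omega) / so).unsqueeze(-1) * b
```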
+ + ☆ Position and Altitude of the Nao Camera Head from Two Points on the + Soccer Field plus the Gravitational Direction + + +
+ To be able to play soccer, a robot needs a good estimate of its current +position on the field. Ideally, multiple features are visible that have known +locations. By applying trigonometry we can estimate the viewpoint from where +this observation was actually made. Given that the Nao robots of the Standard +Platform League have quite a limited field of view, a given camera frame +typically only allows for one or two points to be recognized. + In this paper we propose a method for determining the (x, y) coordinates on +the field and the height h of the camera from the geometry of a simplified +tetrahedron. This configuration is formed by two observed points on the ground +plane plus the gravitational direction. When the distance between the two +points is known, and the directions to the points plus the gravitational +direction are measured, all dimensions of the tetrahedron can be determined. + By performing these calculations with rational trigonometry instead of +classical trigonometry, the computations turn out to be 28.7% faster, with +equal numerical accuracy. The position of the head of the Nao can also be +externally measured with the OptiTrack system. The difference between +externally measured and internally predicted position from sensor data gives us +mean absolute errors in the 3-6 centimeters range, when we estimated the +gravitational direction from the vanishing point of the outer edges of the goal +posts. + +
+
+ comment: to be published in the Proceedings of the RoboCup 2024 symposium - 12 + pages +
+
+
+
+
+ + ☆ SAFT: Towards Out-of-Distribution Generalization in Fine-Tuning + + +
+ Handling distribution shifts from training data, known as out-of-distribution +(OOD) generalization, poses a significant challenge in the field of machine +learning. While a pre-trained vision-language model like CLIP has demonstrated +remarkable zero-shot performance, further adaptation of the model to downstream +tasks leads to undesirable degradation for OOD data. In this work, we introduce +Sparse Adaptation for Fine-Tuning (SAFT), a method that prevents fine-tuning +from forgetting the general knowledge in the pre-trained model. SAFT only +updates a small subset of important parameters whose gradient magnitude is +large, while keeping the other parameters frozen. SAFT is straightforward to +implement and conceptually simple. Extensive experiments show that with only +0.1% of the model parameters, SAFT can significantly improve the performance of +CLIP. It consistently outperforms baseline methods across several benchmarks. +On the few-shot learning benchmark of ImageNet and its variants, SAFT gives a +gain of 5.15% on average over the conventional fine-tuning method in OOD +settings. + +
+
+
+
+
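In the spirit of the description above, sparse fine-tuning can be sketched as: compute gradients on a small calibration batch, keep only the top fraction of parameters by gradient magnitude trainable, and freeze the rest with binary masks applied to the gradients. Function names and the single-batch calibration are illustrative assumptions, not the exact SAFT procedure.

```python
import torch

def build_sparse_masks(model: torch.nn.Module,
                       calibration_loss: torch.Tensor,
                       sparsity: float = 0.001) -> dict:
    # One backward pass on calibration data gives per-parameter gradients;
    # keep roughly `sparsity` of each tensor's entries (largest |grad|) trainable.
    calibration_loss.backward()
    masks = {}
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        k = max(1, int(p.numel() * sparsity))
        thresh = p.grad.abs().flatten().topk(k).values.min()
        masks[name] = (p.grad.abs() >= thresh).float()
    model.zero_grad()
    return masks  # in the training loop, multiply each gradient by masks[name]
```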
+ + ☆ ISWSST: Index-space-wave State Superposition Transformers for + Multispectral Remotely Sensed Imagery Semantic Segmentation + + +
+ Currently, the semantic segmentation task of multispectral remotely sensed +imagery (MSRSI) faces the following problems: 1) usually, only a single domain +feature (i.e., space domain or frequency domain) is considered; 2) the downsampling +operation in the encoder generally leads to accuracy loss in edge extraction; +3) multichannel features of MSRSI are not fully considered; and 4) prior +knowledge of remote sensing is not fully utilized. To solve the aforementioned +issues, an index-space-wave state superposition Transformer (ISWSST) is +proposed, the first of its kind for MSRSI semantic segmentation, drawing inspiration from +quantum mechanics. Its advantages are as follows: 1) index, space and wave +states are superposed or fused to simulate quantum superposition by an adaptive +voting decision (i.e., an ensemble learning idea) to form a stronger classifier +and improve the segmentation accuracy; 2) a lossless wavelet pyramid +encoder-decoder module is designed to losslessly reconstruct the image and simulate +quantum entanglement based on the wavelet transform and inverse wavelet transform, +avoiding edge extraction loss; 3) a combination of multispectral features +(i.e., remote sensing indices and a channel attention mechanism) is proposed to +accurately extract ground objects from original resolution images; and 4) +quantum mechanics is introduced to interpret the underlying superiority of +ISWSST. Experiments validate ISWSST and show that it is superior to +state-of-the-art architectures for the MSRSI segmentation task, improving +the segmentation and edge extraction accuracy effectively. Codes will be +available publicly after our paper is accepted. + +
&#13;
+
+
+
+
+ + ☆ An Organism Starts with a Single Pix-Cell: A Neural Cellular Diffusion + for High-Resolution Image Synthesis MICCAI 2024 + + +
+ Generative modeling seeks to approximate the statistical properties of real +data, enabling synthesis of new data that closely resembles the original +distribution. Generative Adversarial Networks (GANs) and Denoising Diffusion +Probabilistic Models (DDPMs) represent significant advancements in generative +modeling, drawing inspiration from game theory and thermodynamics, +respectively. Nevertheless, the exploration of generative modeling through the +lens of biological evolution remains largely untapped. In this paper, we +introduce a novel family of models termed Generative Cellular Automata (GeCA), +inspired by the evolution of an organism from a single cell. GeCAs are +evaluated as an effective augmentation tool for retinal disease classification +across two imaging modalities: Fundus and Optical Coherence Tomography (OCT). +In the context of OCT imaging, where data is scarce and the distribution of +classes is inherently skewed, GeCA significantly boosts classification performance across 11 +different ophthalmological conditions, achieving a 12% increase in the average +F1 score compared to conventional baselines. GeCAs outperform both diffusion +methods that incorporate a UNet and state-of-the-art variants with +transformer-based denoising models, under similar parameter constraints. Code +is available at: https://github.com/xmed-lab/GeCA. + +
&#13;
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ Context-Aware Video Instance Segmentation + + +
+ In this paper, we introduce the Context-Aware Video Instance Segmentation +(CAVIS), a novel framework designed to enhance instance association by +integrating contextual information adjacent to each object. To efficiently +extract and leverage this information, we propose the Context-Aware Instance +Tracker (CAIT), which merges contextual data surrounding the instances with the +core instance features to improve tracking accuracy. Additionally, we introduce +the Prototypical Cross-frame Contrastive (PCC) loss, which ensures consistency +in object-level features across frames, thereby significantly enhancing +instance matching accuracy. CAVIS demonstrates superior performance over +state-of-the-art methods on all benchmark datasets in video instance +segmentation (VIS) and video panoptic segmentation (VPS). Notably, our method +excels on the OVIS dataset, which is known for its particularly challenging +videos. + +
+
+ comment: Project page: https://seung-hun-lee.github.io/projects/CAVIS/ +
+
+
+
+
+ + ☆ Model Guidance via Explanations Turns Image Classifiers into + Segmentation Models + + +
+ Heatmaps generated on inputs of image classification networks via explainable +AI methods like Grad-CAM and LRP have been observed to resemble segmentations +of input images in many cases. Consequently, heatmaps have also been leveraged +for achieving weakly supervised segmentation with image-level supervision. On +the other hand, losses can be imposed on differentiable heatmaps, which has +been shown to serve for (1) improving heatmaps to be more human-interpretable, +(2) regularizing networks towards better generalization, (3) training +diverse ensembles of networks, and (4) explicitly ignoring confounding +input features. Due to the latter use case, the paradigm of imposing losses on +heatmaps is often referred to as "Right for the right reasons". We unify these +two lines of research by investigating semi-supervised segmentation as a novel +use case for the Right for the Right Reasons paradigm. First, we show formal +parallels between differentiable heatmap architectures and standard +encoder-decoder architectures for image segmentation. Second, we show that such +differentiable heatmap architectures yield competitive results when trained +with standard segmentation losses. Third, we show that such architectures allow +for training with weak supervision in the form of image-level labels and small +numbers of pixel-level labels, outperforming comparable encoder-decoder models. +Code is available at \url{https://github.com/Kainmueller-Lab/TW-autoencoder}. + +
&#13;
+
+
+
+
+ + ☆ Align and Aggregate: Compositional Reasoning with Video Alignment and + Answer Aggregation for Video Question-Answering CVPR + + +
+ Despite the recent progress made in Video Question-Answering (VideoQA), these +methods typically function as black-boxes, making it difficult to understand +their reasoning processes and perform consistent compositional reasoning. To +address these challenges, we propose a \textit{model-agnostic} Video Alignment +and Answer Aggregation (VA$^{3}$) framework, which is capable of enhancing both +compositional consistency and accuracy of existing VidQA methods by integrating +video aligner and answer aggregator modules. The video aligner hierarchically +selects the relevant video clips based on the question, while the answer +aggregator deduces the answer to the question based on its sub-questions, with +compositional consistency ensured by the information flow along the question +decomposition graph and the contrastive learning strategy. We evaluate our +framework on three settings of the AGQA-Decomp dataset with three baseline +methods, and propose new metrics to measure the compositional consistency of +VidQA methods more comprehensively. Moreover, we propose a large language model +(LLM) based automatic question decomposition pipeline to apply our framework to +any VidQA dataset. We extend the MSVD and NExT-QA datasets with it to evaluate our +VA$^3$ framework on broader scenarios. Extensive experiments show that our +framework improves both compositional consistency and accuracy of existing +methods, leading to more interpretable real-world VidQA models. + +
&#13;
+
+ comment: 10 pages, CVPR
&#13;
+
+
+
+
+ + ☆ Frequency-Controlled Diffusion Model for Versatile Text-Guided + Image-to-Image Translation AAAI + + +
+ Recently, large-scale text-to-image (T2I) diffusion models have emerged as a +powerful tool for image-to-image translation (I2I), allowing open-domain image +translation via user-provided text prompts. This paper proposes +frequency-controlled diffusion model (FCDiffusion), an end-to-end +diffusion-based framework that contributes a novel solution to text-guided I2I +from a frequency-domain perspective. At the heart of our framework is a +feature-space frequency-domain filtering module based on Discrete Cosine +Transform, which filters the latent features of the source image in the DCT +domain, yielding filtered image features bearing different DCT spectral bands +as different control signals to the pre-trained Latent Diffusion Model. We +reveal that control signals of different DCT spectral bands bridge the source +image and the T2I generated image in different correlations (e.g., style, +structure, layout, contour, etc.), and thus enable versatile I2I applications +emphasizing different I2I correlations, including style-guided content +creation, image semantic manipulation, image scene translation, and image style +translation. Different from related approaches, FCDiffusion establishes a +unified text-guided I2I framework suitable for diverse image translation tasks +simply by switching among different frequency control branches at inference +time. The effectiveness and superiority of our method for text-guided I2I are +demonstrated with extensive experiments both qualitatively and quantitatively. +The code is publicly available at: https://github.com/XiangGao1102/FCDiffusion. + +
+
+ comment: Proceedings of the 38th AAAI Conference on Artificial Intelligence + (AAAI 2024) +
+
+
+
+
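The frequency-control idea can be illustrated with a simple DCT band filter on a 2D map: keep coefficients inside a chosen normalized-frequency band, zero the rest, and invert. This sketch operates on a single-channel array with SciPy's DCT; FCDiffusion applies its filtering to latent features of the diffusion model, which this toy version does not reproduce.

```python
import numpy as np
from scipy.fft import dctn, idctn

def dct_band_filter(feat: np.ndarray, low: float, high: float) -> np.ndarray:
    # feat: 2D array (H, W). Keep DCT coefficients whose normalized frequency
    # radius lies in [low, high); low bands carry layout/structure, high bands
    # carry texture/detail, loosely mirroring the frequency-control branches.
    coeffs = dctn(feat, norm="ortho")
    h, w = feat.shape
    yy, xx = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
    radius = np.sqrt((yy / h) ** 2 + (xx / w) ** 2)
    mask = (radius >= low) & (radius < high)
    return idctn(coeffs * mask, norm="ortho")
```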
+ + ☆ VIVA: A Benchmark for Vision-Grounded Decision-Making with Human Values + + +
+ This paper introduces VIVA, a benchmark for VIsion-grounded decision-making +driven by human VAlues. While most large vision-language models (VLMs) focus on +physical-level skills, our work is the first to examine their multimodal +capabilities in leveraging human values to make decisions under a +vision-depicted situation. VIVA contains 1,062 images depicting diverse +real-world situations and the manually annotated decisions grounded in them. +Given an image, the model should select the most appropriate action to +address the situation and provide the relevant human values and the reasoning +underlying the decision. Extensive experiments based on VIVA show the +limitations of VLMs in using human values to make multimodal decisions. Further +analyses indicate the potential benefits of exploiting action consequences and +predicted human values. + +
&#13;
+
+
+
+
+ + ☆ Graph and Skipped Transformer: Exploiting Spatial and Temporal Modeling + Capacities for Efficient 3D Human Pose Estimation + + +
+ In recent years, 2D-to-3D pose uplifting in monocular 3D Human Pose +Estimation (HPE) has attracted widespread research interest. GNN-based methods +and Transformer-based methods have become mainstream architectures due to their +advanced spatial and temporal feature learning capacities. However, existing +approaches typically construct joint-wise and frame-wise attention alignments +in spatial and temporal domains, resulting in dense connections that introduce +considerable local redundancy and computational overhead. In this paper, we +take a global approach to exploit spatio-temporal information and realise +efficient 3D HPE with a concise Graph and Skipped Transformer architecture. +Specifically, in the Spatial Encoding stage, coarse-grained body parts are deployed +to construct a Spatial Graph Network with a fully data-driven adaptive topology, +ensuring model flexibility and generalizability across various poses. In the +Temporal Encoding and Decoding stages, a simple yet effective Skipped +Transformer is proposed to capture long-range temporal dependencies and +implement hierarchical feature aggregation. A straightforward Data Rolling +strategy is also developed to introduce dynamic information into the 2D pose +sequence. Extensive experiments are conducted on the Human3.6M, MPI-INF-3DHP and +HumanEva benchmarks. G-SFormer series methods achieve superior performance +compared with previous state-of-the-art methods with only around ten percent of the +parameters and significantly reduced computational complexity. Additionally, +G-SFormer also exhibits outstanding robustness to inaccuracies in detected 2D +poses. + +
&#13;
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ YOLOv5, YOLOv8 and YOLOv10: The Go-To Detectors for Real-time Vision + + +
+ This paper presents a comprehensive review of the evolution of the YOLO (You +Only Look Once) object detection algorithm, focusing on YOLOv5, YOLOv8, and +YOLOv10. We analyze the architectural advancements, performance improvements, +and suitability for edge deployment across these versions. YOLOv5 introduced +significant innovations such as the CSPDarknet backbone and Mosaic +Augmentation, balancing speed and accuracy. YOLOv8 built upon this foundation +with enhanced feature extraction and anchor-free detection, improving +versatility and performance. YOLOv10 represents a leap forward with NMS-free +training, spatial-channel decoupled downsampling, and large-kernel +convolutions, achieving state-of-the-art performance with reduced computational +overhead. Our findings highlight the progressive enhancements in accuracy, +efficiency, and real-time performance, particularly emphasizing their +applicability in resource-constrained environments. This review provides +insights into the trade-offs between model complexity and detection accuracy, +offering guidance for selecting the most appropriate YOLO version for specific +edge computing applications. + +
+
+
+
+
+ + ☆ IM-MoCo: Self-supervised MRI Motion Correction using Motion-Guided + Implicit Neural Representations MICCAI 2024 + + +
+ Motion artifacts in Magnetic Resonance Imaging (MRI) arise due to relatively +long acquisition times and can compromise the clinical utility of acquired +images. Traditional motion correction methods often fail to address severe +motion, leading to distorted and unreliable results. Deep Learning (DL) +alleviates such pitfalls through generalization, at the cost of vanishing +structures and hallucinations, making it challenging to apply in the medical +field where hallucinated structures can tremendously impact the diagnostic +outcome. In this work, we present an instance-wise motion correction pipeline +that leverages motion-guided Implicit Neural Representations (INRs) to mitigate +the impact of motion artifacts while retaining anatomical structure. Our method +is evaluated using the NYU fastMRI dataset with different degrees of simulated +motion severity. For the correction alone, we can improve over state-of-the-art +image reconstruction methods by $+5\%$ SSIM, $+5$ dB PSNR, and $+14\%$ +HaarPSI. Clinical relevance is demonstrated by a subsequent experiment, where +our method improves classification outcomes by at least $+1.5$ accuracy +percentage points compared to motion-corrupted images. + +
&#13;
+
+ comment: Submitted to MICCAI 2024 (Before peer review version) +
+
+
+
+
+ + ☆ Unified Anomaly Detection methods on Edge Device using Knowledge + Distillation and Quantization + + +
+ With the rapid advances in deep learning and smart manufacturing in Industry +4.0, there is an imperative for high-throughput, high-performance, and fully +integrated visual inspection systems. Most anomaly detection approaches using +defect detection datasets, such as MVTec AD, employ one-class models that +require fitting separate models for each class. On the contrary, unified models +eliminate the need for fitting separate models for each class and significantly +reduce cost and memory requirements. Thus, in this work, we experiment with +a unified multi-class setup. Our experimental study shows that +multi-class models perform on par with one-class models for the standard MVTec +AD dataset. Hence, this indicates that there may not be a need to learn +separate object/class-wise models when the object classes are significantly +different from each other, as is the case for the dataset considered. +Furthermore, we have deployed three different unified lightweight architectures +on the CPU and an edge device (NVIDIA Jetson Xavier NX). We analyze the +quantized multi-class anomaly detection models in terms of latency and memory +requirements for deployment on the edge device while comparing +quantization-aware training (QAT) and post-training quantization (PTQ) for +performance at different precision widths. In addition, we explored two +different methods of calibration required in post-training scenarios and show +that one of them performs notably better, highlighting its importance for +unsupervised tasks. The performance drop due to quantization in PTQ is further +compensated by QAT, which yields performance on par with the original 32-bit +floating point in two of the models considered. + +
&#13;
+
+ comment: 20 pages +
+
+
+
+
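A minimal eager-mode post-training quantization sketch with PyTorch's torch.ao.quantization API, in the spirit of the PTQ experiments above. It assumes the model already contains QuantStub/DeQuantStub placements and is fusion-ready; the observer/calibration choice is exactly where the differences noted in the abstract come from.

```python
import torch
from torch.ao.quantization import get_default_qconfig, prepare, convert

def post_training_quantize(model: torch.nn.Module, calibration_loader):
    # 1) Attach observers, 2) run calibration data to collect activation
    # statistics, 3) convert weights/activations to int8.
    model.eval()
    model.qconfig = get_default_qconfig("fbgemm")  # x86 backend; use "qnnpack" on ARM
    prepared = prepare(model)
    with torch.no_grad():
        for images, _ in calibration_loader:
            prepared(images)
    return convert(prepared)
```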
+ + ☆ 3D Multimodal Image Registration for Plant Phenotyping + + +
+ The use of multiple camera technologies in a combined multimodal monitoring +system for plant phenotyping offers promising benefits. Compared to +configurations that only utilize a single camera technology, cross-modal +patterns can be recorded that allow a more comprehensive assessment of plant +phenotypes. However, the effective utilization of cross-modal patterns is +dependent on precise image registration to achieve pixel-accurate alignment, a +challenge often complicated by parallax and occlusion effects inherent in plant +canopy imaging. + In this study, we propose a novel multimodal 3D image registration method +that addresses these challenges by integrating depth information from a +time-of-flight camera into the registration process. By leveraging depth data, +our method mitigates parallax effects and thus facilitates more accurate pixel +alignment across camera modalities. Additionally, we introduce an automated +mechanism to identify and differentiate different types of occlusions, thereby +minimizing the introduction of registration errors. + To evaluate the efficacy of our approach, we conduct experiments on a diverse +image dataset comprising six distinct plant species with varying leaf +geometries. Our results demonstrate the robustness of the proposed registration +algorithm, showcasing its ability to achieve accurate alignment across +different plant types and camera compositions. Compared to previous methods it +is not reliant on detecting plant specific image features and can thereby be +utilized for a wide variety of applications in plant sciences. The registration +approach principally scales to arbitrary numbers of cameras with different +resolutions and wavelengths. Overall, our study contributes to advancing the +field of plant phenotyping by offering a robust and reliable solution for +multimodal image registration. + +
+
+ comment: 53 pages, 13 Figures, preprint submitted to Computers and Electronics + in Agriculture +
+
+
+
+
+ + ☆ VEGS: View Extrapolation of Urban Scenes in 3D Gaussian Splatting using + Learned Priors + + +
+ Neural rendering-based urban scene reconstruction methods commonly rely on +images collected from driving vehicles with cameras facing and moving forward. +Although these methods can successfully synthesize from views similar to +training camera trajectory, directing the novel view outside the training +camera distribution does not guarantee on-par performance. In this paper, we +tackle the Extrapolated View Synthesis (EVS) problem by evaluating the +reconstructions on views such as looking left, right or downwards with respect +to training camera distributions. To improve rendering quality for EVS, we +initialize our model by constructing dense LiDAR map, and propose to leverage +prior scene knowledge such as surface normal estimator and large-scale +diffusion model. Qualitative and quantitative comparisons demonstrate the +effectiveness of our methods on EVS. To the best of our knowledge, we are the +first to address the EVS problem in urban scene reconstruction. Link to our +project page: https://vegs3d.github.io/. + +
+
+
+
+
+ + ☆ Recompression Based JPEG Tamper Detection and Localization Using Deep + Neural Network Eliminating Compression Factor Dependency + + +
+ In this work, we deal with the problem of recompression based image forgery +detection, where some regions of an image are modified illegitimately, hence +giving rise to the presence of dual compression characteristics within a single +image. There has been significant research in this direction in the +last decade. However, almost all existing techniques fail to detect this form +of forgery when the first compression factor is greater than the second. We +address this problem of recompression based forgery detection here. Recently, +machine learning techniques have started gaining a lot of importance in the +domain of digital image forensics. In this work, we propose a Convolutional +Neural Network based deep learning architecture, which is capable of detecting +the presence of recompression based forgery in JPEG images. The proposed +architecture works equally efficiently, even in cases where the first +compression ratio is greater than the second. In this work, we also aim to +localize the regions of image manipulation based on recompression features, +using the trained neural network. Our experimental results prove that the +proposed method outperforms the state of the art, with respect to forgery +detection and localization accuracy. + +
&#13;
+
+ comment: 24 pages, conference +
+
+
+
+
+ + ☆ PosMLP-Video: Spatial and Temporal Relative Position Encoding for + Efficient Video Recognition + + +
+ In recent years, vision Transformers and MLPs have demonstrated remarkable +performance in image understanding tasks. However, their inherently dense +computational operators, such as self-attention and token-mixing layers, pose +significant challenges when applied to spatio-temporal video data. To address +this gap, we propose PosMLP-Video, a lightweight yet powerful MLP-like backbone +for video recognition. Instead of dense operators, we use efficient relative +positional encoding (RPE) to build pairwise token relations, leveraging +small-sized parameterized relative position biases to obtain each relation +score. Specifically, to enable spatio-temporal modeling, we extend the image +PosMLP's positional gating unit to temporal, spatial, and spatio-temporal +variants, namely PoTGU, PoSGU, and PoSTGU, respectively. These gating units can +be feasibly combined into three types of spatio-temporal factorized positional +MLP blocks, which not only decrease model complexity but also maintain good +performance. Additionally, we enrich relative positional relationships by using +channel grouping. Experimental results on three video-related tasks demonstrate +that PosMLP-Video achieves competitive speed-accuracy trade-offs compared to +the previous state-of-the-art models. In particular, PosMLP-Video pre-trained +on ImageNet1K achieves 59.0%/70.3% top-1 accuracy on Something-Something V1/V2 +and 82.1% top-1 accuracy on Kinetics-400 while requiring much fewer parameters +and FLOPs than other models. The code is released at +https://github.com/zhouds1918/PosMLP_Video. + +
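+ To make the positional gating idea above concrete, the sketch below builds a token-mixing matrix purely from a table of learned relative-position biases and uses it to gate half of the channels, in the spirit of a spatial/positional gating unit. The normalization choice, the absence of grouping/heads, and all names are illustrative assumptions rather than the released PosMLP-Video code.

```python
import torch
import torch.nn as nn

class PositionalGatingUnit(nn.Module):
    """Gate half of the channels with a token-mixing matrix built purely from
    learned relative-position biases (no content-based attention). A linear
    projection back to the full channel width would typically follow."""
    def __init__(self, dim, seq_len):
        super().__init__()
        assert dim % 2 == 0
        self.norm = nn.LayerNorm(dim // 2)
        # One learnable bias per relative offset in [-(L-1), L-1].
        self.rel_bias = nn.Parameter(torch.zeros(2 * seq_len - 1))
        idx = torch.arange(seq_len)
        self.register_buffer("rel_idx", idx[None, :] - idx[:, None] + seq_len - 1)

    def forward(self, x):                      # x: (B, L, dim)
        u, v = x.chunk(2, dim=-1)              # split channels
        w = self.rel_bias[self.rel_idx]        # (L, L) relative-position weights
        mix = torch.softmax(w, dim=-1)         # one way to normalise the mixing weights
        v = torch.einsum("ij,bjd->bid", mix, self.norm(v))
        return u * v                           # positional gating
```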
+
+
+
+
+ + ☆ Explainable vertebral fracture analysis with uncertainty estimation + using differentiable rule-based classification MICCAI 2024 + + +
+ We present a novel method for explainable vertebral fracture assessment +(XVFA) in low-dose radiographs using deep neural networks, incorporating +vertebra detection and keypoint localization with uncertainty estimates. We +incorporate Genant's semi-quantitative criteria as a differentiable rule-based +means of classifying both vertebra fracture grade and morphology. Unlike +previous work, XVFA provides explainable classifications relatable to current +clinical methodology, as well as uncertainty estimations, while at the same +time surpassing state-of-the-art methods with a vertebra-level sensitivity of +93% and an end-to-end AUC of 97% in a challenging setting. Moreover, we compare +intra-reader agreement with model uncertainty estimates, with model reliability +on par with human annotators. + +
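+ For readers unfamiliar with Genant's semi-quantitative criteria, the fracture grade is derived from the relative loss of anterior, middle, or posterior vertebral body height (roughly 20-25% for grade 1, 25-40% for grade 2, above 40% for grade 3). Below is a hedged sketch of how such cut-offs can be made differentiable from predicted keypoint heights; the thresholds follow the published criteria, but the soft-thresholding scheme and all names are illustrative, not the authors' exact rule set.

```python
import torch

def soft_genant_grade(heights, temperature=0.02):
    """heights: (..., 3) anterior/middle/posterior heights derived from keypoints.

    Height loss is measured against the largest of the three heights; the usual
    Genant cut-offs (20%, 25%, 40%) are applied with sigmoids so the grade
    probabilities stay differentiable w.r.t. the keypoint coordinates.
    """
    ref = heights.max(dim=-1, keepdim=True).values
    loss = 1.0 - heights / ref                  # relative height loss per column
    worst = loss.max(dim=-1).values             # most collapsed column

    def soft_ge(x, thr):                        # smooth indicator of x >= thr
        return torch.sigmoid((x - thr) / temperature)

    g1, g2, g3 = soft_ge(worst, 0.20), soft_ge(worst, 0.25), soft_ge(worst, 0.40)
    probs = torch.stack([1 - g1, g1 - g2, g2 - g3, g3], dim=-1)  # grades 0..3
    return probs
```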
+
+ comment: To be published in MICCAI 2024 conference proceedings +
+
+
+
+
+ + ☆ EgoFlowNet: Non-Rigid Scene Flow from Point Clouds with Ego-Motion + Support BMVC2023 + + +
+ Recent weakly-supervised methods for scene flow estimation from LiDAR point +clouds are limited to explicit reasoning at the object level. These methods +perform multiple iterative optimizations for each rigid object, which makes +them dependent on the robustness of the clustering. In this paper, we propose +our EgoFlowNet - a point-level scene flow estimation network trained in a +weakly-supervised manner and without object-based abstraction. Our approach +predicts a binary segmentation mask that implicitly drives two parallel +branches for ego-motion and scene flow. Unlike previous methods, we provide +both branches with all input points and carefully integrate the binary mask +into the feature extraction and losses. We also use a shared cost volume with +local refinement that is updated at multiple scales without explicit clustering +or rigidity assumptions. On realistic KITTI scenes, we show that our EgoFlowNet +performs better than state-of-the-art methods in the presence of ground surface +points. + +
+
+ comment: This paper is published in BMVC2023 (pp. 441-443) +
+
+
+
+
+ + ☆ Free-SurGS: SfM-Free 3D Gaussian Splatting for Surgical Scene + Reconstruction MICCAI 2024 + + +
+ Real-time 3D reconstruction of surgical scenes plays a vital role in +computer-assisted surgery, holding promise to enhance surgeons' visibility. +Recent advancements in 3D Gaussian Splatting (3DGS) have shown great potential +for real-time novel view synthesis of general scenes, which relies on accurate +poses and point clouds generated by Structure-from-Motion (SfM) for +initialization. However, 3DGS with SfM fails to recover accurate camera poses +and geometry in surgical scenes due to the challenges of minimal textures and +photometric inconsistencies. To tackle this problem, in this paper, we propose +the first SfM-free 3DGS-based method for surgical scene reconstruction by +jointly optimizing the camera poses and scene representation. Based on video +continuity, the key to our method is to exploit the immediate optical flow +priors to guide the projection flow derived from 3D Gaussians. Unlike most +previous methods relying on photometric loss only, we formulate the pose +estimation problem as minimizing the flow loss between the projection flow and +optical flow. A consistency check is further introduced to filter the flow +outliers by detecting the rigid and reliable points that satisfy the epipolar +geometry. During 3D Gaussian optimization, we randomly sample frames to +optimize the scene representation and grow the 3D Gaussians progressively. +Experiments on the SCARED dataset demonstrate our superior performance over +existing methods in novel view synthesis and pose estimation with high +efficiency. Code is available at https://github.com/wrld/Free-SurGS. + +
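+ A minimal sketch of the flow-based pose objective described above: project the 3D Gaussian centers with the previous and current camera poses, take the induced projection flow, and penalize its deviation from a precomputed optical flow on points that pass the consistency check. The pinhole projection and loss form are standard; the function names and the weighting scheme are illustrative assumptions, not the released implementation.

```python
import torch

def project(points, K, R, t):
    """Project (N, 3) world points with a pinhole camera (K: 3x3, R: 3x3, t: 3,)."""
    cam = points @ R.T + t
    uv = cam @ K.T
    return uv[:, :2] / uv[:, 2:].clamp(min=1e-6)

def flow_loss(points, K, pose_prev, pose_cur, optical_flow, mask):
    """Compare the projection flow of 3D Gaussian centers with optical flow.

    pose_prev, pose_cur : (R, t) tuples for the two frames
    optical_flow        : (N, 2) optical flow sampled at the previous-frame projections
    mask                : (N,) inlier weights from an epipolar consistency check
    """
    uv_cur = project(points, K, *pose_cur)
    uv_prev = project(points, K, *pose_prev)
    projection_flow = uv_cur - uv_prev
    return (mask[:, None] * (projection_flow - optical_flow).abs()).mean()
```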
+
+ comment: Accepted to MICCAI 2024 +
+
+
+
+
+ + ☆ Non-Adversarial Learning: Vector-Quantized Common Latent Space for + Multi-Sequence MRI + + +
+ Adversarial learning helps generative models translate MRI from source to +target sequence when lacking paired samples. However, implementing MRI +synthesis with adversarial learning in clinical settings is challenging due to +training instability and mode collapse. To address this issue, we leverage +intermediate sequences to estimate the common latent space among multi-sequence +MRI, enabling the reconstruction of distinct sequences from the common latent +space. We propose a generative model that compresses discrete representations +of each sequence to estimate the Gaussian distribution of the vector-quantized +common (VQC) latent space between multiple sequences. Moreover, we improve the +latent space consistency with contrastive learning and increase model stability +through domain augmentation. Experiments using the BraTS2021 dataset show that +our non-adversarial model outperforms other GAN-based methods, and the VQC +latent space helps our model achieve (1) anti-interference ability, which can +eliminate the effects of noise, bias fields, and artifacts, and (2) solid +semantic representation ability, with the potential for one-shot segmentation. +Our code is publicly available. + +
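+ The vector-quantization step behind a VQC latent space can be summarized with the usual nearest-codebook lookup plus the straight-through estimator, sketched below. This is the generic VQ-VAE mechanism the abstract builds on, with illustrative names, not the authors' full multi-sequence model.

```python
import torch
import torch.nn.functional as F

def vector_quantize(z, codebook, beta=0.25):
    """z: (B, D) encoder features, codebook: (K, D) learnable code vectors.
    Returns the quantized features (with straight-through gradients), the
    selected code indices, and the VQ/commitment loss."""
    d = torch.cdist(z, codebook)             # (B, K) distances to all codes
    idx = d.argmin(dim=1)                    # nearest code per feature
    z_q = codebook[idx]
    loss = F.mse_loss(z_q, z.detach()) + beta * F.mse_loss(z, z_q.detach())
    z_q = z + (z_q - z).detach()             # straight-through gradient to encoder
    return z_q, idx, loss
```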
+
+
+
+
+ + ☆ Domain-independent detection of known anomalies CVPR 2024 + + +
+ One persistent obstacle in industrial quality inspection is the detection of +anomalies. In real-world use cases, two problems must be addressed: anomalous +data is sparse and the same types of anomalies need to be detected on +previously unseen objects. Current anomaly detection approaches can be trained +with sparse nominal data, whereas domain generalization approaches enable +detecting objects in previously unseen domains. Utilizing those two +observations, we introduce the hybrid task of domain generalization on sparse +classes. To introduce an accompanying dataset for this task, we present a +modification of the well-established MVTec AD dataset by generating three new +datasets. In addition to applying existing methods for benchmark, we design two +embedding-based approaches, Spatial Embedding MLP (SEMLP) and Labeled +PatchCore. Overall, SEMLP achieves the best performance with an average +image-level AUROC of 87.2 % vs. 80.4 % by MIRO. The new and openly available +datasets allow for further research to improve industrial anomaly detection. + +
+
+ comment: Accepted as extended abstract in CVPR 2024 workshop VAND 2.0 +
+
+
+
+
+ + ☆ Single Image Rolling Shutter Removal with Diffusion Models + + +
+ We present RS-Diffusion, the first Diffusion Models-based method for +single-frame Rolling Shutter (RS) correction. RS artifacts compromise the +visual quality of frames due to the row-wise exposure of CMOS sensors. Most +previous methods have focused on multi-frame approaches, using temporal +information from consecutive frames for motion rectification. However, few +approaches address the more challenging but important single-frame RS +correction. In this work, we present an ``image-to-motion'' framework via +diffusion techniques, with a designed patch-attention module. In addition, we +present the RS-Real dataset, comprising captured RS frames alongside their +corresponding Global Shutter (GS) ground-truth pairs. The GS frames are +corrected from the RS ones, guided by the corresponding Inertial Measurement +Unit (IMU) gyroscope data acquired during capture. Experiments show that our +RS-Diffusion surpasses previous single-frame RS correction methods. Our method +and the proposed RS-Real dataset lay a solid foundation for advancing the field +of RS correction. + +
+
+
+
+
+ + ☆ Self-supervised Vision Transformer are Scalable Generative Models for + Domain Generalization MICCAI 2024 + + +
+ Despite notable advancements, the integration of deep learning (DL) +techniques into impactful clinical applications, particularly in the realm of +digital histopathology, has been hindered by challenges associated with +achieving robust generalization across diverse imaging domains and +characteristics. Traditional mitigation strategies in this field such as data +augmentation and stain color normalization have proven insufficient in +addressing this limitation, necessitating the exploration of alternative +methodologies. To this end, we propose a novel generative method for domain +generalization in histopathology images. Our method employs a generative, +self-supervised Vision Transformer to dynamically extract characteristics of +image patches and seamlessly infuse them into the original images, thereby +creating novel, synthetic images with diverse attributes. By enriching the +dataset with such synthesized images, we aim to enhance its holistic nature, +facilitating improved generalization of DL models to unseen domains. Extensive +experiments conducted on two distinct histopathology datasets demonstrate the +effectiveness of our proposed approach, outperforming the state of the art +substantially, on the Camelyon17-wilds challenge dataset (+2%) and on a second +epithelium-stroma dataset (+26%). Furthermore, we emphasize our method's +ability to readily scale with increasingly available unlabeled data samples and +more complex, higher parametric architectures. Source code is available at +https://github.com/sdoerrich97/vits-are-generative-models . + +
+
+ comment: Accepted at MICCAI 2024. This is the submitted manuscript with added + link to github repo and funding acknowledgements. No further post submission + improvements or corrections were integrated. Final version not published yet +
+
+
+
+
+ + ☆ An Uncertainty-guided Tiered Self-training Framework for Active + Source-free Domain Adaptation in Prostate Segmentation MICCAI 2024 + + +
+ Deep learning models have exhibited remarkable efficacy in accurately +delineating the prostate for diagnosis and treatment of prostate diseases, but +challenges persist in achieving robust generalization across different medical +centers. Source-free Domain Adaptation (SFDA) is a promising technique to adapt +deep segmentation models to address privacy and security concerns while +reducing domain shifts between source and target domains. However, recent +literature indicates that the performance of SFDA remains far from satisfactory +due to unpredictable domain gaps. Annotating a few target domain samples is +acceptable, as it can lead to significant performance improvement with a low +annotation cost. Nevertheless, due to extremely limited annotation budgets, +careful consideration is needed in selecting samples for annotation. Inspired +by this, our goal is to develop Active Source-free Domain Adaptation (ASFDA) +for medical image segmentation. Specifically, we propose a novel +Uncertainty-guided Tiered Self-training (UGTST) framework that couples +efficient active sample selection, using entropy-based primary local peak +filtering to aggregate global uncertainty together with a diversity-aware +redundancy filter, with a tiered self-learning strategy to achieve stable +domain adaptation. Experimental results on cross-center prostate MRI +segmentation datasets reveal that our method yields marked advancements with a +mere 5% of annotations, exhibiting average Dice score improvements of 9.78% and +7.58% in two target domains compared with state-of-the-art methods, on par with +fully supervised learning. Code is available at: +https://github.com/HiLab-git/UGTST + +
+
+ comment: 11 pages, 3 figures, 2 tables, accept to MICCAI 2024 +
+
+
+
+
+ + ☆ Explicitly Guided Information Interaction Network for Cross-modal Point + Cloud Completion ECCV 2024 + + +
+ In this paper, we explore a novel framework, EGIInet (Explicitly Guided +Information Interaction Network), a model for the View-guided Point cloud +Completion (ViPC) task, which aims to restore a complete point cloud from a +partial one with a single-view image. In comparison with previous methods that +rely on the global semantics of input images, EGIInet efficiently combines the +information from two modalities by leveraging the geometric nature of the +completion task. Specifically, we propose an explicitly guided information +interaction strategy supported by modal alignment for point cloud completion. +First, in contrast to previous methods which simply use 2D and 3D backbones to +encode features respectively, we unify the encoding process to promote modal +alignment. Second, we propose a novel explicitly guided information interaction +strategy that helps the network identify critical information within images, +thus achieving better guidance for completion. Extensive experiments +demonstrate the effectiveness of our framework, and we achieve a new state of +the art (+16\% CD over XMFnet) on benchmark datasets despite using fewer +parameters than previous methods. The pre-trained model and code are available +at https://github.com/WHU-USI3DV/EGIInet. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ ShiftAddAug: Augment Multiplication-Free Tiny Neural Network with Hybrid + Computation CVPR + + +
+ Operators devoid of multiplication, such as Shift and Add, have gained +prominence for their compatibility with hardware. However, neural networks +(NNs) employing these operators typically exhibit lower accuracy compared to +conventional NNs with identical structures. ShiftAddAug uses costly +multiplication to augment efficient but less powerful multiplication-free +operators, improving performance without any inference overhead. It puts a +ShiftAdd tiny NN into a large multiplicative model and encourages it to be +trained as a sub-model to obtain additional supervision. To solve the weight +discrepancy problem between hybrid operators, a new weight sharing method is +proposed. Additionally, a novel two-stage neural architecture search is used to +obtain better augmentation effects for smaller but stronger multiplication-free +tiny neural networks. The superiority of ShiftAddAug is validated through +experiments in image classification and semantic segmentation, consistently +delivering noteworthy enhancements. Remarkably, it secures up to a 4.95% +increase in accuracy on CIFAR-100 compared to its directly trained +counterparts, even surpassing the performance of multiplicative NNs. + +
+
+ comment: Accepted by 2024 CVPR Workshop : Efficient Deep Learning for Computer + Vision +
+
+
+
+
+ + ☆ Knowledge Composition using Task Vectors with Learned Anisotropic + Scaling + + +
+ Pre-trained models produce strong generic representations that can be adapted +via fine-tuning. The learned weight difference relative to the pre-trained +model, known as a task vector, characterises the direction and stride of +fine-tuning. The significance of task vectors is such that simple arithmetic +operations on them can be used to combine diverse representations from +different domains. This paper builds on these properties of task vectors and +aims to answer (1) whether components of task vectors, particularly parameter +blocks, exhibit similar characteristics, and (2) how such blocks can be used to +enhance knowledge composition and transfer. To this end, we introduce aTLAS, an +algorithm that linearly combines parameter blocks with different learned +coefficients, resulting in anisotropic scaling at the task vector level. We +show that such linear combinations explicitly exploit the low intrinsic +dimensionality of pre-trained models, with only a few coefficients being the +learnable parameters. Furthermore, composition of parameter blocks leverages +the already learned representations, thereby reducing the dependency on large +amounts of data. We demonstrate the effectiveness of our method in task +arithmetic, few-shot recognition and test-time adaptation, with supervised or +unsupervised objectives. In particular, we show that (1) learned anisotropic +scaling allows task vectors to be more disentangled, causing less interference +in composition; (2) task vector composition excels with scarce or no labeled +data and is less prone to domain shift, thus leading to better +generalisability; (3) mixing the most informative parameter blocks across +different task vectors prior to training can reduce the memory footprint and +improve the flexibility of knowledge transfer. Moreover, we show the potential +of aTLAS as a PEFT method, particularly with less data, and demonstrate its +scalability. + +
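+ The composition step described above reduces to adding a per-block, per-task scaled sum of task vectors to the pre-trained weights, with only the scaling coefficients being learned. A hedged sketch of that arithmetic is given below, treating each named parameter tensor as one block for simplicity; the names and block granularity are illustrative assumptions, not the paper's exact partitioning.

```python
import torch

def compose_with_task_vectors(pretrained, task_vectors, coeffs):
    """pretrained: dict name -> tensor (theta_0); task_vectors: list of dicts of
    the same shape (theta_k - theta_0); coeffs: (num_tasks, num_blocks) learnable
    coefficients, one per task vector and parameter block (anisotropic scaling)."""
    names = list(pretrained.keys())            # one "block" per named parameter here
    composed = {}
    for b, name in enumerate(names):
        delta = sum(coeffs[k, b] * tv[name] for k, tv in enumerate(task_vectors))
        composed[name] = pretrained[name] + delta
    return composed

# Only the coefficients would be optimized, e.g.:
# coeffs = torch.nn.Parameter(torch.zeros(len(task_vectors), len(pretrained)))
```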
+
+
+
+
+ + ☆ LMBF-Net: A Lightweight Multipath Bidirectional Focal Attention Network + for Multifeatures Segmentation + + +
+ Retinal diseases can cause irreversible vision loss in both eyes if not +diagnosed and treated early. Since retinal diseases are so complicated, retinal +imaging is likely to show two or more abnormalities. Current deep learning +techniques for segmenting retinal images with many labels and attributes have +poor detection accuracy and generalisability. This paper presents a multipath +convolutional neural network for multifeature segmentation. The proposed +network is lightweight and spatially sensitive to information. A patch-based +implementation is used to extract local image features, and focal modulation +attention blocks are incorporated between the encoder and the decoder for +improved segmentation. Filter optimisation is used to prevent filter overlaps +and speed up model convergence. A combination of convolution operations and +group convolution operations is used to reduce computational costs. This is the +first robust and generalisable network capable of segmenting multiple features +of fundus images (including retinal vessels, microaneurysms, optic discs, +haemorrhages, hard exudates, and soft exudates). The results of our +experimental evaluation on more than ten publicly available datasets with +multiple features show that the proposed network outperforms recent networks +despite having a small number of learnable parameters. + +
+
+
+
+
+ + ☆ Fast maneuver recovery from aerial observation: trajectory clustering + and outliers rejection + + +
+ The implementation of road user models that realistically reproduce credible +behavior in a multi-agent simulation is still an open problem. A data-driven +approach consists of deducing behaviors that may exist in real situations by +extracting different types of trajectories from a large set of observations. +The data, and its classification, can then be used to train models capable of +extrapolating such behavior. The proposed trajectory clustering methods +consider cars and two different types of Vulnerable Road Users (VRU): +pedestrians and cyclists. The results reported here evaluate methods to extract +well-defined trajectory classes from raw data without the use of map +information, while also separating ''eccentric'' or incomplete trajectories +from those that are complete and representative in any scenario. Two types of +environments serve as a test bed for the methods developed: three different +intersections and one roundabout. The resulting clusters of trajectories can +then be used for prediction or learning tasks, or discarded if they are +composed of outliers. + +
+
+
+
+
+ + ☆ Universal Gloss-level Representation for Gloss-free Sign Language + Translation and Production + + +
+ Sign language, essential for the deaf and hard-of-hearing, presents unique +challenges in translation and production due to its multimodal nature and the +inherent ambiguity in mapping sign language motion to spoken language words. +Previous methods often rely on gloss annotations, requiring time-intensive +labor and specialized expertise in sign language. Gloss-free methods have +emerged to address these limitations, but they often depend on external sign +language data or dictionaries, failing to completely eliminate the need for +gloss annotations. There is a clear demand for a comprehensive approach that +can supplant gloss annotations and be utilized for both Sign Language +Translation (SLT) and Sign Language Production (SLP). We introduce Universal +Gloss-level Representation (UniGloR), a unified and self-supervised solution +for both SLT and SLP, trained on multiple datasets including PHOENIX14T, +How2Sign, and NIASL2021. Our results demonstrate UniGloR's effectiveness in the +translation and production tasks. We further report an encouraging result for +the Sign Language Recognition (SLR) on previously unseen data. Our study +suggests that self-supervised learning can be made in a unified manner, paving +the way for innovative and practical applications in future research. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Plant Doctor: A hybrid machine learning and image segmentation software + to quantify plant damage in video footage + + +
+ Artificial intelligence has significantly advanced the automation of +diagnostic processes, benefiting various fields including agriculture. This +study introduces an AI-based system for the automatic diagnosis of urban street +plants using video footage obtained with accessible camera devices. The system +aims to monitor plant health on a day-to-day basis, aiding in the control of +disease spreading in urban areas. By combining two machine vision algorithms, +YOLOv8 and DeepSORT, the system efficiently identifies and tracks individual +leaves, extracting the optimal images for health analysis. YOLOv8, chosen for +its speed and computational efficiency, locates leaves, while DeepSORT ensures +robust tracking in complex environments. For detailed health assessment, +DeepLabV3Plus, a convolutional neural network, is employed to segment and +quantify leaf damage caused by bacteria, pests, and fungi. The hybrid system, +named Plant Doctor, has been trained and validated using a diverse dataset +including footage from Tokyo urban plants. The results demonstrate the +robustness and accuracy of the system in diagnosing leaf damage, with potential +applications in large scale urban flora illness monitoring. This approach +provides a non-invasive, efficient, and scalable solution for urban tree health +management, supporting sustainable urban ecosystems. + +
+
+ comment: 29 pages, 10 figures, 2 tables +
+
+
+
+
+ + ☆ Multi-Task Domain Adaptation for Language Grounding with 3D Objects + + +
+ The existing works on object-level language grounding with 3D objects mostly +focus on improving performance by utilizing off-the-shelf pre-trained models to +capture features, such as viewpoint selection or geometric priors. However, +they fail to explore the cross-modal representation of language-vision +alignment in the cross-domain setting. To address this problem, we propose a +novel method called Domain Adaptation for Language Grounding (DA4LG) with 3D +objects. Specifically, the proposed DA4LG consists of a visual adapter module +with multi-task learning to realize vision-language alignment by comprehensive +multimodal feature representation. Experimental results demonstrate that DA4LG +performs competitively across visual and non-visual language descriptions, +independent of the completeness of observation. DA4LG achieves state-of-the-art +performance in the single-view and multi-view settings, with accuracies of +83.8% and 86.8% respectively, on the language grounding benchmark SNARE. +Simulation experiments show the practical and generalizable performance of +DA4LG compared to existing methods. Our project is available at +https://sites.google.com/view/da4lg. + +
+
+
+
+
+ + ☆ Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast + Cancer Segmentation and Identification + + +
+ Breast cancer poses a profound threat to lives globally, claiming numerous +lives each year. Therefore, timely detection is crucial for early intervention +and improved chances of survival. Accurately diagnosing and classifying breast +tumors using ultrasound images is a persistent challenge in medicine, demanding +cutting-edge solutions for improved treatment strategies. This research +introduces multiattention-enhanced deep learning (DL) frameworks designed for +the classification and segmentation of breast cancer tumors from ultrasound +images. A spatial channel attention mechanism is proposed for segmenting tumors +from ultrasound images, utilizing a novel LinkNet DL framework with an +InceptionResNet backbone. Following this, the paper proposes a deep +convolutional neural network with an integrated multi-attention framework +(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal. +From experimental results, it is observed that the segmentation model has +recorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also +achieved high Intersection over Union (IoU) and Dice Coefficient scores of +96.9% and 97.2%, respectively. Similarly, the classification model has attained +an accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification +framework has achieved outstanding F1-Score, precision, and recall values of +99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early +detection and accurate classification of breast cancer, this proposed work +significantly advances the field of medical image analysis, potentially +improving diagnostic precision and patient outcomes. + +
+
+ comment: 32 pages, 18 figures, 6 tables +
+
+
+
+
+ + ☆ MindBench: A Comprehensive Benchmark for Mind Map Structure Recognition + and Analysis + + +
+ Multimodal Large Language Models (MLLM) have made significant progress in the +field of document analysis. Despite this, existing benchmarks typically focus +only on extracting text and simple layout information, neglecting the complex +interactions between elements in structured documents such as mind maps and +flowcharts. To address this issue, we introduce the new benchmark named +MindBench, which not only includes meticulously constructed bilingual authentic +or synthetic images, detailed annotations, evaluation metrics and baseline +models, but also specifically designs five types of structured understanding +and parsing tasks. These tasks include full parsing, partial parsing, +position-related parsing, structured Visual Question Answering (VQA), and +position-related VQA, covering key areas such as text recognition, spatial +awareness, relationship discernment, and structured parsing. Extensive +experimental results demonstrate the substantial potential and significant room +for improvement in current models' ability to handle structured document +information. We anticipate that the launch of MindBench will significantly +advance research and application development in structured document analysis +technology. MindBench is available at: +https://miasanlei.github.io/MindBench.github.io/. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ A Pairwise DomMix Attentive Adversarial Network for Unsupervised Domain + Adaptive Object Detection + + +
+ Unsupervised Domain Adaptive Object Detection (DAOD) could adapt a model +trained on a source domain to an unlabeled target domain for object detection. +Existing unsupervised DAOD methods usually perform feature alignments from the +target to the source. Unidirectional domain transfer would omit information +about the target samples and result in suboptimal adaptation when there are +large domain shifts. Therefore, we propose a pairwise attentive adversarial +network with a Domain Mixup (DomMix) module to mitigate the aforementioned +challenges. Specifically, a deep-level mixup is employed to construct an +intermediate domain that allows features from both domains to share their +differences. Then a pairwise attentive adversarial network is applied with +attentive encoding on both image-level and instance-level features at different +scales and optimizes domain alignment by adversarial learning. This allows the +network to focus on regions with disparate contextual information and learn +their similarities between different domains. Extensive experiments are +conducted on several benchmark datasets, demonstrating the superiority of our +proposed method. + +
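+ The deep-level mixup that constructs the intermediate domain can be sketched as a simple convex combination of source and target feature maps, as below. The Beta-distributed mixing coefficient and the layer at which mixing happens are illustrative assumptions, not the exact DomMix module.

```python
import torch

def domain_mixup(feat_src, feat_tgt, alpha=1.0):
    """Mix source/target feature maps of matching shape into an intermediate
    domain; lambda is drawn from Beta(alpha, alpha) per batch element."""
    lam = torch.distributions.Beta(alpha, alpha).sample((feat_src.size(0),))
    lam = lam.view(-1, *([1] * (feat_src.dim() - 1))).to(feat_src.device)
    return lam * feat_src + (1 - lam) * feat_tgt
```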
+
+ comment: has published on IEEE Signal Processing Letters, 2023 +
+
+
+
+
+ + ☆ Style Alignment based Dynamic Observation Method for UAV-View + Geo-localization + + +
+ The task of UAV-view geo-localization is to estimate the location of a query +satellite/drone image by matching it against a reference dataset consisting of +drone/satellite images. Though tremendous strides have been made in feature +alignment between satellite and drone views, vast inter- and intra-class +differences due to changes in viewpoint, altitude, and lighting remain a huge +challenge. In this paper, a style alignment based dynamic observation method +for UAV-view geo-localization is proposed to meet the above challenges from two +perspectives: visual style transformation and surrounding noise control. +Specifically, we introduce a style alignment strategy to transform the diverse +visual style of drone-view images into a unified satellite-image visual style. +Then a dynamic observation module is designed to evaluate the spatial +distribution of images by mimicking human observation habits. It features a +hierarchical attention block (HAB) with a dual-square-ring stream structure, to +reduce surrounding noise and geographical deformation. In addition, we propose +a deconstruction loss to push away features of different geo-tags and squeeze +knowledge from unmatched images by correlation calculation. The experimental +results demonstrate the state-of-the-art performance of our model on +benchmarked datasets. In particular, when compared to the prior art on +University-1652, our results surpass the best of them (FSRA), while requiring +only half as many parameters. Code will be released at +https://github.com/Xcco1/SA\_DOM + +
+
+ comment: has published on IEEE Transactions on Geoscience and Remote Sensing, + 2023 +
+
+
+
+
+ + ☆ A Radiometric Correction based Optical Modeling Approach to Removing + Reflection Noise in TLS Point Clouds of Urban Scenes + + +
+ Point clouds are vital in computer vision tasks such as 3D reconstruction, +autonomous driving, and robotics. However, TLS-acquired point clouds often +contain virtual points from reflective surfaces, causing disruptions. This +study presents a reflection noise elimination algorithm for TLS point clouds. +Our innovative reflection plane detection algorithm, based on geometry-optical +models and physical properties, identifies and categorizes reflection points +per optical reflection theory. We've adapted the LSFH feature descriptor to +retain reflection features, mitigating interference from symmetrical +architectural structures. By incorporating the Hausdorff feature distance, the +algorithm enhances resilience to ghosting and deformation, improving virtual +point detection accuracy. Extensive experiments on the 3DRN benchmark dataset, +featuring diverse urban environments with virtual TLS reflection noise, show +our algorithm improves precision and recall rates for 3D points in reflective +regions by 57.03\% and 31.80\%, respectively. Our method achieves a 9.17\% +better outlier detection rate and 5.65\% higher accuracy than leading methods. +Access the 3DRN dataset at (https://github.com/Tsuiky/3DRN). + +
+
+
+
+
+ + ☆ Images Speak Louder than Words: Understanding and Mitigating Bias in + Vision-Language Model from a Causal Mediation Perspective + + +
+ Vision-language models (VLMs) pre-trained on extensive datasets can +inadvertently learn biases by correlating gender information with specific +objects or scenarios. Current methods, which focus on modifying inputs and +monitoring changes in the model's output probability scores, often struggle to +comprehensively understand bias from the perspective of model components. We +propose a framework that incorporates causal mediation analysis to measure and +map the pathways of bias generation and propagation within VLMs. This approach +allows us to identify the direct effects of interventions on model bias and the +indirect effects of interventions on bias mediated through different model +components. Our results show that image features are the primary contributors +to bias, with significantly higher impacts than text features, specifically +accounting for 32.57% and 12.63% of the bias in the MSCOCO and PASCAL-SENTENCE +datasets, respectively. Notably, the image encoder's contribution surpasses +that of the text encoder and the deep fusion encoder. Further experimentation +confirms that contributions from both language and vision modalities are +aligned and non-conflicting. Consequently, focusing on blurring gender +representations within the image encoder, which contributes most to the model +bias, reduces bias efficiently by 22.03% and 9.04% in the MSCOCO and +PASCAL-SENTENCE datasets, respectively, with minimal performance loss or +increased computational demands. + +
+
+
+
+
+ + ☆ Data Overfitting for On-Device Super-Resolution with Dynamic Algorithm + and Compiler Co-Design ECCV2024 + + +
+ Deep neural networks (DNNs) are frequently employed in a variety of computer +vision applications. Nowadays, an emerging trend in the current video +distribution system is to take advantage of DNN's overfitting properties to +perform video resolution upscaling. By splitting videos into chunks and +applying a super-resolution (SR) model to overfit each chunk, this scheme of SR +models plus video chunks is able to replace traditional video transmission to +enhance video quality and transmission efficiency. However, many models and +chunks are needed to guarantee high performance, which leads to tremendous +overhead on model switching and memory footprints at the user end. To resolve +such problems, we propose a Dynamic Deep neural network assisted by a +Content-Aware data processing pipeline to reduce the model number down to one +(Dy-DCA), which helps promote performance while conserving computational +resources. Additionally, to achieve real acceleration on the user end, we +designed a framework that optimizes dynamic features (e.g., dynamic shapes, +sizes, and control flow) in Dy-DCA to enable a series of compilation +optimizations, including fused code generation, static execution planning, etc. +By employing such techniques, our method achieves better PSNR and real-time +performance (33 FPS) on an off-the-shelf mobile phone. Meanwhile, assisted by +our compilation optimization, we achieve a 1.7$\times$ speedup while saving up +to 1.61$\times$ memory consumption. Code available in +https://github.com/coulsonlee/Dy-DCA-ECCV2024. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ Solving Motion Planning Tasks with a Scalable Generative Model ECCV2024 + + +
+ As autonomous driving systems are deployed to millions of vehicles, there is +a pressing need to improve the system's scalability and safety and to reduce +engineering cost. A realistic, scalable, and practical simulator of the driving +world is highly desired. In this paper, we present an efficient solution based +on generative models which learns the dynamics of the driving scenes. With this +model, we can not only simulate the diverse futures of a given driving scenario +but also generate a variety of driving scenarios conditioned on various +prompts. Our innovative design allows the model to operate in both +full-Autoregressive and partial-Autoregressive modes, significantly improving +inference and training speed without sacrificing generative capability. This +efficiency makes it ideal for being used as an online reactive environment for +reinforcement learning, an evaluator for planning policies, and a high-fidelity +simulator for testing. We evaluated our model against two real-world datasets: +the Waymo motion dataset and the nuPlan dataset. On the simulation realism and +scene generation benchmark, our model achieves state-of-the-art performance, +and in the planning benchmarks, our planner outperforms prior art. We conclude +that the proposed generative model may serve as a foundation for a variety of +motion planning tasks, including data generation, simulation, planning, and +online training. Source code is public at +https://github.com/HorizonRobotics/GUMP/ + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ Euler's Elastica Based Cartoon-Smooth-Texture Image Decomposition + + +
+ We propose a novel model for decomposing grayscale images into three distinct +components: the structural part, representing sharp boundaries and regions with +strong light-to-dark transitions; the smooth part, capturing soft shadows and +shades; and the oscillatory part, characterizing textures and noise. To capture +the homogeneous structures, we introduce a combination of $L^0$-gradient and +curvature regularization on level lines. This new regularization term enforces +strong sparsity on the image gradient while reducing the undesirable staircase +effects as well as preserving the geometry of contours. For the smoothly +varying component, we utilize the $L^2$-norm of the Laplacian that favors +isotropic smoothness. To capture the oscillation, we use the inverse Sobolev +seminorm. To solve the associated minimization problem, we design an efficient +operator-splitting algorithm. Our algorithm effectively addresses the +challenging non-convex non-smooth problem by separating it into sub-problems. +Each sub-problem can be solved either directly using closed-form solutions or +efficiently using the Fast Fourier Transform (FFT). We provide systematic +experiments, including ablation and comparison studies, to analyze our model's +behaviors and demonstrate its effectiveness as well as efficiency. + +
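+ The decomposition described above can be summarized by an energy of the following generic form, with f = u + v + w and unspecified weights; this is a hedged reconstruction from the regularizers listed in the abstract, not the paper's exact functional or discretization.

```latex
\min_{u,v,w}\;
\alpha \,\|\nabla u\|_{0}
+ \beta \int_{\Omega} |\kappa(u)|\,|\nabla u|\,dx      % curvature of level lines
+ \frac{\gamma}{2}\,\|\Delta v\|_{2}^{2}               % isotropic smoothness
+ \delta \,\|w\|_{H^{-1}}^{2}                          % oscillatory (texture/noise) part
\quad \text{s.t.}\quad f = u + v + w .
```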
+
+
+
+
+ + ☆ Learning Positional Attention for Sequential Recommendation + + +
+ Self-attention-based networks have achieved remarkable performance in +sequential recommendation tasks. A crucial component of these models is +positional encoding. In this study, we delve into the learned positional +embedding, demonstrating that it often captures the distance between tokens. +Building on this insight, we introduce novel attention models that directly +learn positional relations. Extensive experiments reveal that our proposed +models, \textbf{PARec} and \textbf{FPARec}, outperform previous +self-attention-based approaches. Our code is available at the link for +anonymous review: https://anonymous.4open.science/r/FPARec-2C55/ + +
+
+
+
+
+ + ☆ Foster Adaptivity and Balance in Learning with Noisy Labels ECCV + + +
+ Label noise is ubiquitous in real-world scenarios, posing a practical +challenge to supervised models due to its effect in hurting the generalization +performance of deep neural networks. Existing methods primarily employ the +sample selection paradigm and usually rely on dataset-dependent prior knowledge +(\eg, a pre-defined threshold) to cope with label noise, inevitably degrading +the adaptivity. Moreover, existing methods tend to neglect the class balance in +selecting samples, leading to biased model performance. To this end, we propose +a simple yet effective approach named \textbf{SED} to deal with label noise in +a \textbf{S}elf-adaptiv\textbf{E} and class-balance\textbf{D} manner. +Specifically, we first design a novel sample selection strategy to empower +self-adaptivity and class balance when identifying clean and noisy data. A +mean-teacher model is then employed to correct labels of noisy samples. +Subsequently, we propose a self-adaptive and class-balanced sample re-weighting +mechanism to assign different weights to detected noisy samples. Finally, we +additionally employ consistency regularization on selected clean samples to +improve model generalization performance. Extensive experimental results on +synthetic and real-world datasets demonstrate the effectiveness and superiority +of our proposed method. The source code has been made available at +https://github.com/NUST-Machine-Intelligence-Laboratory/SED. + +
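+ A minimal illustration of a self-adaptive, class-balanced selection rule in the spirit described above: instead of one hand-tuned global threshold, each (noisy) class derives its own threshold from its loss statistics. The mean-loss criterion and all names are illustrative assumptions, not SED's actual selection strategy.

```python
import torch

def select_clean_per_class(losses, labels, num_classes):
    """Mark samples as clean when their loss is below the mean loss of their
    own (noisy) class -- a per-class, data-driven threshold."""
    clean = torch.zeros_like(losses, dtype=torch.bool)
    for c in range(num_classes):
        idx = (labels == c).nonzero(as_tuple=True)[0]
        if idx.numel() == 0:
            continue
        thr = losses[idx].mean()              # adaptive, class-specific threshold
        clean[idx] = losses[idx] <= thr
    return clean
```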
+
+ comment: accepted by the European Conference on Computer Vision (ECCV), 2024 +
+
+
+
+
+ + ☆ Automatic gradient descent with generalized Newton's method + + +
+ We propose the generalized Newton's method (GeN) -- a Hessian-informed +approach that applies to any optimizer such as SGD and Adam, and covers the +Newton-Raphson method as a sub-case. Our method automatically and dynamically +selects the learning rate that accelerates convergence, without the intensive +tuning of a learning rate scheduler. In practice, our method is easily +implementable, since it only requires additional forward passes with almost +zero computational overhead (in terms of training time and memory cost), +provided the overhead is amortized over many iterations. We present extensive +experiments on language and vision tasks (e.g. GPT and ResNet) to showcase that +GeN optimizers match the state-of-the-art performance, which was achieved with +carefully tuned learning rate schedulers. Code to be released at +\url{https://github.com/ShiyunXu/AutoGeN}. + +
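+ One way to realize such a Hessian-informed step size with forward passes only is to probe the loss at a few trial step sizes along the optimizer's proposed update direction, fit a local quadratic, and take its minimizer as the learning rate. The sketch below illustrates this idea; it is an assumption-laden stand-in for GeN's actual estimator, and the probe values and names are illustrative.

```python
import numpy as np

def quadratic_lr(loss_fn, probes=(0.0, 1e-3, 2e-3)):
    """Estimate the learning rate that minimizes a local quadratic model of the
    loss along the optimizer's update direction, using forward passes only.

    loss_fn(step) should return the scalar loss evaluated at
    params + step * update_direction (three extra forward passes here).
    """
    losses = [float(loss_fn(s)) for s in probes]
    # Fit L(s) ~ a*s^2 + b*s + c through the probed points.
    a, b, c = np.polyfit(probes, losses, deg=2)
    if a <= 0:                                   # non-convex local fit: fall back
        return probes[int(np.argmin(losses))]
    return float(-b / (2 * a))                   # argmin of the quadratic
```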
+
+
+
+
+ + ☆ Fine-Grained Scene Image Classification with Modality-Agnostic Adapter + + +
+ When dealing with the task of fine-grained scene image classification, most +previous works lay much emphasis on global visual features when performing +multi-modal feature fusion. In other words, models are deliberately designed +based on prior intuitions about the importance of different modalities. In this +paper, we present a new multi-modal feature fusion approach named MAA +(Modality-Agnostic Adapter), which aims to make the model learn the importance +of different modalities in different cases adaptively, without a prior setting +in the model architecture. More specifically, we eliminate the modal +differences in distribution and then use a modality-agnostic Transformer +encoder for semantic-level feature fusion. Our experiments demonstrate that MAA +achieves state-of-the-art results on benchmarks by applying the same modalities +as previous methods. Besides, it is worth mentioning that new modalities can be +easily added when using MAA and can further boost performance. Code is +available at https://github.com/quniLcs/MAA. + +
+
+
+
+
+ + ☆ Knowledge Transfer with Simulated Inter-Image Erasing for Weakly + Supervised Semantic Segmentation ECCV + + +
+ Though adversarial erasing has prevailed in weakly supervised semantic +segmentation to help activate integral object regions, existing approaches +still suffer from the dilemma of under-activation and over-expansion due to the +difficulty in determining when to stop erasing. In this paper, we propose a +\textbf{K}nowledge \textbf{T}ransfer with \textbf{S}imulated Inter-Image +\textbf{E}rasing (KTSE) approach for weakly supervised semantic segmentation to +alleviate the above problem. In contrast to existing erasing-based methods that +remove the discriminative part for more object discovery, we propose a +simulated inter-image erasing scenario to weaken the original activation by +introducing extra object information. Then, object knowledge is transferred +from the anchor image to the consequent less activated localization map to +strengthen network localization ability. Considering the adopted bidirectional +alignment will also weaken the anchor image activation if appropriate +constraints are missing, we propose a self-supervised regularization module to +maintain the reliable activation in discriminative regions and improve the +inter-class object boundary recognition for complex images with multiple +categories of objects. In addition, we resort to intra-image erasing and +propose a multi-granularity alignment module to gently enlarge the object +activation to boost the object knowledge transfer. Extensive experiments and +ablation studies on PASCAL VOC 2012 and COCO datasets demonstrate the +superiority of our proposed approach. Source codes and models are available at +https://github.com/NUST-Machine-Intelligence-Laboratory/KTSE. + +
+
+ comment: accepted by the European Conference on Computer Vision (ECCV), 2024 +
+
+
+
+
+ + ☆ ADFQ-ViT: Activation-Distribution-Friendly Post-Training Quantization + for Vision Transformers + + +
+ Vision Transformers (ViTs) have exhibited exceptional performance across +diverse computer vision tasks, while their substantial parameter size incurs +significantly increased memory and computational demands, impeding effective +inference on resource-constrained devices. Quantization has emerged as a +promising solution to mitigate these challenges, yet existing methods still +suffer from significant accuracy loss at low-bit. We attribute this issue to +the distinctive distributions of post-LayerNorm and post-GELU activations +within ViTs, rendering conventional hardware-friendly quantizers ineffective, +particularly in low-bit scenarios. To address this issue, we propose a novel +framework called Activation-Distribution-Friendly post-training Quantization +for Vision Transformers, ADFQ-ViT. Concretely, we introduce the Per-Patch +Outlier-aware Quantizer to tackle irregular outliers in post-LayerNorm +activations. This quantizer refines the granularity of the uniform quantizer to +a per-patch level while retaining a minimal subset of values exceeding a +threshold at full-precision. To handle the non-uniform distributions of +post-GELU activations between positive and negative regions, we design the +Shift-Log2 Quantizer, which shifts all elements to the positive region and then +applies log2 quantization. Moreover, we present the Attention-score enhanced +Module-wise Optimization which adjusts the parameters of each quantizer by +reconstructing errors to further mitigate quantization error. Extensive +experiments demonstrate ADFQ-ViT provides significant improvements over various +baselines in image classification, object detection, and instance segmentation +tasks at 4-bit. Specifically, when quantizing the ViT-B model to 4-bit, we +achieve a 10.23% improvement in Top-1 accuracy on the ImageNet dataset. + +
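+ The Shift-Log2 idea for post-GELU activations can be sketched as: shift all values into the positive range, quantize on a log2 grid, and undo the shift when dequantizing. The snippet below simulates this for a b-bit setting; calibration, clipping, and the exact grid construction are illustrative assumptions rather than the ADFQ-ViT implementation.

```python
import torch

def shift_log2_quantize(x, bits=4, eps=1e-8):
    """Simulate quantizing activations on a log2 grid after shifting them positive.

    Returns the dequantized tensor (for simulated quantization) and integer codes.
    """
    shift = x.min()
    x_pos = x - shift + eps                            # all values > 0
    levels = 2 ** bits - 1
    log_x = torch.log2(x_pos)
    lo, hi = log_x.min(), log_x.max()
    step = ((hi - lo) / levels).clamp(min=eps)
    q = torch.clamp(torch.round((log_x - lo) / step), 0, levels)   # integer codes
    deq = torch.pow(2.0, q * step + lo) + shift - eps  # map back to the original range
    return deq, q.to(torch.int64)
```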
+
+ comment: 28 pages,9 figures +
+
+
+
+
+ + ☆ Differential Encoding for Improved Representation Learning over Graphs + + +
+ Combining the message-passing paradigm with the global attention mechanism +has emerged as an effective framework for learning over graphs. The +message-passing paradigm and the global attention mechanism fundamentally +generate node embeddings based on information aggregated from a node's local +neighborhood or from the whole graph. The most basic and commonly used +aggregation approach is to take the sum of information from a node's local +neighbourhood or from the whole graph. However, it is unknown whether the +dominant information comes from a node itself or from the node's neighbours (or +the rest of the graph nodes). Therefore, information is lost at each layer of +embedding generation, and this loss can accumulate and become more severe as +more layers are used in the model. In this paper, we present a differential +encoding method to address this information loss. The idea of our method is to +encode the differential representation between the information from a node's +neighbours (or the rest of the graph nodes) and that from the node itself. The +obtained differential encoding is then combined with the original aggregated +local or global representation to generate the updated node embedding. By +integrating differential encodings, the representational ability of the +generated node embeddings is improved. The differential encoding method is +empirically evaluated on different graph tasks on seven benchmark datasets. The +results show that it is a general method that improves the message-passing +update and the global attention update, advancing the state-of-the-art +performance for graph representation learning on these datasets. + +
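+ A minimal sketch of the differential encoding idea on top of a plain sum aggregation: encode the difference between the aggregated neighbour information and the node's own representation, then fuse it with the usual aggregate before the update. The layer sizes, dense adjacency, and fusion choice are illustrative assumptions, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class DifferentialEncodingLayer(nn.Module):
    """Message-passing layer that also encodes (aggregated neighbours - self)."""
    def __init__(self, dim):
        super().__init__()
        self.diff_enc = nn.Linear(dim, dim)
        self.update = nn.Linear(2 * dim, dim)

    def forward(self, h, adj):                 # h: (N, D), adj: (N, N) 0/1 matrix
        agg = adj @ h                          # sum over neighbours
        diff = self.diff_enc(agg - h)          # differential representation
        return torch.relu(self.update(torch.cat([agg, diff], dim=-1)))
```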
+
+
+
+
+ + ☆ Highly Accelerated MRI via Implicit Neural Representation Guided + Posterior Sampling of Diffusion Models + + +
+ Reconstructing high-fidelity magnetic resonance (MR) images from +under-sampled k-space is a commonly used strategy to reduce scan time. The +posterior sampling of diffusion models based on the real measurement data holds +significant promise of improved reconstruction accuracy. However, traditional +posterior sampling methods often lack effective data consistency guidance, +leading to inaccurate and unstable reconstructions. Implicit neural +representation (INR) has emerged as a powerful paradigm for solving inverse +problems by modeling a signal's attributes as a continuous function of spatial +coordinates. In this study, we present a novel posterior sampler for diffusion +models using INR, named DiffINR. The INR-based component incorporates both the +diffusion prior distribution and the MRI physical model to ensure high data +fidelity. DiffINR demonstrates superior performance on experimental datasets +with remarkable accuracy, even under high acceleration factors (up to R=12 in +single-channel reconstruction). Notably, our proposed framework can be a +generalizable framework to solve inverse problems in other medical imaging +tasks. + +
+
+
+
+
+ + ☆ ZEAL: Surgical Skill Assessment with Zero-shot Tool Inference Using + Unified Foundation Model + + +
+ Surgical skill assessment is paramount for ensuring patient safety and +enhancing surgical outcomes. This study addresses the need for efficient and +objective evaluation methods by introducing ZEAL (surgical skill assessment +with Zero-shot surgical tool segmentation with a unifiEd foundAtion modeL). +ZEAL uses segmentation masks of surgical instruments obtained through a unified +foundation model for proficiency assessment. Through zero-shot inference with +text prompts, ZEAL predicts segmentation masks, capturing essential features of +both instruments and surroundings. Utilizing sparse convolutional neural +networks and segmentation masks, ZEAL extracts feature vectors for foreground +(instruments) and background. Long Short-Term Memory (LSTM) networks encode +temporal dynamics, modeling sequential data and dependencies in surgical +videos. Combining LSTM-encoded vectors, ZEAL produces a surgical skill score, +offering an objective measure of proficiency. Comparative analysis with +conventional methods using open datasets demonstrates ZEAL's superiority, +affirming its potential in advancing surgical training and evaluation. This +innovative approach to surgical skill assessment addresses challenges in +traditional supervised learning techniques, paving the way for enhanced +surgical care quality and patient outcomes. + +
+
+
+
+
+ + ☆ MedVH: Towards Systematic Evaluation of Hallucination for Large Vision + Language Models in the Medical Context + + +
+ Large Vision Language Models (LVLMs) have recently achieved superior +performance in various tasks on natural image and text data, which inspires a +large amount of studies for LVLMs fine-tuning and training. Despite their +advancements, there has been scant research on the robustness of these models +against hallucination when fine-tuned on smaller datasets. In this study, we +introduce a new benchmark dataset, the Medical Visual Hallucination Test +(MedVH), to evaluate the hallucination of domain-specific LVLMs. MedVH +comprises five tasks to evaluate hallucinations in LVLMs within the medical +context, which includes tasks for comprehensive understanding of textual and +visual input, as well as long textual response generation. Our extensive +experiments with both general and medical LVLMs reveal that, although medical +LVLMs demonstrate promising performance on standard medical tasks, they are +particularly susceptible to hallucinations, often more so than the general +models, raising significant concerns about the reliability of these +domain-specific models. For medical LVLMs to be truly valuable in real-world +applications, they must not only accurately integrate medical knowledge but +also maintain robust reasoning abilities to prevent hallucination. Our work +paves the way for future evaluations of these studies. + +
+
+
+
+
+ + ♻ ☆ Merlin:Empowering Multimodal LLMs with Foresight Minds ECCV2024 + + +
+ Humans possess the remarkable ability to foresee the future to a certain +extent based on present observations, a skill we term foresight minds. However, +this capability remains largely underexplored within existing Multimodal Large +Language Models (MLLMs), hindering their capacity to learn the fundamental +principles of how things operate and the intentions behind the observed +subjects. To address this issue, we introduce the integration of future +modeling into the existing learning frameworks of MLLMs. By utilizing the +subject trajectory, a highly structured representation of a consecutive frame +sequence, as a learning objective, we aim to bridge the gap between the past +and the future. We propose two innovative methods to empower MLLMs with +foresight minds, Foresight Pre-Training (FPT) and Foresight Instruction-Tuning +(FIT), which are inspired by the modern learning paradigm of LLMs. +Specifically, FPT jointly trains on various tasks centered on trajectories, +enabling MLLMs to learn how to attend to and predict entire trajectories from a +given initial observation. Then, FIT requires MLLMs to first predict +trajectories of related objects and then reason about potential future events +based on them. Aided by FPT and FIT, we build a novel and unified MLLM named +Merlin that supports multi-image input and analysis of the potential actions of +multiple objects for future reasoning. Experimental results show that Merlin +exhibits powerful foresight minds, with impressive performance on both future +reasoning and visual comprehension tasks. + +
+
+ comment: Accepted by ECCV2024. Project page: https://ahnsun.github.io/merlin +
+
+
+
+
+ + ♻ ☆ Large-scale Pre-trained Models are Surprisingly Strong in Incremental + Novel Class Discovery ICPR 2024 + + +
+ Discovering novel concepts in unlabelled datasets and in a continuous manner +is an important desideratum of lifelong learners. In the literature such +problems have been partially addressed under very restricted settings, where +novel classes are learned by jointly accessing a related labelled set (e.g., +NCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD). +In this work we challenge the status quo in class-iNCD and propose a learning +paradigm where class discovery occurs continuously and truly unsupervisedly, +without needing any related labelled set. In detail, we propose to exploit the +richer priors from strong self-supervised pre-trained models (PTM). To this +end, we propose simple baselines, composed of a frozen PTM backbone and a +learnable linear classifier, that are not only simple to implement but also +resilient under longer learning scenarios. We conduct extensive empirical +evaluation on a multitude of benchmarks and show the effectiveness of our +proposed baselines when compared with sophisticated state-of-the-art methods. +The code is open source. + +
+
+ comment: Accepted as a conference paper to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ DEEM: Diffusion Models Serve as the Eyes of Large Language Models for + Image Perception + + +
+ The development of large language models (LLMs) has significantly advanced +the emergence of large multimodal models (LMMs). While LMMs have achieved +tremendous success by promoting the synergy between multimodal comprehension +and creation, they often face challenges when confronted with +out-of-distribution data. This is primarily due to their reliance on image +encoders trained to encode images into task-relevant features, which may lead +them to disregard irrelevant details. Delving into the modeling capabilities of +diffusion models for images naturally prompts the question: Can diffusion +models serve as the eyes of large language models for image perception? In this +paper, we propose DEEM, a simple and effective approach that utilizes the +generative feedback of diffusion models to align the semantic distributions of +the image encoder. This addresses the drawbacks of previous methods that solely +relied on image encoders like ViT, thereby enhancing the model's resilience +against out-of-distribution samples and reducing visual hallucinations. +Importantly, this is achieved without requiring additional training modules and +with fewer training parameters. We extensively evaluated DEEM on both our newly +constructed RobustVQA benchmark and another well-known benchmark, POPE, for +object hallucination. Compared to the state-of-the-art interleaved content +generation models, DEEM exhibits enhanced robustness and a superior capacity to +alleviate model hallucinations while utilizing fewer trainable parameters, less +pre-training data (10%), and a smaller base model size. + +
+
+ comment: 25 pages. arXiv admin note: text overlap with arXiv:2401.10208 by + other authors +
+
+
+
+
+ + ♻ ☆ A Simple Baseline for Spoken Language to Sign Language Translation with + 3D Avatars ECCV 2024 + + +
+ The objective of this paper is to develop a functional system for translating spoken languages into sign languages, referred to as Spoken2Sign translation. The Spoken2Sign task is orthogonal and complementary to traditional sign language to spoken language (Sign2Spoken) translation. To enable Spoken2Sign translation, we present a simple baseline consisting of three steps: 1) creating a gloss-video dictionary using existing Sign2Spoken benchmarks; 2) estimating a 3D sign for each sign video in the dictionary; 3) training a Spoken2Sign model, which is composed of a Text2Gloss translator, a sign connector, and a rendering module, with the aid of the yielded gloss-3D sign dictionary. The translation results are then displayed through a sign avatar. As far as we know, we are the first to present the Spoken2Sign task in an output format of 3D signs. In addition to its capability of Spoken2Sign translation, we also demonstrate that two by-products of our approach, 3D keypoint augmentation and multi-view understanding, can assist in keypoint-based sign language understanding. Code and models are available at https://github.com/FangyunWei/SLRT. + +
</p>
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ FairMedFM: Fairness Benchmarking for Medical Imaging Foundation Models + + +
+ The advent of foundation models (FMs) in healthcare offers unprecedented opportunities to enhance medical diagnostics through automated classification and segmentation tasks. However, these models also raise significant concerns about their fairness, especially when applied to diverse and underrepresented populations in healthcare applications. Currently, there is a lack of comprehensive benchmarks, standardized pipelines, and easily adaptable libraries to evaluate and understand the fairness performance of FMs in medical imaging, leading to considerable challenges in formulating and implementing solutions that ensure equitable outcomes across diverse patient populations. To fill this gap, we introduce FairMedFM, a fairness benchmark for FM research in medical imaging. FairMedFM integrates with 17 popular medical imaging datasets, encompassing different modalities, dimensionalities, and sensitive attributes. It explores 20 widely used FMs, with various usages such as zero-shot learning, linear probing, parameter-efficient fine-tuning, and prompting in various downstream tasks -- classification and segmentation. Our exhaustive analysis evaluates the fairness performance over different evaluation metrics from multiple perspectives, revealing the existence of bias, varied utility-fairness trade-offs on different FMs, consistent disparities on the same datasets regardless of the FM, and limited effectiveness of existing unfairness mitigation methods. FairMedFM's project page and open-source codebase support extensible functionalities and applications, and are intended to enable inclusive, long-term studies of FMs in medical imaging. + +
</p>
+
+ comment: 29 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Implicit Concept Removal of Diffusion Models + + +
+ Text-to-image (T2I) diffusion models often inadvertently generate unwanted concepts such as watermarks and unsafe images. These concepts, termed "implicit concepts", could be unintentionally learned during training and then generated uncontrollably during inference. Existing removal methods still struggle to eliminate implicit concepts, primarily because they depend on the model's ability to recognize concepts it actually cannot discern. To address this, we utilize the intrinsic geometric characteristics of implicit concepts and present Geom-Erasing, a novel concept removal method based on geometric-driven control. Specifically, once an unwanted implicit concept is identified, we integrate the existence and geometric information of the concept into the text prompts with the help of an accessible classifier or detector model. Subsequently, the model is optimized to identify and disentangle this information, which is then adopted as negative prompts during generation. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel image-text dataset imbued with three typical implicit concepts (i.e., QR codes, watermarks, and text), reflecting real-life situations where implicit concepts are easily injected. Geom-Erasing effectively mitigates the generation of implicit concepts, achieving state-of-the-art results on the Inappropriate Image Prompts (I2P) and our challenging Implicit Concept Dataset (ICD) benchmarks. + +
</p>
+
+
+
+
+ + ♻ ☆ Depth Priors in Removal Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRF) have achieved impressive results in 3D +reconstruction and novel view generation. A significant challenge within NeRF +involves editing reconstructed 3D scenes, such as object removal, which demands +consistency across multiple views and the synthesis of high-quality +perspectives. Previous studies have integrated depth priors, typically sourced +from LiDAR or sparse depth estimates from COLMAP, to enhance NeRF's performance +in object removal. However, these methods are either expensive or +time-consuming. This paper proposes a new pipeline that leverages SpinNeRF and +monocular depth estimation models like ZoeDepth to enhance NeRF's performance +in complex object removal with improved efficiency. A thorough evaluation of +COLMAP's dense depth reconstruction on the KITTI dataset is conducted to +demonstrate that COLMAP can be viewed as a cost-effective and scalable +alternative for acquiring depth ground truth compared to traditional methods +like LiDAR. This serves as the basis for evaluating the performance of +monocular depth estimation models to determine the best one for generating +depth priors for SpinNeRF. The new pipeline is tested in various scenarios +involving 3D reconstruction and object removal, and the results indicate that +our pipeline significantly reduces the time required for the acquisition of +depth priors for object removal and enhances the fidelity of the synthesized +views, suggesting substantial potential for building high-fidelity digital twin +systems with increased efficiency in the future. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Multi-domain improves out-of-distribution and data-limited scenarios for + medical image analysis + + +
+ Current machine learning methods for medical image analysis primarily focus on developing models tailored for their specific tasks, utilizing data within their target domain. These specialized models tend to be data-hungry and often exhibit limitations in generalizing to out-of-distribution samples. In this work, we show that employing models that incorporate multiple domains instead of specialized ones significantly alleviates the limitations observed in specialized models. We refer to this approach as the multi-domain model and compare its performance to that of specialized models. For this, we introduce the incorporation of diverse medical image domains, including different imaging modalities like X-ray, MRI, CT, and ultrasound images, as well as various viewpoints such as axial, coronal, and sagittal views. Our findings underscore the superior generalization capabilities of multi-domain models, particularly in scenarios characterized by limited data availability and out-of-distribution samples, which are frequently encountered in healthcare applications. The integration of diverse data allows multi-domain models to utilize information across domains, enhancing the overall outcomes substantially. To illustrate, for organ recognition, the multi-domain model can enhance accuracy by up to 8% compared to conventional specialized models. + +
</p>
+
+
+
+
+ + ♻ ☆ Out-of-distribution Detection in Medical Image Analysis: A survey + + +
+ Computer-aided diagnostics has benefited from the development of deep learning-based computer vision techniques in recent years. Traditional supervised deep learning methods assume that the test sample is drawn from the identical distribution as the training data. However, it is possible to encounter out-of-distribution samples in real-world clinical scenarios, which may cause silent failure in deep learning-based medical image analysis tasks. Recently, research has explored various out-of-distribution (OOD) detection situations and techniques to enable a trustworthy medical AI system. In this survey, we systematically review the recent advances in OOD detection in medical image analysis. We first explore several factors that may cause a distributional shift when using a deep-learning-based model in clinical scenarios, with three different types of distributional shift well defined on top of these factors. We then suggest a framework to categorize and characterize existing solutions, and review previous studies according to this methodology taxonomy. Our discussion also covers evaluation protocols and metrics, as well as remaining challenges and under-explored research directions. + +
</p>
+
+ comment: 23 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Developing a Novel Image Marker to Predict the Clinical Outcome of + Neoadjuvant Chemotherapy (NACT) for Ovarian Cancer Patients + + +
+ Objective Neoadjuvant chemotherapy (NACT) is one kind of treatment for advanced-stage ovarian cancer patients. However, due to the nature of tumor heterogeneity, the clinical outcomes of NACT vary significantly among different subgroups. Partial responses to NACT may lead to suboptimal debulking surgery, which will result in an adverse prognosis. To address this clinical challenge, the purpose of this study is to develop a novel image marker to achieve highly accurate prognosis prediction of NACT at an early stage. Methods For this purpose, we first computed a total of 1373 radiomics features to quantify the tumor characteristics, which can be grouped into three categories: geometric, intensity, and texture features. Second, all these features were optimized by a principal component analysis algorithm to generate a compact and informative feature cluster. This cluster was used as input for developing and optimizing support vector machine (SVM) based classifiers, which indicated the likelihood of receiving suboptimal cytoreduction after the NACT treatment. Two different kernels for the SVM algorithm were explored and compared. A total of 42 ovarian cancer cases were retrospectively collected to validate the scheme. A nested leave-one-out cross-validation framework was adopted for model performance assessment. Results The results demonstrated that the model with a Gaussian radial basis function kernel SVM yielded an AUC (area under the ROC [receiver operating characteristic] curve) of 0.806. Meanwhile, this model achieved an overall accuracy (ACC) of 83.3%, positive predictive value (PPV) of 81.8%, and negative predictive value (NPV) of 83.9%. Conclusion This study provides meaningful information for the development of radiomics-based image markers for NACT treatment outcome prediction. + +
</p>
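+ <p>As a rough illustration of the pipeline described above, the sketch below chains feature standardization, PCA reduction, and an RBF-kernel SVM, scored with leave-one-out cross-validation. The feature matrix and labels are random placeholders rather than the study's 1373 radiomics features and 42 cases, and the nested tuning of the original scheme is omitted.</p>
+ <pre><code>
+ import numpy as np
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.decomposition import PCA
+ from sklearn.svm import SVC
+ from sklearn.model_selection import LeaveOneOut, cross_val_predict
+ from sklearn.metrics import roc_auc_score, accuracy_score
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(42, 1373))   # placeholder for 1373 radiomics features of 42 cases
+ y = rng.integers(0, 2, size=42)   # placeholder suboptimal-cytoreduction labels
+
+ clf = make_pipeline(
+     StandardScaler(),
+     PCA(n_components=10),         # compact, informative feature cluster
+     SVC(kernel="rbf", probability=True),
+ )
+
+ # leave-one-out cross-validation: each case is predicted by a model trained on the rest
+ proba = cross_val_predict(clf, X, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
+ print("AUC:", roc_auc_score(y, proba), "ACC:", accuracy_score(y, proba > 0.5))
+ </code></pre>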
+
+
+
+
+ + ♻ ☆ Self-Cooperation Knowledge Distillation for Novel Class Discovery ECCV2024 + + +
+ Novel Class Discovery (NCD) aims to discover unknown and novel classes in an +unlabeled set by leveraging knowledge already learned about known classes. +Existing works focus on instance-level or class-level knowledge representation +and build a shared representation space to achieve performance improvements. +However, a long-neglected issue is the potential imbalanced number of samples +from known and novel classes, pushing the model towards dominant classes. +Therefore, these methods suffer from a challenging trade-off between reviewing +known classes and discovering novel classes. Based on this observation, we +propose a Self-Cooperation Knowledge Distillation (SCKD) method to utilize each +training sample (whether known or novel, labeled or unlabeled) for both review +and discovery. Specifically, the model's feature representations of known and +novel classes are used to construct two disjoint representation spaces. Through +spatial mutual information, we design a self-cooperation learning to encourage +model learning from the two feature representation spaces from itself. +Extensive experiments on six datasets demonstrate that our method can achieve +significant performance improvements, achieving state-of-the-art performance. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ Individual Tree Detection in Large-Scale Urban Environments using + High-Resolution Multispectral Imagery + + +
+ We introduce a novel deep learning method for detection of individual trees +in urban environments using high-resolution multispectral aerial imagery. We +use a convolutional neural network to regress a confidence map indicating the +locations of individual trees, which are localized using a peak finding +algorithm. Our method provides complete spatial coverage by detecting trees in +both public and private spaces, and can scale to very large areas. We performed +a thorough evaluation of our method, supported by a new dataset of over 1,500 +images and almost 100,000 tree annotations, covering eight cities, six climate +zones, and three image capture years. We trained our model on data from +Southern California, and achieved a precision of 73.6% and recall of 73.3% +using test data from this region. We generally observed similar precision and +slightly lower recall when extrapolating to other California climate zones and +image capture dates. We used our method to produce a map of trees in the entire +urban forest of California, and estimated the total number of urban trees in +California to be about 43.5 million. Our study indicates the potential for deep +learning methods to support future urban forestry studies at unprecedented +scales. + +
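+ <p>A minimal sketch of the localization step described above: local-maximum peak finding on a predicted confidence map. The random array stands in for the network's output, and the window size and threshold are illustrative assumptions rather than the paper's settings.</p>
+ <pre><code>
+ import numpy as np
+ from scipy.ndimage import maximum_filter
+
+ def find_tree_peaks(conf, window=15, threshold=0.9):
+     """Return (row, col) coordinates of confident local maxima in a confidence map."""
+     local_max = maximum_filter(conf, size=window) == conf
+     return np.argwhere(local_max & (conf > threshold))
+
+ conf_map = np.random.rand(512, 512)   # stand-in for the CNN's per-pixel tree confidence
+ print(find_tree_peaks(conf_map)[:5])  # first few detected tree locations
+ </code></pre>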
+
+
+
+
+ + ♻ ☆ Back to the Color: Learning Depth to Specific Color Transformation for + Unsupervised Depth Estimation + + +
+ Virtual engines can generate dense depth maps for various synthetic scenes, +making them invaluable for training depth estimation models. However, +discrepancies between synthetic and real-world colors pose significant +challenges for depth estimation in real-world scenes, especially in complex and +uncertain environments encountered in unsupervised monocular depth estimation +tasks. To address this issue, we propose Back2Color, a framework that predicts +realistic colors from depth using a model trained on real-world data, thus +transforming synthetic colors into their real-world counterparts. Additionally, +we introduce the Syn-Real CutMix method for joint training with both real-world +unsupervised and synthetic supervised depth samples, enhancing monocular depth +estimation performance in real-world scenes. Furthermore, to mitigate the +impact of non-rigid motions on depth estimation, we present an auto-learning +uncertainty temporal-spatial fusion method (Auto-UTSF), which leverages the +strengths of unsupervised learning in both temporal and spatial dimensions. We +also designed VADepth, based on the Vision Attention Network, which offers +lower computational complexity and higher accuracy than transformers. Our +Back2Color framework achieves state-of-the-art performance on the Kitti +dataset, as evidenced by improvements in performance metrics and the production +of fine-grained details. This is particularly evident on more challenging +datasets such as Cityscapes for unsupervised depth estimation. + +
+
+
+
+
+ + ♻ ☆ Multi-modal Attribute Prompting for Vision-Language Models + + +
+ Large pre-trained Vision-Language Models (VLMs), like CLIP, exhibit strong +generalization ability to downstream tasks but struggle in few-shot scenarios. +Existing prompting techniques primarily focus on global text and image +representations, yet overlooking multi-modal attribute characteristics. This +limitation hinders the model's ability to perceive fine-grained visual details +and restricts its generalization ability to a broader range of unseen classes. +To address this issue, we propose a Multi-modal Attribute Prompting method +(MAP) by jointly exploring textual attribute prompting, visual attribute +prompting, and attribute-level alignment. The proposed MAP enjoys several +merits. First, we introduce learnable visual attribute prompts enhanced by +textual attribute semantics to adaptively capture visual attributes for images +from unknown categories, boosting fine-grained visual perception capabilities +for CLIP. Second, the proposed attribute-level alignment complements the global +alignment to enhance the robustness of cross-modal alignment for +open-vocabulary objects. To our knowledge, this is the first work to establish +cross-modal attribute-level alignment for CLIP-based few-shot adaptation. +Extensive experimental results on 11 datasets demonstrate that our method +performs favorably against state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding ICCV 2023 + + +
+ 3D visual grounding is the task of localizing the object in a 3D scene that is referred to by a description in natural language. With a wide range of applications ranging from autonomous indoor robotics to AR/VR, the task has recently risen in popularity. A common formulation to tackle 3D visual grounding is grounding-by-detection, where localization is done via bounding boxes. However, for real-life applications that require physical interactions, a bounding box insufficiently describes the geometry of an object. We therefore tackle the problem of dense 3D visual grounding, i.e. referral-based 3D instance segmentation. We propose a dense 3D grounding network ConcreteNet, featuring four novel stand-alone modules that aim to improve grounding performance for challenging repetitive instances, i.e. instances with distractors of the same semantic class. First, we introduce a bottom-up attentive fusion module that aims to disambiguate inter-instance relational cues. Next, we construct a contrastive training scheme to induce separation in the latent space. We then resolve view-dependent utterances via a learned global camera token, and finally we employ multi-view ensembling to improve referred mask quality. ConcreteNet ranks 1st on the challenging ScanRefer online benchmark and has won the ICCV 3rd Workshop on Language for 3D Scenes "3D Object Localization" challenge. + +
</p>
+
+ comment: Winner of the ICCV 2023 ScanRefer Challenge. Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Federated Distillation for Medical Image Classification: Towards + Trustworthy Computer-Aided Diagnosis + + +
+ Medical image classification plays a crucial role in computer-aided clinical +diagnosis. While deep learning techniques have significantly enhanced +efficiency and reduced costs, the privacy-sensitive nature of medical imaging +data complicates centralized storage and model training. Furthermore, +low-resource healthcare organizations face challenges related to communication +overhead and efficiency due to increasing data and model scales. This paper +proposes a novel privacy-preserving medical image classification framework +based on federated learning to address these issues, named FedMIC. The +framework enables healthcare organizations to learn from both global and local +knowledge, enhancing local representation of private data despite statistical +heterogeneity. It provides customized models for organizations with diverse +data distributions while minimizing communication overhead and improving +efficiency without compromising performance. Our FedMIC enhances robustness and +practical applicability under resource-constrained conditions. We demonstrate +FedMIC's effectiveness using four public medical image datasets for classical +medical image classification tasks. + +
+
+ comment: Work in progress. This paper is the first to introduce intra-client + knowledge distillation in the context of trustworthy medical image + classification. arXiv admin note: text overlap with arXiv:2401.01493 +
+
+
+
+
+ + ♻ ☆ EarthMatch: Iterative Coregistration for Fine-grained Localization of + Astronaut Photography CVPR 2024 + + +
+ Precise, pixel-wise geolocalization of astronaut photography is critical to +unlocking the potential of this unique type of remotely sensed Earth data, +particularly for its use in disaster management and climate change research. +Recent works have established the Astronaut Photography Localization task, but +have either proved too costly for mass deployment or generated too coarse a +localization. Thus, we present EarthMatch, an iterative homography estimation +method that produces fine-grained localization of astronaut photographs while +maintaining an emphasis on speed. We refocus the astronaut photography +benchmark, AIMS, on the geolocalization task itself, and prove our method's +efficacy on this dataset. In addition, we offer a new, fair method for image +matcher comparison, and an extensive evaluation of different matching models +within our localization pipeline. Our method will enable fast and accurate +localization of the 4.5 million and growing collection of astronaut photography +of Earth. Webpage with code and data at +https://earthloc-and-earthmatch.github.io + +
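+ <p>A minimal sketch of the iterate-warp-rematch idea behind iterative homography estimation, using off-the-shelf OpenCV components (ORB features, brute-force matching, RANSAC). EarthMatch plugs learned matchers into its pipeline, so this is only an illustration of the loop, and the two input file names are hypothetical.</p>
+ <pre><code>
+ import cv2
+ import numpy as np
+
+ def iterative_homography(query, reference, iterations=3):
+     """Refine a query -> reference homography by re-matching after each warp."""
+     H_total = np.eye(3)
+     orb = cv2.ORB_create(2000)
+     matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
+     warped = query.copy()
+     for _ in range(iterations):
+         kq, dq = orb.detectAndCompute(warped, None)
+         kr, dr = orb.detectAndCompute(reference, None)
+         if dq is None or dr is None:
+             break
+         matches = matcher.match(dq, dr)
+         if len(matches) < 4:
+             break
+         src = np.float32([kq[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
+         dst = np.float32([kr[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
+         H, _ = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)
+         if H is None:
+             break
+         H_total = H @ H_total
+         h, w = reference.shape[:2]
+         warped = cv2.warpPerspective(query, H_total, (w, h))
+     return H_total
+
+ query = cv2.imread("astronaut_photo.jpg", cv2.IMREAD_GRAYSCALE)         # hypothetical input
+ reference = cv2.imread("satellite_reference.jpg", cv2.IMREAD_GRAYSCALE) # hypothetical input
+ print(iterative_homography(query, reference))
+ </code></pre>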
+
+ comment: CVPR 2024 IMW - webpage: https://earthloc-and-earthmatch.github.io +
+
+
+
+
+ + ♻ ☆ BeNeRF: Neural Radiance Fields from a Single Blurry Image and Event + Stream ECCV 2024 + + +
+ Neural implicit representation of visual scenes has attracted a lot of attention in recent computer vision and graphics research. Most prior methods focus on how to reconstruct 3D scene representation from a set of images. In this work, we demonstrate the possibility to recover the neural radiance fields (NeRF) from a single blurry image and its corresponding event stream. We model the camera motion with a cubic B-Spline in SE(3) space. Both the blurry image and the brightness change within a time interval can then be synthesized from the 3D scene representation given the 6-DoF poses interpolated from the cubic B-Spline. Our method can jointly learn both the implicit neural scene representation and recover the camera motion by minimizing the differences between the synthesized data and the real measurements without pre-computed camera poses from COLMAP. We evaluate the proposed method with both synthetic and real datasets. The experimental results demonstrate that we are able to render view-consistent latent sharp images from the learned NeRF and bring a blurry image to life in high quality. Code and data are available at https://github.com/WU-CVGL/BeNeRF. + +
</p>
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ DialogGen: Multi-modal Interactive Dialogue System for Multi-turn + Text-to-Image Generation + + +
+ Text-to-image (T2I) generation models have significantly advanced in recent +years. However, effective interaction with these models is challenging for +average users due to the need for specialized prompt engineering knowledge and +the inability to perform multi-turn image generation, hindering a dynamic and +iterative creation process. Recent attempts have tried to equip Multi-modal +Large Language Models (MLLMs) with T2I models to bring the user's natural +language instructions into reality. Hence, the output modality of MLLMs is +extended, and the multi-turn generation quality of T2I models is enhanced +thanks to the strong multi-modal comprehension ability of MLLMs. However, many +of these works face challenges in identifying correct output modalities and +generating coherent images accordingly as the number of output modalities +increases and the conversations go deeper. Therefore, we propose DialogGen, an +effective pipeline to align off-the-shelf MLLMs and T2I models to build a +Multi-modal Interactive Dialogue System (MIDS) for multi-turn Text-to-Image +generation. It is composed of drawing prompt alignment, careful training data +curation, and error correction. Moreover, as the field of MIDS flourishes, +comprehensive benchmarks are urgently needed to evaluate MIDS fairly in terms +of output modality correctness and multi-modal output coherence. To address +this issue, we introduce the Multi-modal Dialogue Benchmark (DialogBen), a +comprehensive bilingual benchmark designed to assess the ability of MLLMs to +generate accurate and coherent multi-modal content that supports image editing. +It contains two evaluation metrics to measure the model's ability to switch +modalities and the coherence of the output images. Our extensive experiments on +DialogBen and user study demonstrate the effectiveness of DialogGen compared +with other State-of-the-Art models. + +
+
+ comment: Project page: https://hunyuan-dialoggen.github.io/ +
+
+
+
+
+ + ♻ ☆ ViG-Bias: Visually Grounded Bias Discovery and Mitigation ECCV 2024 + + +
+ The proliferation of machine learning models in critical decision making processes has underscored the need for bias discovery and mitigation strategies. Identifying the reasons behind a biased system is not straightforward, since on many occasions they are associated with hidden spurious correlations which are not easy to spot. Standard approaches rely on bias audits performed by analyzing model performance in pre-defined subgroups of data samples, usually characterized by common attributes like gender or ethnicity when it comes to people, or other specific attributes defining semantically coherent groups of images. However, it is not always possible to know a priori the specific attributes defining the failure modes of visual recognition systems. Recent approaches propose to discover these groups by leveraging large vision language models, which enable the extraction of cross-modal embeddings and the generation of textual descriptions to characterize the subgroups where a certain model is underperforming. In this work, we argue that incorporating visual explanations (e.g. heatmaps generated via GradCAM or other approaches) can boost the performance of such bias discovery and mitigation frameworks. To this end, we introduce Visually Grounded Bias Discovery and Mitigation (ViG-Bias), a simple yet effective technique which can be integrated into a variety of existing frameworks to improve both discovery and mitigation performance. Our comprehensive evaluation shows that incorporating visual explanations enhances existing techniques like DOMINO, FACTS and Bias-to-Text, across several challenging datasets, including CelebA, Waterbirds, and NICO++. + +
</p>
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Semantic Segmentation via Marginal Contextual + Information + + +
+ We present a novel confidence refinement scheme that enhances pseudo labels +in semi-supervised semantic segmentation. Unlike existing methods, which filter +pixels with low-confidence predictions in isolation, our approach leverages the +spatial correlation of labels in segmentation maps by grouping neighboring +pixels and considering their pseudo labels collectively. With this contextual +information, our method, named S4MC, increases the amount of unlabeled data +used during training while maintaining the quality of the pseudo labels, all +with negligible computational overhead. Through extensive experiments on +standard benchmarks, we demonstrate that S4MC outperforms existing +state-of-the-art semi-supervised learning approaches, offering a promising +solution for reducing the cost of acquiring dense annotations. For example, +S4MC achieves a 1.39 mIoU improvement over the prior art on PASCAL VOC 12 with +366 annotated images. The code to reproduce our experiments is available at +https://s4mcontext.github.io/ + +
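+ <p>A minimal sketch of the general idea, not the exact S4MC formulation: per-pixel confidence is refined by aggregating class probabilities over a small spatial neighborhood before thresholding pseudo labels, so borderline pixels whose neighbors agree with them are kept. The kernel size, mixing weight, and threshold are illustrative assumptions.</p>
+ <pre><code>
+ import torch
+ import torch.nn.functional as F
+
+ def contextual_pseudo_labels(logits, threshold=0.95, kernel=3):
+     """logits: (B, C, H, W) predictions on unlabeled images."""
+     probs = logits.softmax(dim=1)
+     # smooth each class map so a pixel's confidence also reflects neighboring agreement
+     context = F.avg_pool2d(probs, kernel, stride=1, padding=kernel // 2)
+     refined = 0.5 * (probs + context)
+     conf, pseudo = refined.max(dim=1)
+     mask = conf > threshold            # pixels contributing to the unsupervised loss
+     return pseudo, mask
+
+ logits = torch.randn(2, 21, 64, 64)    # e.g. 21 classes for PASCAL VOC
+ pseudo, mask = contextual_pseudo_labels(logits)
+ print(mask.float().mean().item())      # fraction of pixels retained
+ </code></pre>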
+
+ comment: Published at TMLR +
+
+
+
+
+ + ♻ ☆ Self-distilled Masked Attention guided masked image modeling with noise + Regularized Teacher (SMART) for medical image analysis + + +
+ Pretraining vision transformers (ViT) with attention-guided masked image modeling (MIM) has been shown to increase downstream accuracy for natural image analysis. The hierarchical shifted window (Swin) transformer, often used in medical image analysis, cannot use attention-guided masking as it lacks an explicit [CLS] token, which is needed for computing attention maps for selective masking. We thus enhanced Swin with semantic class attention. We developed a co-distilled Swin transformer that combines a noisy momentum-updated teacher to guide selective masking for MIM. Our approach, called SeMantic Attention guided co-distillation with noisy teacher Regularized Swin TransFormer (SMARTFormer), was applied for analyzing 3D computed tomography datasets with lung nodules and malignant lung cancers (LC). We also analyzed the impact of semantic attention and the noisy teacher on pretraining and downstream accuracy. SMARTFormer classified lesions (malignant from benign) with a high accuracy of 0.895 on 1000 nodules, predicted LC treatment response with an accuracy of 0.74, and achieved high accuracies even in limited data regimes. Pretraining with semantic attention and a noisy teacher improved the ability to distinguish semantically meaningful structures such as organs in an unsupervised clustering task and to localize abnormal structures like tumors. Code and models will be made available through GitHub upon paper acceptance. + +
</p>
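+ <p>A minimal sketch of attention-guided masking for MIM in a transformer that does expose a [CLS] token: the class-token attention row ranks patches, and the highest-attention patches are chosen for masking. This illustrates the mechanism that semantic class attention is meant to enable in Swin; it is not the SMARTFormer implementation, and the mask ratio is an assumption.</p>
+ <pre><code>
+ import torch
+
+ def attention_guided_mask(attn, mask_ratio=0.5):
+     """attn: (B, heads, N + 1, N + 1) self-attention with the [CLS] token at index 0.
+     Returns a boolean (B, N) mask marking the patches selected for masking."""
+     cls_to_patches = attn[:, :, 0, 1:].mean(dim=1)      # (B, N): [CLS] attention to patches
+     b, n = cls_to_patches.shape
+     n_mask = int(mask_ratio * n)
+     idx = cls_to_patches.topk(n_mask, dim=1).indices    # most semantically salient patches
+     mask = torch.zeros(b, n, dtype=torch.bool)
+     mask.scatter_(1, idx, True)
+     return mask
+
+ attn = torch.rand(2, 12, 197, 197).softmax(dim=-1)      # ViT-B style attention, 14 x 14 patches
+ print(attention_guided_mask(attn).sum(dim=1))           # masked patches per image
+ </code></pre>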
+
+ comment: Paper is under review at TMI +
+
+
+
+
+ + ♻ ☆ Unsupervised Latent Stain Adaptation for Computational Pathology MICCAI2024 + + +
+ In computational pathology, deep learning (DL) models for tasks such as +segmentation or tissue classification are known to suffer from domain shifts +due to different staining techniques. Stain adaptation aims to reduce the +generalization error between different stains by training a model on source +stains that generalizes to target stains. Despite the abundance of target stain +data, a key challenge is the lack of annotations. To address this, we propose a +joint training between artificially labeled and unlabeled data including all +available stained images called Unsupervised Latent Stain Adaptation (ULSA). +Our method uses stain translation to enrich labeled source images with +synthetic target images in order to increase the supervised signals. Moreover, +we leverage unlabeled target stain images using stain-invariant feature +consistency learning. With ULSA we present a semi-supervised strategy for +efficient stain adaptation without access to annotated target stain data. +Remarkably, ULSA is task agnostic in patch-level analysis for whole slide +images (WSIs). Through extensive evaluation on external datasets, we +demonstrate that ULSA achieves state-of-the-art (SOTA) performance in kidney +tissue segmentation and breast cancer classification across a spectrum of +staining variations. Our findings suggest that ULSA is an important framework +for stain adaptation in computational pathology. + +
+
+ comment: Accepted MICCAI2024 +
+
+
+
+
+ + ♻ ☆ Mixture-of-Experts for Open Set Domain Adaptation: A Dual-Space + Detection Approach + + +
+ Open Set Domain Adaptation (OSDA) aims to cope with the distribution and +label shifts between the source and target domains simultaneously, performing +accurate classification for known classes while identifying unknown class +samples in the target domain. Most existing OSDA approaches, depending on the +final image feature space of deep models, require manually-tuned thresholds, +and may easily misclassify unknown samples as known classes. Mixture-of-Experts +(MoE) could be a remedy. Within a MoE, different experts handle distinct input +features, producing unique expert routing patterns for various classes in a +routing feature space. As a result, unknown class samples may display different +expert routing patterns to known classes. In this paper, we propose Dual-Space +Detection, which exploits the inconsistencies between the image feature space +and the routing feature space to detect unknown class samples without any +threshold. Graph Router is further introduced to better make use of the spatial +information among image patches. Experiments on three different datasets +validated the effectiveness and superiority of our approach. + +
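+ <p>A minimal sketch of one way to use two feature spaces for threshold-free rejection: a sample is flagged as unknown when its nearest known-class prototype in the image feature space disagrees with the one in the expert-routing feature space. The nearest-prototype rule and the dimensionalities are illustrative assumptions, not the paper's exact detection criterion.</p>
+ <pre><code>
+ import numpy as np
+
+ def dual_space_unknown(img_feat, route_feat, img_protos, route_protos):
+     """Flag a sample as unknown if the two spaces point to different known classes."""
+     img_pred = np.argmin(np.linalg.norm(img_protos - img_feat, axis=1))
+     route_pred = np.argmin(np.linalg.norm(route_protos - route_feat, axis=1))
+     return img_pred != route_pred, img_pred
+
+ rng = np.random.default_rng(0)
+ img_protos = rng.normal(size=(10, 256))    # known-class prototypes in the image feature space
+ route_protos = rng.normal(size=(10, 64))   # known-class prototypes in the routing feature space
+ is_unknown, pred = dual_space_unknown(rng.normal(size=256), rng.normal(size=64),
+                                       img_protos, route_protos)
+ print(is_unknown, pred)
+ </code></pre>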
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ SparseSSP: 3D Subcellular Structure Prediction from Sparse-View + Transmitted Light Images ECCV2024 + + +
+ Traditional fluorescence staining is phototoxic to live cells, slow, and expensive; thus, subcellular structure prediction (SSP) from transmitted light (TL) images is emerging as a label-free, faster, low-cost alternative. However, existing approaches utilize 3D networks for one-to-one voxel-level dense prediction, which necessitates a frequent and time-consuming Z-axis imaging process. Moreover, 3D convolutions inevitably lead to significant computation and GPU memory overhead. Therefore, we propose an efficient framework, SparseSSP, which predicts fluorescent intensities within the target voxel grid in an efficient paradigm instead of relying entirely on 3D topologies. In particular, SparseSSP makes two pivotal improvements over prior works. First, SparseSSP introduces a one-to-many voxel mapping paradigm, which permits the sparse TL slices to reconstruct the subcellular structure. Second, we propose a hybrid-dimension topology, which folds the Z-axis information into channel features, enabling 2D network layers to tackle SSP at low computational cost. We conduct extensive experiments to validate the effectiveness and advantages of SparseSSP on diverse sparse imaging ratios, and our approach achieves leading performance compared to pure 3D topologies. SparseSSP reduces imaging frequencies compared to previous dense-view SSP (i.e., the number of imaged slices is reduced by up to 87.5%), which is significant for visualizing rapid biological dynamics on low-cost devices and samples. + +
</p>
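+ <p>A minimal sketch of the "fold the Z-axis into channel features" idea: a sparse stack of TL slices is viewed as extra input channels so that cheap 2D convolutions can process it. The layer widths are arbitrary, and the output here is a single 2D map rather than the full voxel grid SparseSSP predicts.</p>
+ <pre><code>
+ import torch
+ import torch.nn as nn
+
+ class FoldZInto2D(nn.Module):
+     """Treat a sparse stack of Z slices as input channels for a small 2D network."""
+     def __init__(self, z_slices, hidden=64, out_channels=1):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Conv2d(z_slices, hidden, 3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(hidden, hidden, 3, padding=1), nn.ReLU(inplace=True),
+             nn.Conv2d(hidden, out_channels, 1),
+         )
+
+     def forward(self, volume):                 # volume: (B, 1, Z, H, W)
+         b, _, z, h, w = volume.shape
+         folded = volume.view(b, z, h, w)       # Z-axis folded into the channel dimension
+         return self.net(folded)                # (B, out_channels, H, W)
+
+ model = FoldZInto2D(z_slices=4)
+ print(model(torch.rand(2, 1, 4, 128, 128)).shape)
+ </code></pre>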
+
+ comment: Accepted to ECCV2024 +
</p>
+
+
+
+
+ + ♻ ☆ I-MedSAM: Implicit Medical Image Segmentation with Segment Anything ECCV2024 + + +
+ With the development of Deep Neural Networks (DNNs), many efforts have been +made to handle medical image segmentation. Traditional methods such as nnUNet +train specific segmentation models on the individual datasets. Plenty of recent +methods have been proposed to adapt the foundational Segment Anything Model +(SAM) to medical image segmentation. However, they still focus on discrete +representations to generate pixel-wise predictions, which are spatially +inflexible and scale poorly to higher resolution. In contrast, implicit methods +learn continuous representations for segmentation, which is crucial for medical +image segmentation. In this paper, we propose I-MedSAM, which leverages the +benefits of both continuous representations and SAM, to obtain better +cross-domain ability and accurate boundary delineation. Since medical image +segmentation needs to predict detailed segmentation boundaries, we designed a +novel adapter to enhance the SAM features with high-frequency information +during Parameter-Efficient Fine-Tuning (PEFT). To convert the SAM features and +coordinates into continuous segmentation output, we utilize Implicit Neural +Representation (INR) to learn an implicit segmentation decoder. We also propose +an uncertainty-guided sampling strategy for efficient learning of INR. +Extensive evaluations on 2D medical image segmentation tasks have shown that +our proposed method with only 1.6M trainable parameters outperforms existing +methods including discrete and implicit methods. The code will be available at: +https://github.com/ucwxb/I-MedSAM. + +
+
+ comment: accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ Defect Spectrum: A Granular Look of Large-Scale Defect Datasets with + Rich Semantics ECCV2024 + + +
+ Defect inspection is paramount within the closed-loop manufacturing system. +However, existing datasets for defect inspection often lack precision and +semantic granularity required for practical applications. In this paper, we +introduce the Defect Spectrum, a comprehensive benchmark that offers precise, +semantic-abundant, and large-scale annotations for a wide range of industrial +defects. Building on four key industrial benchmarks, our dataset refines +existing annotations and introduces rich semantic details, distinguishing +multiple defect types within a single image. Furthermore, we introduce +Defect-Gen, a two-stage diffusion-based generator designed to create +high-quality and diverse defective images, even when working with limited +datasets. The synthetic images generated by Defect-Gen significantly enhance +the efficacy of defect inspection models. Overall, The Defect Spectrum dataset +demonstrates its potential in defect inspection research, offering a solid +platform for testing and refining advanced models. + +
+
+ comment: Accepted by ECCV2024. Please see our project page at + https://envision-research.github.io/Defect_Spectrum/ +
+
+
+
+
+ + ♻ ☆ BLINK: Multimodal Large Language Models Can See but Not Perceive ECCV 2024 + + +
+ We introduce Blink, a new benchmark for multimodal language models (LLMs) +that focuses on core visual perception abilities not found in other +evaluations. Most of the Blink tasks can be solved by humans "within a blink" +(e.g., relative depth estimation, visual correspondence, forensics detection, +and multi-view reasoning). However, we find these perception-demanding tasks +cast significant challenges for current multimodal LLMs because they resist +mediation through natural language. Blink reformats 14 classic computer vision +tasks into 3,807 multiple-choice questions, paired with single or multiple +images and visual prompting. While humans get 95.70% accuracy on average, Blink +is surprisingly challenging for existing multimodal LLMs: even the +best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only +13.17% and 7.63% higher than random guessing, indicating that such perception +abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also +highlights that specialist CV models could solve these problems much better, +suggesting potential pathways for future improvements. We believe Blink will +stimulate the community to help multimodal LLMs catch up with human-level +visual perception. + +
+
+ comment: Multimodal Benchmark, Project Url: https://zeyofu.github.io/blink/, + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Investigating Event-Based Cameras for Video Frame Interpolation in + Sports + + +
+ Slow-motion replays provide a thrilling perspective on pivotal moments within +sports games, offering a fresh and captivating visual experience. However, +capturing slow-motion footage typically demands high-tech, expensive cameras +and infrastructures. Deep learning Video Frame Interpolation (VFI) techniques +have emerged as a promising avenue, capable of generating high-speed footage +from regular camera feeds. Moreover, the utilization of event-based cameras has +recently gathered attention as they provide valuable motion information between +frames, further enhancing the VFI performances. In this work, we present a +first investigation of event-based VFI models for generating sports slow-motion +videos. Particularly, we design and implement a bi-camera recording setup, +including an RGB and an event-based camera to capture sports videos, to +temporally align and spatially register both cameras. Our experimental +validation demonstrates that TimeLens, an off-the-shelf event-based VFI model, +can effectively generate slow-motion footage for sports videos. This first +investigation underscores the practical utility of event-based cameras in +producing sports slow-motion content and lays the groundwork for future +research endeavors in this domain. + +
+
+
+
+
+ + ♻ ☆ Explainable AI for Safe and Trustworthy Autonomous Driving: A Systematic + Review + + +
+ Artificial Intelligence (AI) shows promising applications for the perception +and planning tasks in autonomous driving (AD) due to its superior performance +compared to conventional methods. However, inscrutable AI systems exacerbate +the existing challenge of safety assurance of AD. One way to mitigate this +challenge is to utilize explainable AI (XAI) techniques. To this end, we +present the first comprehensive systematic literature review of explainable +methods for safe and trustworthy AD. We begin by analyzing the requirements for +AI in the context of AD, focusing on three key aspects: data, model, and +agency. We find that XAI is fundamental to meeting these requirements. Based on +this, we explain the sources of explanations in AI and describe a taxonomy of +XAI. We then identify five key contributions of XAI for safe and trustworthy AI +in AD, which are interpretable design, interpretable surrogate models, +interpretable monitoring, auxiliary explanations, and interpretable validation. +Finally, we propose a modular framework called SafeX to integrate these +contributions, enabling explanation delivery to users while simultaneously +ensuring the safety of AI models. + +
+
+
+
+
+ + ♻ ☆ Nuisances via Negativa: Adjusting for Spurious Correlations via Data + Augmentation + + +
+ In prediction tasks, there exist features that are related to the label in the same way across different settings for that task; these are semantic features or semantics. Features with varying relationships to the label are nuisances. For example, in detecting cows from natural images, the shape of the head is semantic, but because images of cows often, though not always, have grass backgrounds, the background is a nuisance. Models that exploit nuisance-label relationships face performance degradation when these relationships change. Building models robust to such changes requires additional knowledge beyond samples of the features and labels. For example, existing work uses annotations of nuisances or assumes ERM-trained models depend on nuisances. Approaches that integrate new kinds of additional knowledge enlarge the settings where robust models can be built. We develop an approach that uses knowledge about the semantics by corrupting them in data, and then using the corrupted data to produce models which identify correlations between nuisances and the label. Once these correlations are identified, they can be used to adjust for where nuisances drive predictions. We study semantic corruptions in powering different spurious-correlation avoiding methods on multiple out-of-distribution (OOD) tasks like classifying waterbirds, natural language inference (NLI), and detecting cardiomegaly in chest X-rays. + +
</p>
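+ <p>A minimal sketch of one possible semantic corruption, patch shuffling: permuting image patches destroys shape semantics (e.g., the cow's head) while largely preserving nuisances such as background color and texture statistics, so a model trained on shuffled data mostly captures nuisance-label correlations. The patch size is an illustrative assumption.</p>
+ <pre><code>
+ import torch
+
+ def patch_shuffle(images, patch=32):
+     """Randomly permute the non-overlapping patches of each image in a (B, C, H, W) batch."""
+     b, c, h, w = images.shape
+     gh, gw = h // patch, w // patch
+     x = images.reshape(b, c, gh, patch, gw, patch)
+     x = x.permute(0, 2, 4, 1, 3, 5).reshape(b, gh * gw, c, patch, patch)
+     for i in range(b):
+         x[i] = x[i][torch.randperm(gh * gw)]   # independent shuffle per image
+     x = x.reshape(b, gh, gw, c, patch, patch).permute(0, 3, 1, 4, 2, 5)
+     return x.reshape(b, c, h, w)
+
+ corrupted = patch_shuffle(torch.rand(4, 3, 224, 224))
+ print(corrupted.shape)
+ </code></pre>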
+
+
+
+
+ + ♻ ☆ STAR: A First-Ever Dataset and A Large-Scale Benchmark for Scene Graph + Generation in Large-Size Satellite Imagery + + +
+ Scene graph generation (SGG) in satellite imagery (SAI) facilitates the understanding of geospatial scenarios from perception to cognition. In SAI, objects exhibit great variations in scales and aspect ratios, and there exist rich relationships between objects (even between spatially disjoint objects), which makes it attractive to holistically conduct SGG in large-size very-high-resolution (VHR) SAI. However, such SGG datasets have been lacking. Due to the complexity of large-size SAI, mining triplets heavily relies on long-range contextual reasoning. Consequently, SGG models designed for small-size natural imagery are not directly applicable to large-size SAI. This paper constructs a large-scale dataset for SGG in large-size VHR SAI with image sizes ranging from 512 x 768 to 27,860 x 31,096 pixels, named STAR (Scene graph generaTion in lArge-size satellite imageRy), encompassing over 210K objects and over 400K triplets. To realize SGG in large-size SAI, we propose a context-aware cascade cognition (CAC) framework to understand SAI in terms of object detection (OBD), pair pruning and relationship prediction for SGG. We also release a SAI-oriented SGG toolkit with about 30 OBD and 10 SGG methods, which require further adaptation via our devised modules for our challenging STAR dataset. The dataset and toolkit are available at: https://linlin-dev.github.io/project/STAR. + +
</p>
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ EVF-SAM: Early Vision-Language Fusion for Text-Prompted Segment Anything + Model + + +
+ Segment Anything Model (SAM) has attracted widespread attention for its +superior interactive segmentation capabilities with visual prompts while +lacking further exploration of text prompts. In this paper, we empirically +investigate what text prompt encoders (e.g., CLIP or LLM) are good for adapting +SAM for referring expression segmentation and introduce the Early +Vision-language Fusion-based SAM (EVF-SAM). EVF-SAM is a simple yet effective +referring segmentation method which exploits multimodal prompts (i.e., image +and text) and comprises a pre-trained vision-language model to generate +referring prompts and a SAM model for segmentation. Surprisingly, we observe +that: (1) multimodal prompts and (2) vision-language models with early fusion +(e.g., BEIT-3) are beneficial for prompting SAM for accurate referring +segmentation. Our experiments show that the proposed EVF-SAM based on BEIT-3 +can obtain state-of-the-art performance on RefCOCO/+/g for referring expression +segmentation and demonstrate the superiority of prompting SAM with early +vision-language fusion. In addition, the proposed EVF-SAM with 1.32B parameters +achieves remarkably higher performance while reducing nearly 82% of parameters +compared to previous SAM methods based on large multimodal models. + +
+
+ comment: Preprint. Code and models are available at: + https://github.com/hustvl/EVF-SAM +
+
+
+
+
+ + ♻ ☆ RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and + Efficiency Assessment of Medical Image Segmentation Models + + +
+ Deep learning techniques hold immense promise for advancing medical image +analysis, particularly in tasks like image segmentation, where precise +annotation of regions or volumes of interest within medical images is crucial +but manually laborious and prone to interobserver and intraobserver biases. As +such, deep learning approaches could provide automated solutions for such +applications. However, the potential of these techniques is often undermined by +challenges in reproducibility and generalizability, which are key barriers to +their clinical adoption. This paper introduces the RIDGE checklist, a +comprehensive framework designed to assess the Reproducibility, Integrity, +Dependability, Generalizability, and Efficiency of deep learning-based medical +image segmentation models. The RIDGE checklist is not just a tool for +evaluation but also a guideline for researchers striving to improve the quality +and transparency of their work. By adhering to the principles outlined in the +RIDGE checklist, researchers can ensure that their developed segmentation +models are robust, scientifically valid, and applicable in a clinical setting. + +
+
+ comment: 24 pages, 1 Figure, 2 Tables +
</p>
+
+
+
+
+ + ♻ ☆ How Deep Neural Networks Learn Compositional Data: The Random Hierarchy + Model + + +
+ Deep learning algorithms demonstrate a surprising ability to learn +high-dimensional tasks from limited examples. This is commonly attributed to +the depth of neural networks, enabling them to build a hierarchy of abstract, +low-dimensional data representations. However, how many training examples are +required to learn such representations remains unknown. To quantitatively study +this question, we introduce the Random Hierarchy Model: a family of synthetic +tasks inspired by the hierarchical structure of language and images. The model +is a classification task where each class corresponds to a group of high-level +features, chosen among several equivalent groups associated with the same +class. In turn, each feature corresponds to a group of sub-features chosen +among several equivalent ones and so on, following a hierarchy of composition +rules. We find that deep networks learn the task by developing internal +representations invariant to exchanging equivalent groups. Moreover, the number +of data required corresponds to the point where correlations between low-level +features and classes become detectable. Overall, our results indicate how deep +networks overcome the curse of dimensionality by building invariant +representations, and provide an estimate of the number of data required to +learn a hierarchical task. + +
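+ <p>A minimal sketch of a generative process in the spirit of the Random Hierarchy Model: each class expands into one of several equivalent tuples of high-level features, and every feature expands recursively through randomly drawn composition rules until the leaf-level input is produced. The vocabulary size, branching factor, number of equivalent productions, and depth below are illustrative choices, not the paper's settings.</p>
+ <pre><code>
+ import random
+
+ def build_rules(vocab, n_classes, n_equiv, branching, depth):
+     """At each level, map every symbol to n_equiv equivalent tuples of sub-symbols."""
+     rules, n_symbols = [], n_classes
+     for _ in range(depth):
+         level = {s: [tuple(random.choices(range(vocab), k=branching))
+                      for _ in range(n_equiv)] for s in range(n_symbols)}
+         rules.append(level)
+         n_symbols = vocab
+     return rules
+
+ def sample(label, rules):
+     """Expand a class label down the hierarchy into a string of low-level features."""
+     symbols = [label]
+     for level in rules:
+         symbols = [s for sym in symbols for s in random.choice(level[sym])]
+     return symbols
+
+ random.seed(0)
+ rules = build_rules(vocab=8, n_classes=4, n_equiv=2, branching=2, depth=3)
+ print(sample(label=1, rules=rules))    # one leaf-level input belonging to class 1
+ </code></pre>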
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Systematic Performance Analysis of Deep Perceptual Loss Networks: + Breaking Transfer Learning Conventions + + +
+ In recent years, deep perceptual loss has been widely and successfully used +to train machine learning models for many computer vision tasks, including +image synthesis, segmentation, and autoencoding. Deep perceptual loss is a type +of loss function for images that computes the error between two images as the +distance between deep features extracted from a neural network. Most +applications of the loss use pretrained networks called loss networks for deep +feature extraction. However, despite increasingly widespread use, the effects +of loss network implementation on the trained models have not been studied. + This work rectifies this through a systematic evaluation of the effect of +different pretrained loss networks on four different application areas. +Specifically, the work evaluates 14 different pretrained architectures with +four different feature extraction layers. The evaluation reveals that VGG +networks without batch normalization have the best performance and that the +choice of feature extraction layer is at least as important as the choice of +architecture. The analysis also reveals that deep perceptual loss does not +adhere to the transfer learning conventions that better ImageNet accuracy +implies better downstream performance and that feature extraction from the +later layers provides better performance. + +
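+ <p>A minimal sketch of a deep perceptual loss with a pretrained torchvision VGG16 as the loss network, truncated at an arbitrary intermediate layer; the cutoff index and the plain MSE over features are illustrative choices, and ImageNet input normalization is omitted for brevity. The paper's systematic comparison across 14 architectures and four extraction layers is not reproduced here.</p>
+ <pre><code>
+ import torch
+ import torch.nn.functional as F
+ from torchvision.models import vgg16, VGG16_Weights
+
+ class PerceptualLoss(torch.nn.Module):
+     def __init__(self, cutoff=16):   # index into vgg16().features (an assumption)
+         super().__init__()
+         # downloads ImageNet weights on first use
+         self.extractor = vgg16(weights=VGG16_Weights.IMAGENET1K_V1).features[:cutoff].eval()
+         for p in self.extractor.parameters():
+             p.requires_grad_(False)  # the loss network stays frozen
+
+     def forward(self, prediction, target):
+         # error between two images = distance between their deep features
+         return F.mse_loss(self.extractor(prediction), self.extractor(target))
+
+ loss_fn = PerceptualLoss()
+ x, y = torch.rand(1, 3, 224, 224), torch.rand(1, 3, 224, 224)
+ print(loss_fn(x, y).item())
+ </code></pre>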
+
+
+
+
+ + ♻ ☆ Character-Adapter: Prompt-Guided Region Control for High-Fidelity + Character Customization + + +
+ Customized image generation, which seeks to synthesize images with consistent +characters, holds significant relevance for applications such as storytelling, +portrait generation, and character design. However, previous approaches have +encountered challenges in preserving characters with high-fidelity consistency +due to inadequate feature extraction and concept confusion of reference +characters. Therefore, we propose Character-Adapter, a plug-and-play framework +designed to generate images that preserve the details of reference characters, +ensuring high-fidelity consistency. Character-Adapter employs prompt-guided +segmentation to ensure fine-grained regional features of reference characters +and dynamic region-level adapters to mitigate concept confusion. Extensive +experiments are conducted to validate the effectiveness of Character-Adapter. +Both quantitative and qualitative results demonstrate that Character-Adapter +achieves the state-of-the-art performance of consistent character generation, +with an improvement of 24.8% compared with other methods. Our code will be +released at https://github.com/Character-Adapter/Character-Adapter + +
+
+
+
+
+ + ♻ ☆ PaPr: Training-Free One-Step Patch Pruning with Lightweight ConvNets for + Faster Inference ECCV 2024 + + +
+ As deep neural networks evolve from convolutional neural networks (ConvNets) +to advanced vision transformers (ViTs), there is an increased need to eliminate +redundant data for faster processing without compromising accuracy. Previous +methods are often architecture-specific or necessitate re-training, restricting +their applicability with frequent model updates. To solve this, we first +introduce a novel property of lightweight ConvNets: their ability to identify +key discriminative patch regions in images, irrespective of model's final +accuracy or size. We demonstrate that fully-connected layers are the primary +bottleneck for ConvNets performance, and their suppression with simple weight +recalibration markedly enhances discriminative patch localization performance. +Using this insight, we introduce PaPr, a method for substantially pruning +redundant patches with minimal accuracy loss using lightweight ConvNets across +a variety of deep learning architectures, including ViTs, ConvNets, and hybrid +transformers, without any re-training. Moreover, the simple early-stage +one-step patch pruning with PaPr enhances existing patch reduction methods. +Through extensive testing on diverse architectures, PaPr achieves significantly +higher accuracy over state-of-the-art patch reduction methods with similar FLOP +count reduction. More specifically, PaPr reduces about 70% of redundant patches +in videos with less than 0.8% drop in accuracy, and up to 3.7x FLOPs reduction, +which is a 15% more reduction with 2.5% higher accuracy. Code is released at +https://github.com/tanvir-utexas/PaPr. + +
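+ <p>A minimal sketch of the patch-selection step: a lightweight ConvNet's spatial activations are pooled into a per-patch saliency score on the ViT grid, and only the top-scoring tokens are kept, with no re-training of either model. The score definition, keep ratio, and tensor shapes are illustrative assumptions and do not include the fully-connected-layer recalibration that PaPr describes.</p>
+ <pre><code>
+ import torch
+ import torch.nn.functional as F
+
+ def select_patches(conv_features, patch_tokens, keep_ratio=0.3):
+     """conv_features: (B, C, h, w) from a lightweight ConvNet.
+     patch_tokens: (B, N, D) ViT tokens on a g x g grid, N = g * g."""
+     b, n, d = patch_tokens.shape
+     g = int(n ** 0.5)
+     score = conv_features.mean(dim=1, keepdim=True)                        # (B, 1, h, w) saliency
+     score = F.interpolate(score, size=(g, g), mode="bilinear").flatten(1)  # (B, N) on the ViT grid
+     keep = max(1, int(keep_ratio * n))
+     idx = score.topk(keep, dim=1).indices                                  # retained patch indices
+     return patch_tokens.gather(1, idx.unsqueeze(-1).expand(-1, -1, d))     # (B, keep, D)
+
+ tokens = torch.randn(2, 196, 768)            # 14 x 14 ViT patch tokens
+ feats = torch.randn(2, 32, 7, 7)             # lightweight ConvNet feature map
+ print(select_patches(feats, tokens).shape)   # torch.Size([2, 58, 768])
+ </code></pre>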
+
+ comment: Accepted in ECCV 2024. Code: https://github.com/tanvir-utexas/PaPr +
+
+
+
+
+ + ♻ ☆ Self-Supervised Detection of Perfect and Partial Input-Dependent + Symmetries + + +
+ Group equivariance can overly constrain models if the symmetries in the group +differ from those observed in data. While common methods address this by +determining the appropriate level of symmetry at the dataset level, they are +limited to supervised settings and ignore scenarios in which multiple levels of +symmetry co-exist in the same dataset. In this paper, we propose a method able +to detect the level of symmetry of each input without the need for labels. Our +framework is general enough to accommodate different families of both +continuous and discrete symmetry distributions, such as arbitrary unimodal, +symmetric distributions and discrete groups. We validate the effectiveness of +our approach on synthetic datasets with different per-class levels of +symmetries, and demonstrate practical applications such as the detection of +out-of-distribution symmetries. Our code is publicly available at +https://github.com/aurban0/ssl-sym. + +
+
+
+
+
+ + ♻ ☆ FedIA: Federated Medical Image Segmentation with Heterogeneous + Annotation Completeness MICCAI 2024 + + +
+ Federated learning has emerged as a compelling paradigm for medical image +segmentation, particularly in light of increasing privacy concerns. However, +most of the existing research relies on relatively stringent assumptions +regarding the uniformity and completeness of annotations across clients. +Contrary to this, this paper highlights a prevalent challenge in medical +practice: incomplete annotations. Such annotations can introduce incorrectly +labeled pixels, potentially undermining the performance of neural networks in +supervised learning. To tackle this issue, we introduce a novel solution, named +FedIA. Our insight is to conceptualize incomplete annotations as noisy data +(i.e., low-quality data), with a focus on mitigating their adverse effects. We +begin by evaluating the completeness of annotations at the client level using a +designed indicator. Subsequently, we enhance the influence of clients with more +comprehensive annotations and implement corrections for incomplete ones, +thereby ensuring that models are trained on accurate data. Our method's +effectiveness is validated through its superior performance on two extensively +used medical image segmentation datasets, outperforming existing solutions. The +code is available at https://github.com/HUSTxyy/FedIA. + +
+
+ comment: Early accepted by MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Meerkat: Audio-Visual Large Language Model for Grounding in Space and + Time ECCV 2024 + + +
+ Leveraging Large Language Models' remarkable proficiency in text-based tasks, +recent works on Multi-modal LLMs (MLLMs) extend them to other modalities like +vision and audio. However, the progress in these directions has been mostly +focused on tasks that only require a coarse-grained understanding of the +audio-visual semantics. We present Meerkat, an audio-visual LLM equipped with a +fine-grained understanding of image and audio both spatially and temporally. +With a new modality alignment module based on optimal transport and a +cross-attention module that enforces audio-visual consistency, Meerkat can +tackle challenging tasks such as audio referred image grounding, image guided +audio temporal localization, and audio-visual fact-checking. Moreover, we +carefully curate a large dataset AVFIT that comprises 3M instruction tuning +samples collected from open-source datasets, and introduce MeerkatBench that +unifies five challenging audio-visual tasks. We achieve state-of-the-art +performance on all these downstream tasks with a relative improvement of up to +37.12%. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Language-Guided Face Animation by Recurrent StyleGAN-based Generator + + +
+ Recent works on language-guided image manipulation have shown the great power of +language in providing rich semantics, especially for face images. However, other +natural information in language, such as motion, is less explored. In this +paper, we leverage the motion information and study a novel task, +language-guided face animation, which aims to animate a static face image with +the help of language. To better utilize both semantics and motion from +language, we propose a simple yet effective framework. Specifically, we +propose a recurrent motion generator to extract a series of semantic and motion +information from the language and feed it along with visual information to a +pre-trained StyleGAN to generate high-quality frames. To optimize the proposed +framework, we design three loss functions, including a +regularization loss to keep the face identity, a path length regularization +loss to ensure motion smoothness, and a contrastive loss to enable video +synthesis with various language guidance in one single model. Extensive +experiments with both qualitative and quantitative evaluations on diverse +domains (\textit{e.g.,} human face, anime face, and dog face) demonstrate the +superiority of our model in generating high-quality and realistic videos from +one still image with the guidance of language. Code will be available at +https://github.com/TiankaiHang/language-guided-animation.git. + +
+
+
+
+
+ + ♻ ☆ Chemical Shift Encoding based Double Bonds Quantification in + Triglycerides using Deep Image Prior + + +
+ This study evaluated a deep learning-based method using Deep Image Prior +(DIP) to quantify triglyceride double bonds from chemical-shift encoded +multi-echo gradient echo images without network training. We employed a cost +function based on signal constraints to iteratively update the neural network +on a single dataset. The method was validated using phantom experiments and in +vivo scans. Results showed close alignment between measured and reference +double bond values, with phantom experiments yielding a Pearson correlation +coefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in +subcutaneous fat. We conclude that Deep Image Prior shows feasibility for +quantifying double bonds and fatty acid content from chemical-shift encoded +multi-echo MRI. + +
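+ The Deep Image Prior recipe referenced above (iteratively fitting an untrained network to a single acquisition, with the loss encoding the signal model) follows a well-known generic pattern; below is a minimal, hedged PyTorch sketch in which net, loss_fn, and the fixed random code are placeholders, and the multi-echo MR signal model itself is not reproduced.
+
+import torch
+
+def fit_deep_image_prior(measurement, net, loss_fn, steps=2000, lr=1e-2):
+    """Generic Deep Image Prior loop: fit an untrained network to one measurement.
+
+    `net` maps a fixed random code to an image-shaped output; no training data is
+    used beyond the single measurement. `loss_fn` would encode the signal-model
+    constraints (left abstract here), e.g. a chemical-shift encoded multi-echo
+    signal model in the double-bond quantification setting described above.
+    """
+    z = torch.randn(1, 32, *measurement.shape[-2:])   # fixed random input code (assumed 32 channels)
+    opt = torch.optim.Adam(net.parameters(), lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        pred = net(z)                       # current image / parameter-map estimate
+        loss = loss_fn(pred, measurement)   # data-consistency + signal constraints
+        loss.backward()
+        opt.step()
+    return net(z).detach()
+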
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion Sampling with Optimized Time Steps CVPR 2024 + + +
+ Diffusion probabilistic models (DPMs) have shown remarkable performance in +high-resolution image synthesis, but their sampling efficiency still leaves much to be +desired due to the typically large number of sampling steps. Recent +advancements in high-order numerical ODE solvers for DPMs have enabled the +generation of high-quality images with far fewer sampling steps. While this is +a significant development, most sampling methods still employ uniform time +steps, which is not optimal when using a small number of steps. To address this +issue, we propose a general framework for designing an optimization problem +that seeks more appropriate time steps for a specific numerical ODE solver for +DPMs. This optimization problem aims to minimize the distance between the +ground-truth solution to the ODE and an approximate solution corresponding to +the numerical solver. It can be efficiently solved using the constrained trust +region method, taking less than $15$ seconds. Our extensive experiments on both +unconditional and conditional sampling using pixel- and latent-space DPMs +demonstrate that, when combined with the state-of-the-art sampling method +UniPC, our optimized time steps significantly improve image generation +performance in terms of FID scores for datasets such as CIFAR-10 and ImageNet, +compared to using uniform time steps. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Self-supervised co-salient object detection via feature correspondence + at multiple scales ECCV 2024 + + +
+ Our paper introduces a novel two-stage self-supervised approach for detecting +co-occurring salient objects (CoSOD) in image groups without requiring +segmentation annotations. Unlike existing unsupervised methods that rely solely +on patch-level information (e.g. clustering patch descriptors) or on +computation heavy off-the-shelf components for CoSOD, our lightweight model +leverages feature correspondences at both patch and region levels, +significantly improving prediction performance. In the first stage, we train a +self-supervised network that detects co-salient regions by computing local +patch-level feature correspondences across images. We obtain the segmentation +predictions using confidence-based adaptive thresholding. In the next stage, we +refine these intermediate segmentations by eliminating the detected regions +(within each image) whose averaged feature representations are dissimilar to +the foreground feature representation averaged across all the cross-attention +maps (from the previous stage). Extensive experiments on three CoSOD benchmark +datasets show that our self-supervised model outperforms the corresponding +state-of-the-art models by a huge margin (e.g. on the CoCA dataset, our model +has a 13.7% F-measure gain over the SOTA unsupervised CoSOD model). Notably, +our self-supervised model also outperforms several recent fully supervised +CoSOD models on the three test datasets (e.g., on the CoCA dataset, our model +has a 4.6% F-measure gain over a recent supervised CoSOD model). + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ The Unmet Promise of Synthetic Training Images: Using Retrieved Real + Images Performs Better + + +
+ Generative text-to-image models enable us to synthesize unlimited amounts of +images in a controllable manner, spurring many recent efforts to train vision +models with synthetic data. However, every synthetic image ultimately +originates from the upstream data used to train the generator. What additional +value does the intermediate generator provide over directly training on +relevant parts of the upstream data? Grounding this question in the setting of +image classification, we compare finetuning on task-relevant, targeted +synthetic data generated by Stable Diffusion -- a generative model trained on +the LAION-2B dataset -- against finetuning on targeted real images retrieved +directly from LAION-2B. We show that while synthetic data can benefit some +downstream tasks, it is universally matched or outperformed by real data from +our simple retrieval baseline. Our analysis suggests that this underperformance +is partially due to generator artifacts and inaccurate task-relevant visual +details in the synthetic images. Overall, we argue that retrieval is a critical +baseline to consider when training with synthetic data -- a baseline that +current methods do not yet surpass. We release code, data, and models at +https://github.com/scottgeng00/unmet-promise. + +
+
+ comment: Correspondence to sgeng at cs dot washington dot edu. RK and PWK + equally advised the project +
+
+
+
+
+ + ♻ ☆ Similarity Distance-Based Label Assignment for Tiny Object Detection + + +
+ Tiny object detection is becoming one of the most challenging tasks in +computer vision because of the limited object size and lack of information. The +label assignment strategy is a key factor affecting the accuracy of object +detection. Although there are some effective label assignment strategies for +tiny objects, most of them focus on reducing the sensitivity to the bounding +boxes to increase the number of positive samples and have fixed +hyperparameters that need to be set. However, more positive samples do not necessarily +lead to better detection results; in fact, excessive positive samples may lead +to more false positives. In this paper, we introduce a simple but effective +strategy named the Similarity Distance (SimD) to evaluate the similarity +between bounding boxes. This proposed strategy not only considers both location +and shape similarity but also learns hyperparameters adaptively, ensuring that +it can adapt to different datasets and various object sizes in a dataset. Our +approach can be simply applied in common anchor-based detectors in place of the +IoU for label assignment and Non Maximum Suppression (NMS). Extensive +experiments on four mainstream tiny object detection datasets demonstrate the +superior performance of our method; in particular, it surpasses the state-of-the-art competitors +on AI-TOD by 1.8 AP points overall and 4.1 AP points on very tiny objects. Code is +available at: \url{https://github.com/cszzshi/SimD}. + +
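+ The abstract does not give the exact SimD formula, but the idea of a label-assignment score that combines location and shape similarity with normalizers estimated from the data (rather than fixed hyperparameters) can be sketched as follows. The Gaussian-style combination and the scale statistic below are purely illustrative assumptions, not the paper's definition.
+
+import numpy as np
+
+def simd_like(anchors, gts, eps=1e-9):
+    """Illustrative location+shape similarity between boxes (not the exact SimD).
+
+    anchors, gts: arrays of shape (N, 4) and (M, 4) in (cx, cy, w, h) format.
+    Differences are normalized by a statistic computed from the boxes themselves,
+    mimicking the "adaptive, no fixed hyperparameter" idea.
+    """
+    a = anchors[:, None, :]          # (N, 1, 4)
+    g = gts[None, :, :]              # (1, M, 4)
+    # normalizer estimated from the data rather than hand-set
+    scale = np.mean(np.concatenate([anchors[:, 2:], gts[:, 2:]], axis=0)) + eps
+    loc = ((a[..., 0] - g[..., 0]) ** 2 + (a[..., 1] - g[..., 1]) ** 2) / scale ** 2
+    shape = ((a[..., 2] - g[..., 2]) ** 2 + (a[..., 3] - g[..., 3]) ** 2) / scale ** 2
+    return np.exp(-(loc + shape))    # (N, M) similarity in (0, 1], used in place of IoU
+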
+
+ comment: 8 pages, 4 figures, this paper has been accepted by IEEE/RSJ + International Conference on Intelligent Robots and Systems +
+
+
+
+
+ + ♻ ☆ Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural + Network Performance + + +
+ We propose the Swish-T family, an enhancement of the existing non-monotonic +activation function Swish. Swish-T is defined by adding a Tanh bias to the +original Swish function. This modification creates a family of Swish-T +variants, each designed to excel in different tasks, showcasing specific +advantages depending on the application context. The Tanh bias allows for +broader acceptance of negative values during initial training stages, offering +a smoother non-monotonic curve than the original Swish. We ultimately propose +the Swish-T$_{\textbf{C}}$ function, while Swish-T and Swish-T$_{\textbf{B}}$, +byproducts of Swish-T$_{\textbf{C}}$, also demonstrate satisfactory +performance. Furthermore, our ablation study shows that using +Swish-T$_{\textbf{C}}$ as a non-parametric function can still achieve high +performance. The superiority of the Swish-T family has been empirically +demonstrated across various models and benchmark datasets, including MNIST, +Fashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at +https://github.com/ictseoyoungmin/Swish-T-pytorch. + +
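+ Since Swish-T is described simply as Swish with an added Tanh bias, a minimal sketch of that idea is easy to write down. The alpha scaling below is an assumption for illustration; the exact parameterizations of the Swish-T_B and Swish-T_C variants are given in the paper.
+
+import torch
+
+def swish(x, beta=1.0):
+    """Original Swish: x * sigmoid(beta * x)."""
+    return x * torch.sigmoid(beta * x)
+
+def swish_t(x, beta=1.0, alpha=0.1):
+    """Sketch of the Swish-T idea: Swish plus a (scaled) tanh bias.
+
+    The alpha-scaled bias term is an illustrative assumption; the paper's
+    variants differ in how this bias is parameterized.
+    """
+    return swish(x, beta) + alpha * torch.tanh(x)
+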
+
+ comment: 11 pages, 6 figures. Revised the derivative of the sigmoid function + from 1-sigmoid to sigmoid(1-sigmoid) for correctness. Updated related + equations in Section 3.2. Changed 'Conclusions' to 'Conclusion' in Section 6 +
+
+
+
+
+ + ♻ ☆ Solving the Inverse Problem of Electrocardiography for Cardiac Digital + Twins: A Survey + + +
+ Cardiac digital twins are personalized virtual representations used to +understand complex heart mechanisms. Solving the ECG inverse problem is crucial +for accurate virtual heart modelling, enabling the derivation of internal +electrical activity information from recorded surface potentials. Despite +challenges from cardiac complexity, noisy ECG data, and computational +efficiency, recent advancements hold significant promise for enhancing virtual +heart modelling, ultimately advancing precision medicine in cardiology. This +paper aims to provide a comprehensive review of methods for solving the ECG +inverse problem, along with validation strategies, clinical applications, and +future perspectives. For the computing methodologies, we broadly classify +state-of-the-art approaches into two categories: deterministic and +probabilistic methods, including conventional and deep learning-based +techniques. Integrating physics laws with deep learning models holds promise, +but challenges such as capturing dynamic electrophysiology accurately, +accessing accurate domain knowledge, and quantifying prediction uncertainty +persist. Integrating models into clinical workflows while ensuring +interpretability and usability for healthcare professionals is essential. +Overcoming these challenges will drive further research in cardiac digital +twins. + +
+
+
+
+
+ + ♻ ☆ Can 3D Vision-Language Models Truly Understand Natural Language? + + +
+ Rapid advancements in 3D vision-language (3D-VL) tasks have opened up new +avenues for human interaction with embodied agents or robots using natural +language. Despite this progress, we find a notable limitation: existing 3D-VL +models exhibit sensitivity to the styles of language input, struggling to +understand sentences with the same semantic meaning but written in different +variants. This observation raises a critical question: Can 3D vision-language +models truly understand natural language? To test the language +understandability of 3D-VL models, we first propose a language robustness task +for systematically assessing 3D-VL models across various tasks, benchmarking +their performance when presented with different language style variants. +Importantly, these variants are commonly encountered in applications requiring +direct interaction with humans, such as embodied robotics, given the diversity +and unpredictability of human language. We propose a 3D Language Robustness +Dataset, designed based on the characteristics of human language, to facilitate +the systematic study of robustness. Our comprehensive evaluation uncovers a +significant drop in the performance of all existing models across various 3D-VL +tasks. Even the state-of-the-art 3D-LLM fails to understand some variants of +the same sentences. Further in-depth analysis suggests that the existing models +have a fragile and biased fusion module, which stems from the low diversity of +the existing dataset. Finally, we propose a training-free module driven by LLM, +which improves language robustness. Datasets and code will be available at +github. + +
+
+ comment: https://github.com/VincentDENGP/3D-LR +
+
+
+
+
+ + ♻ ☆ ColorizeDiffusion: Adjustable Sketch Colorization with Reference Image + and Text + + +
+ Diffusion models have recently demonstrated their effectiveness in generating +extremely high-quality images and are now utilized in a wide range of +applications, including automatic sketch colorization. Although many methods +have been developed for guided sketch colorization, there has been limited +exploration of the potential conflicts between image prompts and sketch inputs, +which can lead to severe deterioration in the results. Therefore, this paper +exhaustively investigates reference-based sketch colorization models that aim +to colorize sketch images using reference color images. We specifically +investigate two critical aspects of reference-based diffusion models: the +"distribution problem", which is a major shortcoming compared to text-based +counterparts, and the capability in zero-shot sequential text-based +manipulation. We introduce two variations of an image-guided latent diffusion +model utilizing different image tokens from the pre-trained CLIP image encoder +and propose corresponding manipulation methods to adjust their results +sequentially using weighted text inputs. We conduct comprehensive evaluations +of our models through qualitative and quantitative experiments as well as a +user study. + +
+
+
+
+
+ + ♻ ☆ Losing Visual Needles in Image Haystacks: Vision Language Models are + Easily Distracted in Short and Long Contexts + + +
+ We present LoCoVQA, a dynamic benchmark generator for evaluating long-context +extractive reasoning in vision language models (VLMs). LoCoVQA augments test +examples for mathematical reasoning, VQA, and character recognition tasks with +increasingly long visual contexts composed of both in-distribution and +out-of-distribution distractor images. + Across these tasks, a diverse set of VLMs rapidly lose performance as the +visual context length grows, often exhibiting a striking logarithmic decay +trend. This test assesses how well VLMs can ignore irrelevant information when +answering queries -- a task that is quite easy for language models (LMs) in the +text domain -- demonstrating that current state-of-the-art VLMs lack this +essential capability for many long-context applications. + +
+
+ comment: Under review. Minor errata correction in revision +
+
+
+
+
+ + ♻ ☆ Video Watermarking: Safeguarding Your Video from (Unauthorized) + Annotations by Video-based LLMs + + +
+ The advent of video-based Large Language Models (LLMs) has significantly +enhanced video understanding. However, it has also raised some safety concerns +regarding data protection, as videos can be more easily annotated, even without +authorization. This paper introduces Video Watermarking, a novel technique to +protect videos from unauthorized annotations by such video-based LLMs, +especially concerning the video content and description, in response to +specific queries. By imperceptibly embedding watermarks into key video frames +with multi-modal flow-based losses, our method preserves the viewing experience +while preventing misuse by video-based LLMs. Extensive experiments show that +Video Watermarking significantly reduces the comprehensibility of videos with +various video-based LLMs, demonstrating both stealth and robustness. In +essence, our method provides a solution for securing video content, ensuring +its integrity and confidentiality in the face of evolving video-based LLMs +technologies. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.13507 +
+
+
+
+
+ + ♻ ☆ Rethinking Efficient and Effective Point-based Networks for Event Camera + Classification and Regression: EventMamba + + +
+ Event cameras, drawing inspiration from biological systems, efficiently +detect changes in ambient light with low latency and high dynamic range while +consuming minimal power. The most common current approach to processing event data +involves converting it into frame-based representations, which are +well established in traditional vision. However, this approach neglects the +sparsity of event data, loses fine-grained temporal information during the +transformation process, and increases the computational burden, making it +ineffective for characterizing event camera properties. In contrast, Point +Cloud is a popular representation for 3D processing and is better suited to +match the sparse and asynchronous nature of the event camera. Nevertheless, +despite the theoretical compatibility of point-based methods with event +cameras, the results show a performance gap that is not yet satisfactory +compared to frame-based methods. In order to bridge the performance gap, we +propose EventMamba, an efficient and effective Point Cloud framework that +achieves competitive results even compared to the state-of-the-art (SOTA) +frame-based method in both classification and regression tasks. This notable +accomplishment is facilitated by our rethinking of the distinction between +Event Cloud and Point Cloud, emphasizing effective temporal information +extraction through optimized network structures. Specifically, EventMamba +leverages temporal aggregation and a State Space Model (SSM) based Mamba architecture with +enhanced temporal information extraction capabilities. Through a hierarchical +structure, EventMamba is adept at abstracting local and global spatial features +and implicit and explicit temporal features. By adhering to the lightweight +design principle, EventMamba delivers impressive results with minimal +computational resource utilization, demonstrating its efficiency and +effectiveness. + +
+
+ comment: Extension Journal of TTPOINT and PEPNet, modify the dataset split + method +
+
+
+
+
+ + ♻ ☆ Evaluating and Analyzing Relationship Hallucinations in LVLMs ICML2024 + + +
+ The issue of hallucinations is a prevalent concern in existing Large +Vision-Language Models (LVLMs). Previous efforts have primarily focused on +investigating object hallucinations, which can be easily alleviated by +introducing object detectors. However, these efforts neglect hallucinations in +inter-object relationships, which is essential for visual comprehension. In +this work, we introduce R-Bench, a novel benchmark for evaluating Vision +Relationship Hallucination. R-Bench features image-level questions that focus +on the existence of relationships and instance-level questions that assess +local visual comprehension. We identify three types of relationship +co-occurrences that lead to hallucinations: relationship-relationship, +subject-relationship, and relationship-object. The visual instruction tuning +dataset's long-tail distribution significantly impacts LVLMs' understanding of +visual relationships. Furthermore, our analysis reveals that current LVLMs tend +to disregard visual content and overly rely on the common sense knowledge of +Large Language Models. They also struggle with reasoning about spatial +relationships based on contextual information. + +
+
+ comment: ICML2024; Project Page:https://github.com/mrwu-mac/R-Bench +
+
+
+
+
+ + ♻ ☆ Deep learning for 3D human pose estimation and mesh recovery: A survey + + +
+ 3D human pose estimation and mesh recovery have attracted widespread research +interest in many areas, such as computer vision, autonomous driving, and +robotics. Deep learning on 3D human pose estimation and mesh recovery has +recently thrived, with numerous methods proposed to address different problems +in this area. In this paper, to stimulate future research, we present a +comprehensive review of recent progress over the past five years in deep +learning methods for this area by delving into over 200 references. To the best +of our knowledge, this survey is arguably the first to comprehensively cover +deep learning methods for 3D human pose estimation, including both +single-person and multi-person approaches, as well as human mesh recovery, +encompassing methods based on explicit models and implicit representations. We +also present comparative results on several publicly available datasets, +together with insightful observations and inspiring future research directions. +A regularly updated project page can be found at +https://github.com/liuyangme/SOTA-3DHPE-HMR. + +
+
+
+
+
+ + ♻ ☆ ShadowRefiner: Towards Mask-free Shadow Removal via Fast Fourier + Transformer CVPR + + +
+ Shadow-affected images often exhibit pronounced spatial discrepancies in +color and illumination, consequently degrading various vision applications +including object detection and segmentation systems. To effectively eliminate +shadows in real-world images while preserving intricate details and producing +visually compelling outcomes, we introduce a mask-free Shadow Removal and +Refinement network (ShadowRefiner) via Fast Fourier Transformer. Specifically, +the Shadow Removal module in our method aims to establish effective mappings +between shadow-affected and shadow-free images via spatial and frequency +representation learning. To mitigate the pixel misalignment and further improve +the image quality, we propose a novel Fast-Fourier Attention based Transformer +(FFAT) architecture, where an innovative attention mechanism is designed for +meticulous refinement. Our method wins the championship in the Perceptual Track +and achieves the second-best performance in the Fidelity Track of the NTIRE 2024 +Image Shadow Removal Challenge. Besides, comprehensive experimental results also +demonstrate the compelling effectiveness of our proposed method. The code is +publicly available: https://github.com/movingforward100/Shadow_R. + +
+
+ comment: Accepted by CVPR workshop 2024 (NTIRE 2024); Corrected references +
+
+
+
+
+ + ♻ ☆ Camera-LiDAR Cross-modality Gait Recognition ECCV 2024 + + +
+ Gait recognition is a crucial biometric identification technique. +Camera-based gait recognition has been widely applied in both research and +industrial fields. LiDAR-based gait recognition has also begun to evolve +recently, owing to the 3D structural information it provides. However, in +certain applications, cameras fail to recognize persons, such as in low-light +environments and long-distance recognition scenarios, where LiDARs work well. +On the other hand, the deployment cost and complexity of LiDAR systems limit +their wider application. Therefore, it is essential to consider cross-modality +gait recognition between cameras and LiDARs for a broader range of +applications. In this work, we propose the first cross-modality gait +recognition framework between Camera and LiDAR, namely CL-Gait. It employs a +two-stream network for feature embedding of both modalities. This poses a +challenging recognition task, as matching 3D and 2D +data involves a significant modality discrepancy. To align the feature spaces +of the two modalities, i.e., camera silhouettes and LiDAR points, we propose a +contrastive pre-training strategy to mitigate modality discrepancy. To make up +for the absence of paired camera-LiDAR data for pre-training, we also introduce +a strategy for generating data on a large scale. This strategy utilizes +monocular depth estimated from single RGB images and virtual cameras to +generate pseudo point clouds for contrastive pre-training. Extensive +experiments show that cross-modality gait recognition is very challenging +but remains feasible and promising with our proposed model and +pre-training strategy. To the best of our knowledge, this is the first work to +address cross-modality gait recognition. + +
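+ The data-generation strategy above (monocular depth plus a virtual camera yielding pseudo point clouds) reduces to standard pinhole back-projection; here is a small NumPy sketch in which the intrinsics are assumed values for a hypothetical virtual camera, not the paper's settings.
+
+import numpy as np
+
+def depth_to_point_cloud(depth, fx, fy, cx, cy):
+    """Back-project an (H, W) depth map into a pseudo point cloud.
+
+    Standard pinhole back-projection; in the strategy described above, `depth`
+    would come from monocular depth estimation on a single RGB image and
+    (fx, fy, cx, cy) from a chosen virtual camera.
+    """
+    h, w = depth.shape
+    u, v = np.meshgrid(np.arange(w), np.arange(h))
+    z = depth
+    x = (u - cx) * z / fx
+    y = (v - cy) * z / fy
+    points = np.stack([x, y, z], axis=-1).reshape(-1, 3)
+    return points[points[:, 2] > 0]   # drop invalid / zero-depth pixels
+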
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ A Framework For Refining Text Classification and Object Recognition from + Academic Articles + + +
+ With the widespread use of the internet, it has become increasingly crucial +to extract specific information from vast amounts of academic articles +efficiently. Data mining techniques are generally employed to solve this issue. +However, data mining for academic articles is challenging since it requires +automatically extracting specific patterns in complex and unstructured layout +documents. Current data mining methods for academic articles employ +rule-based (RB) or machine learning (ML) approaches. However, using rule-based +methods incurs a high coding cost for articles with complex typesetting. On the +other hand, simply using machine learning methods requires annotation work for +complex content types within the paper, which can be costly. Furthermore, only +using machine learning can lead to cases where patterns easily recognized by +rule-based methods are mistakenly extracted. To overcome these issues, from the +perspective of analyzing the standard layout and typesetting used in the +specified publication, we emphasize implementing specific methods for specific +characteristics in academic articles. We have developed a novel Text Block +Refinement Framework (TBRF), a hybrid of machine learning and rule-based schemes. +We used the well-known ACL proceedings articles as experimental data for the +validation experiment. The experiment shows that our approach achieved over 95% +classification accuracy and 90% detection accuracy for tables and figures. + +
+
+ comment: This paper has been accepted at 'The International Symposium on + Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)' +
+
+
+
+
+ + ♻ ☆ What's color got to do with it? Face recognition in grayscale CVPR + + +
+ State-of-the-art deep CNN face matchers are typically created using extensive +training sets of color face images. Our study reveals that such matchers attain +virtually identical accuracy when trained on either grayscale or color versions +of the training set, even when the evaluation is done using color test images. +Furthermore, we demonstrate that shallower models, lacking the capacity to +model complex representations, rely more heavily on low-level features such as +those associated with color. As a result, they display diminished accuracy when +trained with grayscale images. We then consider possible causes for deeper CNN +face matchers "not seeing color". Popular web-scraped face datasets actually +have 30 to 60% of their identities with one or more grayscale images. We +analyze whether this grayscale element in the training set impacts the accuracy +achieved, and conclude that it does not. We demonstrate that using only +grayscale images for both training and testing achieves accuracy comparable to +that achieved using only color images for deeper models. This holds true for +both real and synthetic training datasets. HSV color space, which separates +chroma and luma information, does not improve the network's learning about +color any more than in the RGB color space. We then show that the skin region +of an individual's images in a web-scraped training set exhibits significant +variation in their mapping to color space. This suggests that color carries +limited identity-specific information. We also show that when the first +convolution layer is restricted to a single filter, models learn a grayscale +conversion filter and pass a grayscale version of the input color image to the +next layer. Finally, we demonstrate that leveraging the lower per-image storage +for grayscale to increase the number of images in the training set can improve +accuracy of the face recognition model. + +
+
+ comment: This is replacement version of the previous arxiv submission: + 2309.05180 (Our Deep CNN Face Matchers Have Developed Achromatopsia). The + past version is published in CVPRW and available in IEEE proceedings. This + submitted version is an extension of the conference paper +
+
+
+
+
+ + ♻ ☆ SOAF: Scene Occlusion-aware Neural Acoustic Field + + +
+ This paper tackles the problem of novel view audio-visual synthesis along an +arbitrary trajectory in an indoor scene, given the audio-video recordings from +other known trajectories of the scene. Existing methods often overlook the +effect of room geometry, particularly wall occlusion to sound propagation, +making them less accurate in multi-room environments. In this work, we propose +a new approach called Scene Occlusion-aware Acoustic Field (SOAF) for accurate +sound generation. Our approach derives a prior for sound energy field using +distance-aware parametric sound-propagation modelling and then transforms it +based on scene transmittance learned from the input video. We extract features +from the local acoustic field centred around the receiver using a Fibonacci +Sphere to generate binaural audio for novel views with a direction-aware +attention mechanism. Extensive experiments on the real dataset RWAVS and the +synthetic dataset SoundSpaces demonstrate that our method outperforms previous +state-of-the-art techniques in audio generation. Project page: +https://github.com/huiyu-gao/SOAF/. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Safety of GPT-4o: An Empirical Study using Jailbreak + Attacks + + +
+ The recent release of GPT-4o has garnered widespread attention due to its +powerful general capabilities. While its impressive performance is widely +acknowledged, its safety aspects have not been sufficiently explored. Given the +potential societal impact of risky content generated by advanced generative AI +such as GPT-4o, it is crucial to rigorously evaluate its safety. In response, +this paper conducts the first rigorous evaluation of +GPT-4o against jailbreak attacks. Specifically, this paper adopts a series of +multi-modal and uni-modal jailbreak attacks on 4 commonly used benchmarks +encompassing three modalities (i.e., text, speech, and image), which involves the +optimization of over 4,000 initial text queries and the analysis and +statistical evaluation of nearly 8,000 responses from GPT-4o. Our extensive +experiments reveal several novel observations: (1) In contrast to previous +versions (such as GPT-4V), GPT-4o has enhanced safety in the context of text +modality jailbreak; (2) The newly introduced audio modality opens up new attack +vectors for jailbreak attacks on GPT-4o; (3) Existing black-box multimodal +jailbreak attack methods are largely ineffective against GPT-4o and GPT-4V. +These findings provide critical insights into the safety implications of GPT-4o +and underscore the need for robust alignment guardrails in large models. Our +code is available at \url{https://github.com/NY1024/Jailbreak_GPT4o}. + +
+
+
+
+
+
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ CoIR: A Comprehensive Benchmark for Code Information Retrieval Models + + +
+ Despite the substantial success of Information Retrieval (IR) in various NLP +tasks, most IR systems predominantly handle queries and corpora in natural +language, neglecting the domain of code retrieval. Code retrieval is critically +important yet remains under-explored, with existing methods and benchmarks +inadequately representing the diversity of code in various domains and tasks. +Addressing this gap, we present \textbf{CoIR} (\textbf{Co}de +\textbf{I}nformation \textbf{R}etrieval Benchmark), a robust and comprehensive +benchmark specifically designed to assess code retrieval capabilities. CoIR +comprises \textbf{ten} meticulously curated code datasets, spanning +\textbf{eight} distinctive retrieval tasks across \textbf{seven} diverse +domains. We first discuss the construction of CoIR and its diverse dataset +composition. Further, we evaluate nine widely used retrieval models using +CoIR, uncovering significant difficulties in performing code retrieval tasks +even with state-of-the-art systems. To facilitate easy adoption and integration +within existing research workflows, CoIR has been developed as a user-friendly +Python framework, readily installable via pip. It shares the same data schema as +other popular benchmarks like MTEB and BEIR, enabling seamless cross-benchmark +evaluations. Through CoIR, we aim to invigorate research in the code retrieval +domain, providing a versatile benchmarking tool that encourages further +development and exploration of code retrieval systems\footnote{\url{ +https://github.com/CoIR-team/coir}}. + +
+
+
+
+
+ + ☆ CRUISE on Quantum Computing for Feature Selection in Recommender Systems + + +
+ Using Quantum Computers to solve problems in Recommender Systems that +classical computers cannot address is a worthwhile research topic. In this +paper, we use Quantum Annealers to address the feature selection problem in +recommendation algorithms. This feature selection problem is a Quadratic +Unconstrained Binary Optimization (QUBO) problem. By incorporating +Counterfactual Analysis, we significantly improve the performance of the +item-based KNN recommendation algorithm compared to using pure Mutual +Information. Extensive experiments have demonstrated that the use of +Counterfactual Analysis holds great promise for addressing such problems. + +
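+ Feature selection as a QUBO has a standard shape: per-feature relevance on the diagonal (e.g. mutual information, or a counterfactual-adjusted score as suggested above) and pairwise redundancy off the diagonal. The sketch below is illustrative only; the weighting and the brute-force solver (which a quantum annealer would replace at scale) are assumptions, not the paper's formulation.
+
+import numpy as np
+from itertools import product
+
+def build_qubo(relevance, redundancy, alpha=1.0):
+    """QUBO for feature selection: reward relevance, penalize redundancy.
+
+    relevance: (n,) per-feature scores; redundancy: (n, n) pairwise redundancy.
+    Minimizing x^T Q x over binary x selects a subset trading off the two terms.
+    """
+    Q = alpha * redundancy.copy()
+    np.fill_diagonal(Q, np.diag(Q) - relevance)   # diagonal carries the (negated) relevance
+    return Q
+
+def solve_qubo_bruteforce(Q):
+    """Exact solver for tiny instances (a quantum annealer would replace this at scale)."""
+    n = Q.shape[0]
+    best_x, best_e = None, np.inf
+    for bits in product([0, 1], repeat=n):
+        x = np.array(bits)
+        e = x @ Q @ x
+        if e < best_e:
+            best_x, best_e = x, e
+    return best_x, best_e
+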
+
+ comment: accepted by QuantumCLEF 2024 +
+
+
+
+
+ + ☆ LANE: Logic Alignment of Non-tuning Large Language Models and Online + Recommendation Systems for Explainable Reason Generation + + +
+ The explainability of recommendation systems is crucial for enhancing user +trust and satisfaction. Leveraging large language models (LLMs) offers new +opportunities for comprehensive recommendation logic generation. However, in +existing related studies, fine-tuning LLMs for recommendation tasks +incurs high computational costs and alignment issues with existing systems, +limiting the application potential of proven proprietary/closed-source LLM +models, such as GPT-4. In this work, we propose LANE, an effective strategy that +aligns LLMs with online recommendation systems without additional LLM tuning, +reducing costs and improving explainability. This innovative approach addresses +key challenges in integrating language models with recommendation systems while +fully utilizing the capabilities of powerful proprietary models. Specifically, +our strategy operates through several key components: semantic embedding, user +multi-preference extraction using zero-shot prompting, semantic alignment, and +explainable recommendation generation using Chain of Thought (CoT) prompting. +By embedding item titles instead of IDs and utilizing multi-head attention +mechanisms, our approach aligns the semantic features of user preferences with +those of candidate items, ensuring coherent and user-aligned recommendations. +Extensive experimental results, including performance comparisons, questionnaire +voting, and visualization cases, show that our method can not only ensure +recommendation performance but also provide easy-to-understand and reasonable +recommendation logic. + +
+
+
+
+
+ + ☆ Learning Positional Attention for Sequential Recommendation + + +
+ Self-attention-based networks have achieved remarkable performance in +sequential recommendation tasks. A crucial component of these models is +positional encoding. In this study, we delve into the learned positional +embedding, demonstrating that it often captures the distance between tokens. +Building on this insight, we introduce novel attention models that directly +learn positional relations. Extensive experiments reveal that our proposed +models, \textbf{PARec} and \textbf{FPARec}, outperform previous +self-attention-based approaches. Our code is available at the link for anonymous +review: https://anonymous.4open.science/r/FPARec-2C55/ + +
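+ One common way to "directly learn positional relations" is to add a learnable bias indexed by relative distance to the attention logits, instead of adding absolute position embeddings to the inputs. The sketch below shows that general idea; it is not necessarily the exact PARec/FPARec parameterization.
+
+import torch
+import torch.nn as nn
+
+class RelPosAttention(nn.Module):
+    """Single-head attention with a directly learned positional term (illustrative)."""
+
+    def __init__(self, dim, max_len):
+        super().__init__()
+        self.qkv = nn.Linear(dim, 3 * dim)
+        self.rel_bias = nn.Parameter(torch.zeros(2 * max_len - 1))   # one bias per relative offset
+        self.max_len = max_len
+
+    def forward(self, x):                                  # x: (B, L, dim), L <= max_len
+        b, l, d = x.shape
+        q, k, v = self.qkv(x).chunk(3, dim=-1)
+        logits = q @ k.transpose(-2, -1) / d ** 0.5        # (B, L, L) content term
+        idx = torch.arange(l, device=x.device)
+        rel = idx[None, :] - idx[:, None] + self.max_len - 1   # (L, L) relative-offset indices
+        logits = logits + self.rel_bias[rel]               # learned positional term
+        attn = logits.softmax(dim=-1)
+        return attn @ v
+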
+
+
+
+
+ + ☆ Supporting Cross-language Cross-project Bug Localization Using + Pre-trained Language Models + + +
+ Automatically locating a bug within a large codebase remains a significant +challenge for developers. Existing techniques often struggle with +generalizability and deployment due to their reliance on application-specific +data and large model sizes. This paper proposes a novel pre-trained language +model (PLM) based technique for bug localization that transcends project and +language boundaries. Our approach leverages contrastive learning to enhance the +representation of bug reports and source code. It then utilizes a novel ranking +approach that combines commit messages and code segments. Additionally, we +introduce a knowledge distillation technique that reduces model size for +practical deployment without compromising performance. + Our approach offers several key benefits. By incorporating code segment and +commit message analysis alongside traditional file-level examination, our +technique achieves better bug localization accuracy. Furthermore, our model +excels at generalizability: trained on code from various projects and +languages, it can effectively identify bugs in unseen codebases. To address +computational limitations, we propose a CPU-compatible solution. In essence, +the proposed work presents a highly effective, generalizable, and efficient bug +localization technique with the potential for real-world deployment. + +
+
+
+
+
+ + ♻ ☆ UFRec: Integrating Uniformity and Frequency to Enhance Sequential + Recommendations + + +
+ Effective representation learning in sequential recommendation systems is +pivotal for precisely capturing user interaction patterns and enhancing +recommendation accuracy. Nonetheless, current methodologies largely focus on +item-to-item transitions, frequently overlooking the time intervals between +interactions, which are integral to understanding behavior pattern shifts. +Moreover, critical interaction attributes like item frequency are often +neglected. Our research indicates that sequences with more consistent time +intervals and items with higher interaction frequency result in superior +predictive performance. In contrast, sequences with non-uniform intervals +contribute to user interest drift, and infrequently interacted items are +challenging to model due to sparse data, posing unique challenges that existing +methods fail to adequately address. In this study, we introduce UFRec, an +innovative bidirectional enhancement method for sequential recommendations. +UFRec harnesses sequence uniformity and item frequency to boost performance, +particularly improving the representation of non-uniform sequences and +less-frequent items. These two components synergistically enhance each other, +driving holistic performance optimization in intricate sequential +recommendation scenarios. Additionally, we introduce a multidimensional time +module to further augment adaptability. To the best of our knowledge, UFRec is +the pioneering method to exploit the properties of uniformity and frequency for +feature augmentation. Through comparisons with eleven state-of-the-art models +across four datasets, we demonstrate that UFRec significantly surpasses current +leading models. + +
+
+ comment: 15 pages, 8 figures, for source code, see + https://github.com/Linxi000/UniRec +
+
+
+
+
+ + ♻ ☆ Understanding Language Modeling Paradigm Adaptations in Recommender + Systems: Lessons Learned and Open Challenges ECAI + + +
+ The emergence of Large Language Models (LLMs) has achieved tremendous success +in the field of Natural Language Processing owing to diverse training paradigms +that empower LLMs to effectively capture intricate linguistic patterns and +semantic representations. In particular, the recent "pre-train, prompt and +predict" training paradigm has attracted significant attention as an approach +for learning generalizable models with limited labeled data. In line with this +advancement, these training paradigms have recently been adapted to the +recommendation domain and are seen as a promising direction in both academia +and industry. This half-day tutorial aims to provide a thorough understanding +of extracting and transferring knowledge from pre-trained models learned +through different training paradigms to improve recommender systems from +various perspectives, such as generality, sparsity, effectiveness and +trustworthiness. In this tutorial, we first introduce the basic concepts and a +generic architecture of the language modeling paradigm for recommendation +purposes. Then, we focus on recent advancements in adapting LLM-related +training strategies and optimization objectives for different recommendation +tasks. After that, we will systematically introduce ethical issues in LLM-based +recommender systems and discuss possible approaches to assessing and mitigating +them. We will also summarize the relevant datasets, evaluation metrics, and an +empirical study on the recommendation performance of training paradigms. +Finally, we will conclude the tutorial with a discussion of open challenges and +future directions. + +
+
+ comment: Tutorial held at the 27th European Conference on Artificial + Intelligence (ECAI) in Santiago de Compostela, Spain, on October 19-24, 2024 +
+
+
+
+
+ + ♻ ☆ EAGER: Two-Stream Generative Recommender with Behavior-Semantic + Collaboration KDD 2024 + + +
+ Generative retrieval has recently emerged as a promising approach to +sequential recommendation, framing candidate item retrieval as an +autoregressive sequence generation problem. However, existing generative +methods typically focus solely on either behavioral or semantic aspects of item +information, neglecting their complementary nature and thus resulting in +limited effectiveness. To address this limitation, we introduce EAGER, a novel +generative recommendation framework that seamlessly integrates both behavioral +and semantic information. Specifically, we identify three key challenges in +combining these two types of information: a unified generative architecture +capable of handling two feature types, ensuring sufficient and independent +learning for each type, and fostering subtle interactions that enhance +collaborative information utilization. To achieve these goals, we propose (1) a +two-stream generation architecture leveraging a shared encoder and two separate +decoders to decode behavior tokens and semantic tokens with a confidence-based +ranking strategy; (2) a global contrastive task with summary tokens to achieve +discriminative decoding for each type of information; and (3) a semantic-guided +transfer task designed to implicitly promote cross-interactions through +reconstruction and estimation objectives. We validate the effectiveness of +EAGER on four public benchmarks, demonstrating its superior performance +compared to existing methods. + +
+
+ comment: Accepted by KDD 2024. Code available at + https://reczoo.github.io/EAGER +
+
+
+
+
+ + ♻ ☆ Multimodal Pretraining, Adaptation, and Generation for Recommendation: A + Survey KDD 2024 + + +
+ Personalized recommendation serves as a ubiquitous channel for users to +discover information tailored to their interests. However, traditional +recommendation models primarily rely on unique IDs and categorical features for +user-item matching, potentially overlooking the nuanced essence of raw item +contents across multiple modalities such as text, image, audio, and video. This +underutilization of multimodal data poses a limitation to recommender systems, +especially in multimedia services like news, music, and short-video platforms. +The recent advancements in large multimodal models offer new opportunities and +challenges in developing content-aware recommender systems. This survey seeks +to provide a comprehensive exploration of the latest advancements and future +trajectories in multimodal pretraining, adaptation, and generation techniques, +as well as their applications in enhancing recommender systems. Furthermore, we +discuss current open challenges and opportunities for future research in this +dynamic domain. We believe that this survey, alongside the curated resources, +will provide valuable insights to inspire further advancements in this evolving +landscape. + +
+
+ comment: Accepted by KDD 2024. See our tutorial materials at + https://mmrec.github.io +
+
+
+
+
+ + ♻ ☆ Pistis-RAG: A Scalable Cascading Framework Towards Content-Centric + Retrieval-Augmented Generation + + +
+ In Greek mythology, Pistis symbolized good faith, trust, and reliability. +Drawing inspiration from these principles, Pistis-RAG is a scalable multi-stage +framework designed to address the challenges of large-scale retrieval-augmented +generation (RAG) systems. This framework consists of distinct stages: matching, +pre-ranking, ranking, reasoning, and aggregating. Each stage contributes to +narrowing the search space, prioritizing semantically relevant documents, +aligning with the large language model's (LLM) preferences, supporting complex +chain-of-thought (CoT) methods, and combining information from multiple +sources. + Our ranking stage introduces a significant innovation by recognizing that +semantic relevance alone may not lead to improved generation quality, due to +the sensitivity of the few-shot prompt order, as noted in previous research. +This critical aspect is often overlooked in current RAG frameworks. + We argue that the alignment issue between LLMs and external knowledge ranking +methods is tied to the model-centric paradigm dominant in RAG systems. We +propose a content-centric approach, emphasizing seamless integration between +LLMs and external information sources to optimize content transformation for +specific tasks. + Our novel ranking stage is designed specifically for RAG systems, +incorporating principles of information retrieval while considering the unique +business scenarios reflected in LLM preferences and user feedback. We simulated +feedback signals on the MMLU benchmark, resulting in a 9.3% performance +improvement. Our model and code will be open-sourced on GitHub. Additionally, +experiments on real-world, large-scale data validate the scalability of our +framework. + +
+
+
+
+
+ + ♻ ☆ Temporal Interest Network for User Response Prediction + + +
+ User response prediction is essential in industrial recommendation systems, +such as online display advertising. Among all the features in recommendation +models, user behaviors are among the most critical. Many works have revealed +that a user's behavior reflects her interest in the candidate item, owing to +the semantic or temporal correlation between behaviors and the candidate. While +the literature has individually examined each of these correlations, +researchers have yet to analyze them in combination, that is, the +semantic-temporal correlation. We empirically measure this correlation and +observe intuitive yet robust patterns. We then examine several popular user +interest models and find that, surprisingly, none of them learn such +correlation well. + To fill this gap, we propose a Temporal Interest Network (TIN) to capture the +semantic-temporal correlation simultaneously between behaviors and the target. +We achieve this by incorporating target-aware temporal encoding, in addition to +semantic encoding, to represent behaviors and the target. Furthermore, we +conduct explicit 4-way interaction by deploying target-aware attention and +target-aware representation to capture both semantic and temporal correlation. +We conduct comprehensive evaluations on two popular public datasets, and our +proposed TIN outperforms the best-performing baselines by 0.43% and 0.29% on +GAUC, respectively. During online A/B testing in Tencent's advertising +platform, TIN achieves 1.65% cost lift and 1.93% GMV lift over the base model. +It has been successfully deployed in production since October 2023, serving the +WeChat Moments traffic. We have released our code at +https://github.com/zhouxy1003/TIN. + +
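+ A hedged sketch of the "semantic plus target-aware temporal encoding" idea described above: each behavior is embedded as its item embedding plus an embedding of its (bucketized) time gap to the target, and the target then attends over these encodings. The module below is illustrative only; the actual TIN uses an explicit 4-way interaction that is not reproduced here.
+
+import torch
+import torch.nn as nn
+
+class TargetAwareInterest(nn.Module):
+    """Semantic + target-relative temporal encoding with target attention (sketch)."""
+
+    def __init__(self, n_items, n_buckets, dim):
+        super().__init__()
+        self.item_emb = nn.Embedding(n_items, dim)
+        self.time_emb = nn.Embedding(n_buckets, dim)   # buckets of the behavior-to-target time gap
+
+    def forward(self, hist_items, hist_time_buckets, target_item):
+        h = self.item_emb(hist_items) + self.time_emb(hist_time_buckets)  # (B, L, D)
+        t = self.item_emb(target_item).unsqueeze(1)                       # (B, 1, D)
+        scores = (h * t).sum(-1).softmax(dim=-1)                          # (B, L) target-aware attention
+        interest = (scores.unsqueeze(-1) * h).sum(1)                      # (B, D) pooled interest
+        return interest   # concatenated with other features for the CTR head
+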
+
+
+
+
+ + ♻ ☆ Improving Sequential Query Recommendation with Immediate User Feedback + + +
+ We propose an algorithm for next query recommendation in interactive data +exploration settings, like knowledge discovery for information gathering. The +state-of-the-art query recommendation algorithms are based on +sequence-to-sequence learning approaches that exploit historical interaction +data. Due to the supervision involved in the learning process, such approaches +fail to adapt to immediate user feedback. We propose to augment the +transformer-based causal language models for query recommendations to adapt to +the immediate user feedback using multi-armed bandit (MAB) framework. We +conduct a large-scale experimental study using log files from a popular online +literature discovery service and demonstrate that our algorithm improves the +per-round regret substantially, with respect to the state-of-the-art +transformer-based query recommendation models, which do not make use of +immediate user feedback. Our data model and source code are available at +https://github.com/shampp/exp3_ss + +
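+ The repository name (exp3_ss) suggests an EXP3-style bandit; below is the textbook EXP3 update over a set of candidate next queries (for example, candidates proposed by the language model), with immediate user feedback as the reward. This is the generic algorithm, not necessarily the exact variant used in the paper.
+
+import numpy as np
+
+class Exp3:
+    """Textbook EXP3 bandit over candidate next queries (illustrative)."""
+
+    def __init__(self, n_arms, gamma=0.1, rng=None):
+        self.gamma = gamma
+        self.weights = np.ones(n_arms)
+        self.rng = rng or np.random.default_rng()
+
+    def probabilities(self):
+        w = self.weights / self.weights.sum()
+        return (1 - self.gamma) * w + self.gamma / len(self.weights)
+
+    def select(self):
+        p = self.probabilities()
+        return int(self.rng.choice(len(p), p=p)), p
+
+    def update(self, arm, reward, probs):
+        # reward in [0, 1], e.g. whether the user accepted the recommended query
+        x_hat = reward / probs[arm]                                  # importance-weighted reward
+        self.weights[arm] *= np.exp(self.gamma * x_hat / len(self.weights))
+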
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Planetarium: A Rigorous Benchmark for Translating Text to Structured + Planning Languages + + +
+ Many recent works have explored using language models for planning problems. +One line of research focuses on translating natural language descriptions of +planning tasks into structured planning languages, such as the planning domain +definition language (PDDL). While this approach is promising, accurately +measuring the quality of generated PDDL code continues to pose significant +challenges. First, generated PDDL code is typically evaluated using planning +validators that check whether the problem can be solved with a planner. This +method is insufficient because a language model might generate valid PDDL code +that does not align with the natural language description of the task. Second, +existing evaluation sets often have natural language descriptions of the +planning task that closely resemble the ground truth PDDL, reducing the +challenge of the task. To bridge this gap, we introduce Planetarium, a +benchmark designed to evaluate language models' ability to generate PDDL code +from natural language descriptions of planning tasks. We begin by creating a +PDDL equivalence algorithm that rigorously evaluates the correctness of PDDL +code generated by language models by flexibly comparing it against a ground +truth PDDL. Then, we present a dataset of $132,037$ text-to-PDDL pairs across +13 different tasks, with varying levels of difficulty. Finally, we evaluate +several API-access and open-weight language models that reveal this task's +complexity. For example, $87.6\%$ of the PDDL problem descriptions generated by +GPT-4o are syntactically parseable, $82.2\%$ are valid, solvable problems, +but only $35.1\%$ are semantically correct, highlighting the need for a more +rigorous benchmark for this problem. + +
+
+
+
+
+ + ☆ Value-Penalized Auxiliary Control from Examples for Learning without + Rewards or Demonstrations + + +
+ Learning from examples of success is an appealing approach to reinforcement +learning that eliminates many of the disadvantages of using hand-crafted reward +functions or full expert-demonstration trajectories, both of which can be +difficult to acquire, biased, or suboptimal. However, learning from examples +alone dramatically increases the exploration challenge, especially for complex +tasks. This work introduces value-penalized auxiliary control from examples +(VPACE); we significantly improve exploration in example-based control by +adding scheduled auxiliary control and examples of auxiliary tasks. +Furthermore, we identify a value-calibration problem, where policy value +estimates can exceed their theoretical limits based on successful data. We +resolve this problem, which is exacerbated by learning auxiliary tasks, through +the addition of an above-success-level value penalty. Across three simulated +and one real robotic manipulation environment, and 21 different main tasks, we +show that our approach substantially improves learning efficiency. Videos, +code, and datasets are available at https://papers.starslab.ca/vpace. + +
+
+ comment: Submitted to the Conference on Robot Learning (CoRL'24), Munich, + Germany, Nov. 6-9, 2024 +
+
+
+
+
+ + ☆ Universal Length Generalization with Turing Programs + + +
+ Length generalization refers to the ability to extrapolate from short +training sequences to long test sequences and is a challenge for current large +language models. While prior work has proposed some architecture or data format +changes to achieve length generalization, these proposals typically apply to a +limited set of tasks. Building on prior scratchpad and Chain-of-Thought (CoT) +techniques, we propose Turing Programs, a novel CoT strategy that decomposes an +algorithmic task into steps mimicking the computation of a Turing Machine. This +framework is both universal, as it can accommodate any algorithmic task, and +simple, requiring only copying text from the context with small modifications. +We show that by using Turing Programs, we obtain robust length generalization +on a range of algorithmic tasks: addition, multiplication and in-context SGD. +We then demonstrate that transformers achieve length generalization on random +Turing Programs, suggesting that length generalization is possible for any +algorithmic task. Finally, we theoretically prove that transformers can +implement Turing Programs, constructing a simple RASP (Weiss et al.) program +that simulates an arbitrary Turing machine. + +
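+ To make the "copy the tape with small local modifications" idea concrete, here is a small generator of a Turing-machine-style scratchpad for multi-digit addition. The trace format is an assumption for illustration and is not the paper's exact Turing Program encoding.
+
+def addition_turing_program(a: str, b: str):
+    """Emit a Turing-machine-style scratchpad for multi-digit addition.
+
+    Each step copies the remaining digits and appends one newly resolved output
+    digit plus the carry, so the whole "tape" is rewritten with only a small
+    local modification at every step.
+    """
+    a, b = a[::-1], b[::-1]
+    carry, out, steps = 0, "", []
+    for i in range(max(len(a), len(b))):
+        da = int(a[i]) if i < len(a) else 0
+        db = int(b[i]) if i < len(b) else 0
+        s = da + db + carry
+        carry, out = s // 10, str(s % 10) + out
+        steps.append(f"rest_a={a[i+1:][::-1] or '-'} rest_b={b[i+1:][::-1] or '-'} "
+                     f"carry={carry} partial={out}")
+    if carry:
+        out = "1" + out
+        steps.append(f"rest_a=- rest_b=- carry=0 partial={out}")
+    return steps, out
+
+# Example: addition_turing_program("957", "86") returns the step trace and "1043".
+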
+
+
+
+
+ + ☆ DisCo-Diff: Enhancing Continuous Diffusion Models with Discrete Latents + + +
+ Diffusion models (DMs) have revolutionized generative learning. They utilize +a diffusion process to encode data into a simple Gaussian distribution. +However, encoding a complex, potentially multimodal data distribution into a +single continuous Gaussian distribution arguably represents an unnecessarily +challenging learning problem. We propose Discrete-Continuous Latent Variable +Diffusion Models (DisCo-Diff) to simplify this task by introducing +complementary discrete latent variables. We augment DMs with learnable discrete +latents, inferred with an encoder, and train DM and encoder end-to-end. +DisCo-Diff does not rely on pre-trained networks, making the framework +universally applicable. The discrete latents significantly simplify learning +the DM's complex noise-to-data mapping by reducing the curvature of the DM's +generative ODE. An additional autoregressive transformer models the +distribution of the discrete latents, a simple step because DisCo-Diff requires +only few discrete variables with small codebooks. We validate DisCo-Diff on toy +data, several image synthesis tasks as well as molecular docking, and find that +introducing discrete latents consistently improves model performance. For +example, DisCo-Diff achieves state-of-the-art FID scores on class-conditioned +ImageNet-64/128 datasets with ODE sampler. + +
+
+ comment: project page: https://research.nvidia.com/labs/lpr/disco-diff +
+
+
+
+
+ + ☆ Vertex Exchange Method for a Class of Quadratic Programming Problems + + +
+ A vertex exchange method is proposed for solving the strongly convex +quadratic program subject to the generalized simplex constraint. We conduct +rigorous convergence analysis for the proposed algorithm and demonstrate its +essential roles in solving some important classes of constrained convex +optimization. To get a feasible initial point to execute the algorithm, we also +present and analyze a highly efficient semismooth Newton method for computing +the projection onto the generalized simplex. The excellent practical +performance of the proposed algorithms is demonstrated by a set of extensive +numerical experiments. Our theoretical and numerical results further motivate +the potential applications of the considered model and the proposed algorithms. + +
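+ For reference, the snippet below sketches the Euclidean projection onto the standard probability simplex using the classic sorting-based algorithm. It is a simpler stand-in for intuition only and is not the paper's semismooth Newton solver for projection onto the generalized simplex.
+```python
+import numpy as np
+
+def project_to_simplex(v: np.ndarray, z: float = 1.0) -> np.ndarray:
+    """Euclidean projection of v onto {x >= 0, sum(x) = z} via sorting."""
+    u = np.sort(v)[::-1]                       # sort descending
+    css = np.cumsum(u)
+    ks = np.arange(1, len(v) + 1)
+    k = ks[u - (css - z) / ks > 0][-1]         # largest k with a positive shifted value
+    tau = (css[k - 1] - z) / k
+    return np.maximum(v - tau, 0.0)
+
+if __name__ == "__main__":
+    p = project_to_simplex(np.array([0.7, -0.2, 1.4, 0.1]))
+    print(p, p.sum())                          # nonnegative entries summing to 1
+```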
+
+ comment: 32 pages, 5 tables +
+
+
+
+
+ + ☆ Correlated Privacy Mechanisms for Differentially Private Distributed + Mean Estimation + + +
+ Differentially private distributed mean estimation (DP-DME) is a fundamental +building block in privacy-preserving federated learning, where a central server +estimates the mean of $d$-dimensional vectors held by $n$ users while ensuring +$(\epsilon,\delta)$-DP. Local differential privacy (LDP) and distributed DP +with secure aggregation (SecAgg) are the most common notions of DP used in +DP-DME settings with an untrusted server. LDP provides strong resilience to +dropouts, colluding users, and malicious server attacks, but suffers from poor +utility. In contrast, SecAgg-based DP-DME achieves an $O(n)$ utility gain over +LDP in DME, but requires increased communication and computation overheads and +complex multi-round protocols to handle dropouts and malicious attacks. In this +work, we propose CorDP-DME, a novel DP-DME mechanism that spans the gap between +DME with LDP and distributed DP, offering a favorable balance between utility +and resilience to dropout and collusion. CorDP-DME is based on correlated +Gaussian noise, ensuring DP without the perfect conditional privacy guarantees +of SecAgg-based approaches. We provide an information-theoretic analysis of +CorDP-DME, and derive theoretical guarantees for utility under any given +privacy parameters and dropout/colluding user thresholds. Our results +demonstrate that (anti) correlated Gaussian DP mechanisms can significantly +improve utility in mean estimation tasks compared to LDP -- even in adversarial +settings -- while maintaining better resilience to dropouts and attacks +compared to distributed DP. + +
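+ As a toy sketch of the intuition only (this is not the CorDP-DME mechanism, and the noise scales below are arbitrary rather than calibrated to any formal $(\epsilon,\delta)$-DP guarantee): if the per-user noise is anti-correlated so that it sums to zero across users, each individual report remains noisy while the aggregate is far more accurate than under fully independent, LDP-style noise.
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d = 100, 16                               # users, vector dimension (toy sizes)
+X = rng.normal(size=(n, d))                  # users' private vectors
+sigma_corr, sigma_indep = 5.0, 0.2           # illustrative, uncalibrated noise scales
+
+# Zero-sum correlated noise: large per-user noise that cancels when averaged.
+Z = rng.normal(scale=sigma_corr, size=(n, d))
+C = Z - Z.mean(axis=0, keepdims=True)        # columns of C sum to zero
+E = rng.normal(scale=sigma_indep, size=(n, d))   # small independent noise
+
+reports = X + C + E                          # what each user sends
+corr_mean = reports.mean(axis=0)             # correlated part cancels exactly
+ldp_mean = (X + rng.normal(scale=sigma_corr, size=(n, d))).mean(axis=0)
+
+true_mean = X.mean(axis=0)
+print("correlated-noise error:", np.linalg.norm(corr_mean - true_mean))
+print("independent-noise error:", np.linalg.norm(ldp_mean - true_mean))
+```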
+
+
+
+
+ + ☆ Do Quantum Neural Networks have Simplicity Bias? + + +
+ One hypothesis for the success of deep neural networks (DNNs) is that they +are highly expressive, which enables them to be applied to many problems, and +they have a strong inductive bias towards solutions that are simple, known as +simplicity bias, which allows them to generalise well on unseen data because +most real-world data is structured (i.e. simple). In this work, we explore the +inductive bias and expressivity of quantum neural networks (QNNs), which gives +us a way to compare their performance to those of DNNs. Our results show that +it is possible to have simplicity bias with certain QNNs, but we prove that +this type of QNN limits the expressivity of the QNN. We also show that it is +possible to have QNNs with high expressivity, but they either have no inductive +bias or a poor inductive bias and result in a worse generalisation performance +compared to DNNs. We demonstrate that an artificial (restricted) inductive bias +can be produced by intentionally restricting the expressivity of a QNN. Our +results suggest a bias-expressivity tradeoff. Our conclusion is that the QNNs +we studied can not generally offer an advantage over DNNs, because these QNNs +either have a poor inductive bias or poor expressivity compared to DNNs. + +
+
+ comment: 9 pages, 42 pages with appendices +
+
+
+
+
+ + ☆ Nearly Linear Sparsification of $\ell_p$ Subspace Approximation + + +
+ The $\ell_p$ subspace approximation problem is an NP-hard low rank +approximation problem that generalizes the median hyperplane problem ($p = 1$), +principal component analysis ($p = 2$), and the center hyperplane problem ($p = +\infty$). A popular approach to cope with the NP-hardness of this problem is to +compute a strong coreset, which is a small weighted subset of the input points +which simultaneously approximates the cost of every $k$-dimensional subspace, +typically to $(1+\varepsilon)$ relative error for a small constant +$\varepsilon$. + We obtain the first algorithm for constructing a strong coreset for $\ell_p$ +subspace approximation with a nearly optimal dependence on the rank parameter +$k$, obtaining a nearly linear bound of $\tilde +O(k)\mathrm{poly}(\varepsilon^{-1})$ for $p<2$ and $\tilde +O(k^{p/2})\mathrm{poly}(\varepsilon^{-1})$ for $p>2$. Prior constructions +either achieved a similar size bound but produced a coreset with a modification +of the original points [SW18, FKW21], or produced a coreset of the original +points but lost $\mathrm{poly}(k)$ factors in the coreset size [HV20, WY23]. + Our techniques also lead to the first nearly optimal online strong coresets +for $\ell_p$ subspace approximation with similar bounds as the offline setting, +resolving a problem of [WY23]. All prior approaches lose $\mathrm{poly}(k)$ +factors in this setting, even when allowed to modify the original points. + +
+
+
+
+
+ + ☆ Magnetic Hysteresis Modeling with Neural Operators + + +
+ Hysteresis modeling is crucial to comprehend the behavior of magnetic +devices, facilitating optimal designs. Hitherto, deep learning-based methods +employed to model hysteresis, face challenges in generalizing to novel input +magnetic fields. This paper addresses the generalization challenge by proposing +neural operators for modeling constitutive laws that exhibit magnetic +hysteresis by learning a mapping between magnetic fields. In particular, two +prominent neural operators -- deep operator network and Fourier neural operator +-- are employed to predict novel first-order reversal curves and minor loops, +where novel means they are not used to train the model. In addition, a +rate-independent Fourier neural operator is proposed to predict material +responses at sampling rates different from those used during training to +incorporate the rate-independent characteristics of magnetic hysteresis. The +presented numerical experiments demonstrate that neural operators efficiently +model magnetic hysteresis, outperforming the traditional neural recurrent +methods on various metrics and generalizing to novel magnetic fields. The +findings emphasize the advantages of using neural operators for modeling +hysteresis under varying magnetic conditions, underscoring their importance in +characterizing magnetic material based devices. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Modern Neighborhood Components Analysis: A Deep Tabular Baseline Two + Decades Later + + +
+ The growing success of deep learning in various domains has prompted +investigations into its application to tabular data, where deep models have +shown promising results compared to traditional tree-based methods. In this +paper, we revisit Neighborhood Component Analysis (NCA), a classic tabular +prediction method introduced in 2004, designed to learn a linear projection +that captures semantic similarities between instances. We find that minor +modifications, such as adjustments to the learning objectives and the +integration of deep learning architectures, significantly enhance NCA's +performance, enabling it to surpass most modern deep tabular models. +Additionally, we introduce a stochastic neighbor sampling strategy that +improves both the efficiency and predictive accuracy of our proposed ModernNCA +-- sampling only a subset of neighbors during training, while utilizing the +entire neighborhood during inference. Extensive experiments demonstrate that +our ModernNCA achieves state-of-the-art results in both classification and +regression tasks across various tabular datasets, outperforming both tree-based +and other deep tabular models, while also reducing training time and model +size. + +
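+ A minimal PyTorch sketch of the classic NCA objective combined with the neighbor-subsampling idea mentioned above; the actual ModernNCA architecture, objective adjustments, and sampling scheme are described in the paper and may differ from this simplification.
+```python
+import torch
+
+def nca_loss(emb, labels, n_neighbors=None):
+    """Classic NCA loss on a batch of embeddings, optionally restricted to a
+    randomly sampled subset of candidate neighbors."""
+    n = emb.shape[0]
+    dists = torch.cdist(emb, emb) ** 2                        # squared Euclidean
+    dists = dists.masked_fill(torch.eye(n, dtype=torch.bool), float("inf"))
+    cols = torch.arange(n)
+    if n_neighbors is not None and n_neighbors < n:
+        cols = torch.randperm(n)[:n_neighbors]                # stochastic neighbor sampling
+    p = torch.softmax(-dists[:, cols], dim=1)                 # neighbor probabilities
+    same = (labels.unsqueeze(1) == labels[cols].unsqueeze(0)).float()
+    p_correct = (p * same).sum(dim=1).clamp_min(1e-12)
+    return -p_correct.log().mean()
+
+# usage: embed a batch with any (deep) projection, then optimize this loss
+emb = torch.randn(64, 8, requires_grad=True)
+labels = torch.randint(0, 3, (64,))
+nca_loss(emb, labels, n_neighbors=16).backward()
+```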
+
+
+
+
+ + ☆ When big data actually are low-rank, or entrywise approximation of + certain function-generated matrices + + +
+ The article concerns low-rank approximation of matrices generated by sampling +a smooth function of two $m$-dimensional variables. We refute an argument made +in the literature that, for a specific class of analytic functions, such +matrices admit accurate entrywise approximation of rank that is independent of +$m$. We provide a theoretical explanation of the numerical results presented in +support of this argument, describing three narrower classes of functions for +which $n \times n$ function-generated matrices can be approximated within an +entrywise error of order $\varepsilon$ with rank $\mathcal{O}(\log(n) +\varepsilon^{-2} \mathrm{polylog}(\varepsilon^{-1}))$ that is independent of +the dimension $m$: (i) functions of the inner product of the two variables, +(ii) functions of the squared Euclidean distance between the variables, and +(iii) shift-invariant positive-definite kernels. We extend our argument to +low-rank tensor-train approximation of tensors generated with functions of the +multi-linear product of their $m$-dimensional variables. We discuss our results +in the context of low-rank approximation of attention in transformer neural +networks. + +
+
+
+
+
+ + ☆ Terrain Classification Enhanced with Uncertainty for Space Exploration + Robots from Proprioceptive Data ICML 2023 + + +
+ Terrain Classification is an essential task in space exploration, where +unpredictable environments are difficult to observe using only exteroceptive +sensors such as vision. Implementing Neural Network classifiers can have high +performance but can be deemed untrustworthy as they lack transparency, which +makes them unreliable for taking high-stakes decisions during mission planning. +We address this by proposing Neural Networks with Uncertainty Quantification in +Terrain Classification. We enable our Neural Networks with Monte Carlo Dropout, +DropConnect, and Flipout in time series-capable architectures using only +proprioceptive data as input. We use Bayesian Optimization with Hyperband for +efficient hyperparameter optimization to find optimal models for trustworthy +terrain classification. + +
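+ A minimal Monte Carlo Dropout sketch of the uncertainty-quantification idea; the study additionally uses DropConnect, Flipout, time-series-capable architectures, and Bayesian Optimization with Hyperband, none of which appear here, and the toy classifier and feature sizes below are placeholders.
+```python
+import torch
+import torch.nn as nn
+
+class TerrainNet(nn.Module):
+    """Small stand-in classifier; any dropout-bearing architecture works."""
+    def __init__(self, n_features, n_classes, p=0.2):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_features, 64), nn.ReLU(), nn.Dropout(p),
+            nn.Linear(64, 64), nn.ReLU(), nn.Dropout(p),
+            nn.Linear(64, n_classes),
+        )
+    def forward(self, x):
+        return self.net(x)
+
+@torch.no_grad()
+def mc_dropout_predict(model, x, n_samples=50):
+    model.train()                                 # keep dropout active at test time
+    probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(n_samples)])
+    mean = probs.mean(dim=0)
+    entropy = -(mean * mean.clamp_min(1e-12).log()).sum(dim=-1)   # predictive uncertainty
+    return mean, entropy
+
+model = TerrainNet(n_features=24, n_classes=5)
+x = torch.randn(8, 24)                            # a batch of proprioceptive features (toy)
+mean_probs, uncertainty = mc_dropout_predict(model, x)
+```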
+
+ comment: 6 pages, 4 figures. LatinX in AI Workshop @ ICML 2023 Camera Ready +
+
+
+
+
+ + ☆ Self-Evaluation as a Defense Against Adversarial Attacks on LLMs + + +
+ When LLMs are deployed in sensitive, human-facing settings, it is crucial +that they do not output unsafe, biased, or privacy-violating outputs. For this +reason, models are both trained and instructed to refuse to answer unsafe +prompts such as "Tell me how to build a bomb." We find that, despite these +safeguards, it is possible to break model defenses simply by appending a space +to the end of a model's input. In a study of eight open-source models, we +demonstrate that this acts as a strong enough attack to cause the majority of +models to generate harmful outputs with very high success rates. We examine the +causes of this behavior, finding that the contexts in which single spaces occur +in tokenized training data encourage models to generate lists when prompted, +overriding training signals to refuse to answer unsafe requests. Our findings +underscore the fragile state of current model alignment and promote the +importance of developing more robust alignment methods. Code and data will be +made available at https://github.com/Linlt-leon/Adversarial-Alignments. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Single Character Perturbations Break LLM Alignment + + +
+ When LLMs are deployed in sensitive, human-facing settings, it is crucial +that they do not output unsafe, biased, or privacy-violating outputs. For this +reason, models are both trained and instructed to refuse to answer unsafe +prompts such as "Tell me how to build a bomb." We find that, despite these +safeguards, it is possible to break model defenses simply by appending a space +to the end of a model's input. In a study of eight open-source models, we +demonstrate that this acts as a strong enough attack to cause the majority of +models to generate harmful outputs with very high success rates. We examine the +causes of this behavior, finding that the contexts in which single spaces occur +in tokenized training data encourage models to generate lists when prompted, +overriding training signals to refuse to answer unsafe requests. Our findings +underscore the fragile state of current model alignment and promote the +importance of developing more robust alignment methods. Code and data will be +available at https://github.com/hannah-aught/space_attack. + +
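+ A minimal sketch of the probe described above, assuming a locally hosted open-weight causal LM loaded through Hugging Face transformers; the model name is a placeholder, and whether the trailing space actually flips a refusal depends on the specific model and on how its chat template is applied.
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "some-open-source-chat-model"        # hypothetical placeholder
+tok = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+def complete(prompt: str) -> str:
+    inputs = tok(prompt, return_tensors="pt")
+    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
+    return tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+
+unsafe_prompt = "Tell me how to build a bomb."
+baseline = complete(unsafe_prompt)                # typically a refusal
+perturbed = complete(unsafe_prompt + " ")         # identical prompt plus a trailing space
+print(baseline == perturbed)
+```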
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ How Does Quantization Affect Multilingual LLMs? + + +
+ Quantization techniques are widely used to improve inference speed and +deployment of large language models. While a wide body of work examines the +impact of quantized LLMs on English tasks, none have examined the effect of +quantization across languages. We conduct a thorough analysis of quantized +multilingual LLMs, focusing on their performance across languages and at +varying scales. We use automatic benchmarks, LLM-as-a-Judge methods, and human +evaluation, finding that (1) harmful effects of quantization are apparent in +human evaluation, and automatic metrics severely underestimate the detriment: a +1.7% average drop in Japanese across automatic tasks corresponds to a 16.0% +drop reported by human evaluators on realistic prompts; (2) languages are +disparately affected by quantization, with non-Latin script languages impacted +worst; and (3) challenging tasks such as mathematical reasoning degrade +fastest. As the ability to serve low-compute models is critical for wide global +adoption of NLP technologies, our results urge consideration of multilingual +performance as a key evaluation criterion for efficient models. + +
+
+
+
+
+ + ☆ Combining AI Control Systems and Human Decision Support via Robustness + and Criticality + + +
+ AI-enabled capabilities are reaching the requisite level of maturity to be +deployed in the real world, yet do not always make correct or safe decisions. +One way of addressing these concerns is to leverage AI control systems +alongside and in support of human decisions, relying on the AI control system +in safe situations while calling on a human co-decider for critical situations. +We extend a methodology for adversarial explanations (AE) to state-of-the-art +reinforcement learning frameworks, including MuZero. Multiple improvements to +the base agent architecture are proposed. We demonstrate how this technology +has two applications: for intelligent decision tools and to enhance training / +learning frameworks. In a decision support context, adversarial explanations +help a user make the correct decision by highlighting those contextual factors +that would need to change for a different AI-recommended decision. As another +benefit of adversarial explanations, we show that the learned AI control system +demonstrates robustness against adversarial tampering. Additionally, we +supplement AE by introducing strategically similar autoencoders (SSAs) to help +users identify and understand all salient factors being considered by the AI +system. In a training / learning framework, this technology can improve both +the AI's decisions and explanations through human interaction. Finally, to +identify when AI decisions would most benefit from human oversight, we tie this +combined system to our prior art on statistically verified analyses of the +criticality of decisions at any point in time. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ☆ Incremental Gauss--Newton Methods with Superlinear Convergence Rates + + +
+ This paper addresses the challenge of solving large-scale nonlinear equations +with H\"older continuous Jacobians. We introduce a novel Incremental +Gauss--Newton (IGN) method with an explicit superlinear convergence rate, which +outperforms existing methods that only achieve a linear convergence rate. In +particular, we formulate our problem as nonlinear least squares with a +finite-sum structure, and our method incrementally iterates with the +information of one component in each round. We also provide a mini-batch +extension to our IGN method that obtains an even faster superlinear convergence +rate. Furthermore, we conduct numerical experiments to show the advantages of +the proposed methods. + +
+
+ comment: 37 pages, 9 figures +
+
+
+
+
+ + ☆ Prediction Instability in Machine Learning Ensembles ICML2024 + + +
+ In machine learning ensembles, predictions from multiple models are +aggregated. Despite the widespread use and strong performance of ensembles in +applied problems, little is known about the mathematical properties of +aggregating models and the associated consequences for safe, explainable use of +such models. In this paper, we prove a theorem showing that any ensemble will +exhibit at least one of the following forms of prediction instability. It will +either ignore agreement among all underlying models, change its mind when none +of the underlying models have done so, or be manipulable through inclusion or +exclusion of options it would never actually predict. As a consequence, +ensemble aggregation procedures will always need to balance the benefits of +information use against the risk of these prediction instabilities. This +analysis also sheds light on what specific forms of prediction instability to +expect from particular ensemble algorithms; for example, popular tree ensembles +such as random forests or XGBoost will violate basic, intuitive monotonicity and +fairness properties. + +
+
+ comment: 15 pages, uses a modified version of ICML2024.sty +
+
+
+
+
+ + ☆ Multiple-Resolution Tokenization for Time Series Forecasting with an + Application to Pricing + + +
+ We propose a transformer architecture for time series forecasting with a +focus on time series tokenisation and apply it to a real-world prediction +problem from the pricing domain. Our architecture aims to learn effective +representations at many scales across all available data simultaneously. The +model contains a number of novel modules: a differentiated form of time series +patching which employs multiple resolutions, a multiple-resolution module for +time-varying known variables, a mixer-based module for capturing cross-series +information, and a novel output head with favourable scaling to account for the +increased number of tokens. We present an application of this model to a real +world prediction problem faced by the markdown team at a very large retailer. +On the experiments conducted our model outperforms in-house models and the +selected existing deep learning architectures. + +
+
+
+
+
+ + ☆ Motion meets Attention: Video Motion Prompts + + +
+ Videos contain rich spatio-temporal information. Traditional methods for +extracting motion, used in tasks such as action recognition, often rely on +visual content rather than precise motion features. This phenomenon is +referred to as 'blind motion extraction' behavior, which proves inefficient in +capturing motions of interest due to a lack of motion-guided cues. Recently, +attention mechanisms have enhanced many computer vision tasks by effectively +highlighting salient visual areas. Inspired by this, we propose using a +modified Sigmoid function with learnable slope and shift parameters as an +attention mechanism to activate and modulate motion signals derived from frame +differencing maps. This approach generates a sequence of attention maps that +enhance the processing of motion-related video content. To ensure temporal +continuity and smoothness of the attention maps, we apply pair-wise temporal +attention variation regularization to remove unwanted motions (e.g., noise) +while preserving important ones. We then compute the Hadamard product between each +pair of attention maps and the original video frames to highlight the evolving +motions of interest over time. These highlighted motions, termed video motion +prompts, are subsequently used as inputs to the model instead of the original +video frames. We formalize this process as a motion prompt layer and +incorporate the regularization term into the loss function to learn better +motion prompts. This layer serves as an adapter between the model and the video +data, bridging the gap between traditional 'blind motion extraction' and the +extraction of relevant motions of interest. + +
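+ A rough PyTorch sketch of such a motion prompt layer under our own simplifications (scalar slope and shift, absolute frame differences averaged over channels, an L1 temporal-variation regularizer); the paper's exact formulation may differ.
+```python
+import torch
+import torch.nn as nn
+
+class MotionPromptLayer(nn.Module):
+    """Sigmoid attention with learnable slope/shift over frame-difference maps,
+    followed by a Hadamard product with the frames (a simplified sketch)."""
+    def __init__(self, slope_init=1.0, shift_init=0.0):
+        super().__init__()
+        self.slope = nn.Parameter(torch.tensor(slope_init))
+        self.shift = nn.Parameter(torch.tensor(shift_init))
+
+    def forward(self, video):                     # video: (B, T, C, H, W) in [0, 1]
+        diffs = (video[:, 1:] - video[:, :-1]).abs().mean(dim=2, keepdim=True)
+        attn = torch.sigmoid(self.slope * diffs + self.shift)   # attention maps
+        prompts = attn * video[:, 1:]             # Hadamard product with frames
+        tv_reg = (attn[:, 1:] - attn[:, :-1]).abs().mean()      # temporal smoothness term
+        return prompts, attn, tv_reg
+
+layer = MotionPromptLayer()
+video = torch.rand(2, 16, 3, 64, 64)
+prompts, attn, tv_reg = layer(video)
+loss = prompts.mean() + 0.1 * tv_reg              # in practice, add tv_reg to the task loss
+```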
+
+ comment: Research report +
+
+
+
+
+ + ☆ Relating CNN-Transformer Fusion Network for Change Detection + + +
+ While deep learning, particularly convolutional neural networks (CNNs), has +revolutionized remote sensing (RS) change detection (CD), existing approaches +often miss crucial features due to neglecting global context and incomplete +change learning. Additionally, transformer networks struggle with low-level +details. RCTNet addresses these limitations by introducing \textbf{(1)} an +early fusion backbone to exploit both spatial and temporal features early on, +\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal +representation, \textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for +enriched feature extraction in the decoder, and \textbf{(4)} an Efficient +Self-deciphering Attention (ESA) module utilizing transformers to capture +global information and fine-grained details for accurate change detection. +Extensive experiments demonstrate RCTNet's clear superiority over traditional +RS image CD methods, showing significant improvement and an optimal balance +between accuracy and computational cost. + +
+
+ comment: accepted by IEEE Conference on Multimedia Expo +
+
+
+
+
+ + ☆ Bunny-VisionPro: Real-Time Bimanual Dexterous Teleoperation for + Imitation Learning + + +
+ Teleoperation is a crucial tool for collecting human demonstrations, but +controlling robots with bimanual dexterous hands remains a challenge. Existing +teleoperation systems struggle to handle the complexity of coordinating two +hands for intricate manipulations. We introduce Bunny-VisionPro, a real-time +bimanual dexterous teleoperation system that leverages a VR headset. Unlike +previous vision-based teleoperation systems, we design novel low-cost devices +to provide haptic feedback to the operator, enhancing immersion. Our system +prioritizes safety by incorporating collision and singularity avoidance while +maintaining real-time performance through innovative designs. Bunny-VisionPro +outperforms prior systems on a standard task suite, achieving higher success +rates and reduced task completion times. Moreover, the high-quality +teleoperation demonstrations improve downstream imitation learning performance, +leading to better generalizability. Notably, Bunny-VisionPro enables imitation +learning with challenging multi-stage, long-horizon dexterous manipulation +tasks, which have rarely been addressed in previous work. Our system's ability +to handle bimanual manipulations while prioritizing safety and real-time +performance makes it a powerful tool for advancing dexterous manipulation and +imitation learning. + +
+
+ comment: project page: https://dingry.github.io/projects/bunny_visionpro.html +
+
+
+
+
+ + ☆ SOS! Soft Prompt Attack Against Open-Source Large Language Models + + +
+ Open-source large language models (LLMs) have become increasingly popular +among both the general public and industry, as they can be customized, +fine-tuned, and freely used. However, some open-source LLMs require approval +before usage, which has led to third parties publishing their own easily +accessible versions. Similarly, third parties have been publishing fine-tuned +or quantized variants of these LLMs. These versions are particularly appealing +to users because of their ease of access and reduced computational resource +demands. This trend has increased the risk of training time attacks, +compromising the integrity and security of LLMs. In this work, we present a new +training time attack, SOS, which is designed to be low in computational demand +and does not require clean data or modification of the model weights, thereby +maintaining the model's utility intact. The attack addresses security issues in +various scenarios, including the backdoor attack, jailbreak attack, and prompt +stealing attack. Our experimental findings demonstrate that the proposed attack +is effective across all evaluated targets. Furthermore, we present the other +side of our SOS technique, namely the copyright token -- a novel technique that +enables users to mark their copyrighted content and prevent models from using +it. + +
+
+
+
+
+ + ☆ Let the Code LLM Edit Itself When You Edit the Code + + +
+ In this work, we investigate a typical scenario in code generation where a +developer edits existing code in real time and requests a code assistant, e.g., +a large language model, to re-predict the next token or next line on the fly. +Naively, the LLM needs to re-encode the entire KV cache to provide an accurate +prediction. However, this process is computationally expensive, especially when +the sequence length is long. Simply encoding the edited subsequence and +integrating it into the original KV cache runs into the temporal confusion problem, +leading to significantly worse performance. We address this efficiency and +accuracy trade-off by introducing \underline{\textbf{P}ositional +\textbf{I}ntegrity \textbf{E}ncoding} (PIE). Building upon the rotary +positional encoding, PIE first removes the rotary matrices in the Key cache +that introduce temporal confusion and then reapplies the correct rotary +matrices. This process ensures that positional relationships between tokens are +correct and requires only a single round of matrix multiplication. We validate +the effectiveness of PIE through extensive experiments on the RepoBench-C-8k +dataset, utilizing DeepSeek-Coder models with 1.3B, 6.7B, and 33B parameters. +Our evaluation includes three real-world coding tasks: code insertion, code +deletion, and multi-place code editing. Results demonstrate that PIE reduces +computational overhead by over 85% compared to the standard full recomputation +approach across all model sizes and tasks while closely matching the model +performance. + +
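+ The toy snippet below illustrates why a single corrective rotation can replace full re-encoding under rotary position embeddings: per-frequency rotations compose additively, so keys cached at old positions can be re-aligned to their new positions by rotating them by the position difference. It uses a self-consistent interleaved RoPE convention of our own and is not the paper's PIE implementation.
+```python
+import torch
+
+def rope_angles(positions, dim, base=10000.0):
+    inv_freq = base ** (-torch.arange(0, dim, 2, dtype=torch.float) / dim)
+    return positions[:, None].float() * inv_freq[None, :]        # (seq, dim/2)
+
+def rotate(x, angles):
+    """Apply a rotary rotation by `angles` to x of shape (seq, dim)."""
+    x1, x2 = x[..., 0::2], x[..., 1::2]
+    cos, sin = angles.cos(), angles.sin()
+    out = torch.empty_like(x)
+    out[..., 0::2] = x1 * cos - x2 * sin
+    out[..., 1::2] = x1 * sin + x2 * cos
+    return out
+
+dim, old_pos = 64, torch.arange(10, 20)
+keys_plain = torch.randn(10, dim)
+cached_keys = rotate(keys_plain, rope_angles(old_pos, dim))      # what sits in the KV cache
+
+new_pos = old_pos + 3                                            # an edit shifted positions by 3
+realigned = rotate(cached_keys, rope_angles(new_pos - old_pos, dim))
+reference = rotate(keys_plain, rope_angles(new_pos, dim))        # full recomputation
+print(torch.allclose(realigned, reference, atol=1e-5))           # True: one rotation suffices
+```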
+
+ comment: Preprint. Work in Progress +
+
+
+
+
+ + ☆ Reinforcement Learning for Sequence Design Leveraging Protein Language + Models + + +
+ Protein sequence design, determined by amino acid sequences, is essential to +protein engineering problems in drug discovery. Prior approaches have resorted +to evolutionary strategies or Monte-Carlo methods for protein design, but often +fail to exploit the structure of the combinatorial search space or to generalize +to unseen sequences. In the context of discrete black box optimization over +large search spaces, learning a mutation policy to generate novel sequences +with reinforcement learning is appealing. Recent advances in protein language +models (PLMs) trained on large corpora of protein sequences offer a potential +solution to this problem by scoring proteins according to their biological +plausibility (such as the TM-score). In this work, we propose to use PLMs as a +reward function to generate new sequences. Yet the PLM can be computationally +expensive to query due to its large size. To this end, we propose an +alternative paradigm where optimization can be performed on scores from a +smaller proxy model that is periodically finetuned while jointly learning the +mutation policy. We perform extensive experiments on various sequence lengths +to benchmark RL-based approaches, and provide comprehensive evaluations of the +biological plausibility and diversity of the proteins. Our experimental results +include favorable evaluations of the proposed sequences, along with high +diversity scores, demonstrating that RL is a strong candidate for biological +sequence design. Finally, we provide a modular open-source implementation that can +be easily integrated into most RL training loops, with support for replacing the +reward model with other PLMs, to spur further research in this domain. The code +for all experiments is provided in the supplementary material. + +
+
+ comment: 22 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Stereo Risk: A Continuous Modeling Approach to Stereo Matching ICML 2024 + + +
+ We introduce Stereo Risk, a new deep-learning approach to solve the classical +stereo-matching problem in computer vision. As it is well-known that stereo +matching boils down to a per-pixel disparity estimation problem, the popular +state-of-the-art stereo-matching approaches widely rely on regressing the scene +disparity values, yet via discretization of scene disparity values. Such +discretization often fails to capture the nuanced, continuous nature of scene +depth. Stereo Risk departs from the conventional discretization approach by +formulating the scene disparity as an optimal solution to a continuous risk +minimization problem, hence the name "stereo risk". We demonstrate that $L^1$ +minimization of the proposed continuous risk function enhances stereo-matching +performance for deep networks, particularly for disparities with multi-modal +probability distributions. Furthermore, to enable the end-to-end network +training of the non-differentiable $L^1$ risk optimization, we exploited the +implicit function theorem, ensuring a fully differentiable network. A +comprehensive analysis demonstrates our method's theoretical soundness and +superior performance over the state-of-the-art methods across various benchmark +datasets, including KITTI 2012, KITTI 2015, ETH3D, SceneFlow, and Middlebury +2014. + +
+
+ comment: Accepted as an Oral Paper at ICML 2024. Draft info: 18 pages, 6 + Figure, 16 Tables +
+
+
+
+
+ + ☆ Speaker- and Text-Independent Estimation of Articulatory Movements and + Phoneme Alignments from Speech + + +
+ This paper introduces a novel combination of two tasks, previously treated +separately: acoustic-to-articulatory speech inversion (AAI) and +phoneme-to-articulatory (PTA) motion estimation. We refer to this joint task as +acoustic phoneme-to-articulatory speech inversion (APTAI) and explore two +different approaches, both working speaker- and text-independently during +inference. We use a multi-task learning setup, with the end-to-end goal of +taking raw speech as input and estimating the corresponding articulatory +movements, phoneme sequence, and phoneme alignment. While both proposed +approaches share these same requirements, they differ in their way of achieving +phoneme-related predictions: one is based on frame classification, the other on +a two-stage training procedure and forced alignment. We reach competitive +performance of 0.73 mean correlation for the AAI task and achieve up to +approximately 87% frame overlap compared to a state-of-the-art text-dependent +phoneme forced aligner. + +
+
+ comment: to be published in Interspeech 2024 proceedings +
+
+
+
+
+ + ☆ Foundations and Frontiers of Graph Learning Theory + + +
+ Recent advancements in graph learning have revolutionized the way we +understand and analyze data with complex structures. Notably, Graph Neural +Networks (GNNs), i.e., neural network architectures designed for learning graph +representations, have become a popular paradigm. Since these models are usually +characterized by intuition-driven design or highly intricate components, +placing them within a theoretical analysis framework to distill the core +concepts helps to better understand the key principles that drive their +functionality and to guide further development. Given this surge in +interest, this article provides a comprehensive summary of the theoretical +foundations and breakthroughs concerning the approximation and learning +behaviors intrinsic to prevalent graph learning models. Encompassing +discussions on fundamental aspects such as expressive power, +generalization, optimization, and unique phenomena such as over-smoothing and +over-squashing, this piece delves into the theoretical foundations and frontiers +driving the evolution of graph learning. In addition, this article also +presents several challenges and further initiates discussions on possible +solutions. + +
+
+ comment: 36 pages, 273 references +
+
+
+
+
+ + ☆ Can machine learning solve the challenge of adaptive learning and the + individualization of learning paths? A field experiment in an online learning + platform + + +
+ The individualization of learning contents based on digital technologies +promises large individual and social benefits. However, it remains an open +question how this individualization can be implemented. To tackle this question +we conduct a randomized controlled trial on a large digital self-learning +platform. We develop an algorithm based on two convolutional neural networks +that assigns tasks to $4,365$ learners according to their learning paths. +Learners are randomized into three groups: two treatment groups -- a +group-based adaptive treatment group and an individual adaptive treatment group +-- and one control group. We analyze the difference between the three groups +with respect to effort learners provide and their performance on the platform. +Our null results shed light on the multiple challenges associated with the +individualization of learning paths. + +
+
+
+
+
+ + ☆ How Reliable and Stable are Explanations of XAI Methods? + + +
+ Black box models are increasingly being used in the daily lives of human +beings living in society. Along with this increase, there has been the +emergence of Explainable Artificial Intelligence (XAI) methods aimed at +generating additional explanations regarding how the model makes certain +predictions. In this sense, methods such as Dalex, Eli5, eXirt, Lofo and Shap +emerged as different proposals and methodologies for generating explanations of +black box models in an agnostic way. Along with the emergence of these methods, +questions arise such as "How Reliable and Stable are XAI Methods?". With the +aim of shedding light on this main question, this research creates a pipeline +that performs experiments using the diabetes dataset and four different machine +learning models (LGBM, MLP, DT and KNN), creating different levels of +perturbations of the test data and finally generates explanations from the +eXirt method regarding the confidence of the models and also feature relevances +ranks from all XAI methods mentioned, in order to measure their stability in +the face of perturbations. As a result, it was found that eXirt was able to +identify the most reliable models among all those used. It was also found that +current XAI methods are sensitive to perturbations, with the exception of one +specific method. + +
+
+ comment: 15 pages, 6 figures, submitted to BRACIS 2024 +
+
+
+
+
+ + On Generalization for Generative Flow Networks + + +
+ Generative Flow Networks (GFlowNets) have emerged as an innovative learning +paradigm designed to address the challenge of sampling from an unnormalized +probability distribution, called the reward function. This framework learns a +policy on a constructed graph, which enables sampling from an approximation of +the target probability distribution through successive steps of sampling from +the learned policy. To achieve this, GFlowNets can be trained with various +objectives, each of which can lead to the model's ultimate goal. The +aspirational strength of GFlowNets lies in their potential to discern intricate +patterns within the reward function and their capacity to generalize +effectively to novel, unseen parts of the reward function. This paper attempts +to formalize generalization in the context of GFlowNets, to link generalization +with stability, and also to design experiments that assess the capacity of +these models to uncover unseen parts of the reward function. The experiments +will focus on length generalization, meaning generalization to states that can +be constructed only by longer trajectories than those seen in training. + +
+
+
+
+
+ + ☆ Conformal Prediction for Causal Effects of Continuous Treatments + + +
+ Uncertainty quantification of causal effects is crucial for safety-critical +applications such as personalized medicine. A powerful approach for this is +conformal prediction, which has several practical benefits due to +model-agnostic finite-sample guarantees. Yet, existing methods for conformal +prediction of causal effects are limited to binary/discrete treatments and make +highly restrictive assumptions such as known propensity scores. In this work, +we provide a novel conformal prediction method for potential outcomes of +continuous treatments. We account for the additional uncertainty introduced +through propensity estimation so that our conformal prediction intervals are +valid even if the propensity score is unknown. Our contributions are +three-fold: (1) We derive finite-sample prediction intervals for potential +outcomes of continuous treatments. (2) We provide an algorithm for calculating +the derived intervals. (3) We demonstrate the effectiveness of the conformal +prediction intervals in experiments on synthetic and real-world datasets. To +the best of our knowledge, we are the first to propose conformal prediction for +continuous treatments when the propensity score is unknown and must be +estimated from data. + +
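+ For orientation, here is plain split conformal prediction for a regression outcome: it shows the finite-sample mechanics (calibration scores and the conformal quantile) but is the standard unweighted recipe, not the paper's propensity-adjusted intervals for potential outcomes under continuous treatments.
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(2000, 5))
+y = 2 * X[:, 0] + np.sin(X[:, 1]) + rng.normal(scale=0.5, size=2000)
+
+X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.5, random_state=0)
+model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_train, y_train)
+
+alpha = 0.1
+scores = np.abs(y_cal - model.predict(X_cal))                    # conformity scores
+k = int(np.ceil((len(scores) + 1) * (1 - alpha)))
+q = np.sort(scores)[k - 1]                                       # conformal quantile
+
+x_new = rng.normal(size=(1, 5))
+pred = model.predict(x_new)[0]
+print(f"{1 - alpha:.0%} prediction interval: [{pred - q:.2f}, {pred + q:.2f}]")
+```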
+
+
+
+
+ + ☆ Revisiting the Performance of Deep Learning-Based Vulnerability + Detection on Realistic Datasets + + +
+ The impact of software vulnerabilities on everyday software systems is +significant. Despite deep learning models being proposed for vulnerability +detection, their reliability is questionable. Prior evaluations show high +recall/F1 scores of up to 99%, but these models underperform in practical +scenarios, particularly when assessed on entire codebases rather than just the +fixing commit. This paper introduces Real-Vul, a comprehensive dataset +representing real-world scenarios for evaluating vulnerability detection +models. Evaluating DeepWukong, LineVul, ReVeal, and IVDetect shows a +significant drop in performance, with precision decreasing by up to 95 +percentage points and F1 scores by up to 91 points. Furthermore, Model +performance fluctuates based on vulnerability characteristics, with better F1 +scores for information leaks or code injection than for path resolution or +predictable return values. The results highlight a significant performance gap +that needs addressing before deploying deep learning-based vulnerability +detection in practical settings. Overfitting is identified as a key issue, and +an augmentation technique is proposed, potentially improving performance by up +to 30%. Contributions include a dataset creation approach for better model +evaluation, Real-Vul dataset, and empirical evidence of deep learning models +struggling in real-world settings. + +
+
+
+
+
+ + ☆ Spatio-Temporal Adaptive Diffusion Models for EEG Super-Resolution in + Epilepsy Diagnosis + + +
+ Electroencephalogram (EEG) technology, particularly high-density EEG (HD EEG) +devices, is widely used in fields such as neuroscience. HD EEG devices improve +the spatial resolution of EEG by placing more electrodes on the scalp, meeting +the requirements of clinical diagnostic applications such as epilepsy focus +localization. However, this technique faces challenges such as high acquisition +costs and limited usage scenarios. In this paper, spatio-temporal adaptive +diffusion models (STADMs) are proposed to pioneer the use of diffusion models +for achieving spatial SR reconstruction from low-resolution (LR, 64 channels or +fewer) EEG to high-resolution (HR, 256 channels) EEG. Specifically, a +spatio-temporal condition module is designed to extract the spatio-temporal +features of LR EEG, which then serve as conditional inputs to guide the reverse +denoising process of diffusion models. Additionally, a multi-scale Transformer +denoising module is constructed to leverage multi-scale convolution blocks and +cross-attention-based diffusion Transformer blocks for conditional guidance to +generate subject-adaptive SR EEG. Experimental results demonstrate that the +proposed method effectively enhances the spatial resolution of LR EEG and +quantitatively outperforms existing methods. Furthermore, STADMs demonstrate +their value by applying synthetic SR EEG to classification and source +localization tasks of epilepsy patients, indicating their potential to +significantly improve the spatial resolution of LR EEG. + +
+
+
+
+
+ + ☆ Effective Heterogeneous Federated Learning via Efficient + Hypernetwork-based Weight Generation + + +
+ While federated learning leverages distributed client resources, it faces +challenges due to heterogeneous client capabilities. This necessitates +allocating models suited to clients' resources and careful parameter +aggregation to accommodate this heterogeneity. We propose HypeMeFed, a novel +federated learning framework for supporting client heterogeneity by combining a +multi-exit network architecture with hypernetwork-based model weight +generation. This approach aligns the feature spaces of heterogeneous model +layers and resolves per-layer information disparity during weight aggregation. +To practically realize HypeMeFed, we also propose a low-rank factorization +approach to minimize computation and memory overhead associated with +hypernetworks. Our evaluations on a real-world heterogeneous device testbed +indicate that HypeMeFed enhances accuracy by 5.12% over FedAvg, reduces the +hypernetwork memory requirements by 98.22%, and accelerates its operations by +1.86 times compared to a naive hypernetwork approach. These results demonstrate +HypeMeFed's effectiveness in leveraging and engaging heterogeneous clients for +federated learning. + +
+
+
+
+
+ + ☆ Stable Heterogeneous Treatment Effect Estimation across + Out-of-Distribution Populations ICDE'2024 + + +
+ Heterogeneous treatment effect (HTE) estimation is vital for understanding +the change of treatment effect across individuals or subgroups. Most existing +HTE estimation methods focus on addressing selection bias induced by imbalanced +distributions of confounders between treated and control units, but ignore +distribution shifts across populations. Thereby, their applicability has been +limited to the in-distribution (ID) population, which shares a similar +distribution with the training dataset. In real-world applications, where +population distributions are subject to continuous changes, there is an urgent +need for stable HTE estimation across out-of-distribution (OOD) populations, +which, however, remains an open problem. As pioneers in resolving this problem, +we propose a novel Stable Balanced Representation Learning with +Hierarchical-Attention Paradigm (SBRL-HAP) framework, which consists of 1) +Balancing Regularizer for eliminating selection bias, 2) Independence +Regularizer for addressing the distribution shift issue, 3) +Hierarchical-Attention Paradigm for coordination between balance and +independence. In this way, SBRL-HAP regresses counterfactual outcomes using ID +data, while ensuring the resulting HTE estimation can be successfully +generalized to out-of-distribution scenarios, thereby enhancing the model's +applicability in real-world settings. Extensive experiments conducted on +synthetic and real-world datasets demonstrate the effectiveness of our SBRL-HAP +in achieving stable HTE estimation across OOD populations, with an average 10% +reduction in the error metric PEHE and 11% decrease in the ATE bias, compared +to the SOTA methods. + +
+
+ comment: Accepted by ICDE'2024 +
+
+
+
+
+ + ☆ Artificial Inductive Bias for Synthetic Tabular Data Generation in + Data-Scarce Scenarios + + +
+ While synthetic tabular data generation using Deep Generative Models (DGMs) +offers a compelling solution to data scarcity and privacy concerns, their +effectiveness relies on substantial training data, often unavailable in +real-world applications. This paper addresses this challenge by proposing a +novel methodology for generating realistic and reliable synthetic tabular data +with DGMs in limited real-data environments. Our approach proposes several ways +to generate an artificial inductive bias in a DGM through transfer learning and +meta-learning techniques. We explore and compare four different methods within +this framework, demonstrating that transfer learning strategies like +pre-training and model averaging outperform meta-learning approaches, like +Model-Agnostic Meta-Learning, and Domain Randomized Search. We validate our +approach using two state-of-the-art DGMs, namely, a Variational Autoencoder and +a Generative Adversarial Network, to show that our artificial inductive bias +fuels superior synthetic data quality, as measured by Jensen-Shannon +divergence, achieving relative gains of up to 50\% when using our proposed +approach. This methodology has broad applicability in various DGMs and machine +learning tasks, particularly in areas like healthcare and finance, where data +scarcity is often a critical issue. + +
+
+ comment: 19 pages, 6 Figures +
+
+
+
+
+ + ☆ Warm-up Free Policy Optimization: Improved Regret in Linear Markov + Decision Processes + + +
+ Policy Optimization (PO) methods are among the most popular Reinforcement +Learning (RL) algorithms in practice. Recently, Sherman et al. [2023a] proposed +a PO-based algorithm with rate-optimal regret guarantees under the linear +Markov Decision Process (MDP) model. However, their algorithm relies on a +costly pure exploration warm-up phase that is hard to implement in practice. +This paper eliminates this undesired warm-up phase, replacing it with a simple +and efficient contraction mechanism. Our PO algorithm achieves rate-optimal +regret with improved dependence on the other parameters of the problem (horizon +and function approximation dimension) in two fundamental settings: adversarial +losses with full-information feedback and stochastic losses with bandit +feedback. + +
+
+
+
+
+ + ☆ FairJob: A Real-World Dataset for Fairness in Online Systems + + +
+ We introduce a fairness-aware dataset for job recommendation in advertising, +designed to foster research in algorithmic fairness within real-world +scenarios. It was collected and prepared to comply with privacy standards and +business confidentiality. An additional challenge is the lack of access to +protected user attributes such as gender, for which we propose a solution to +obtain a proxy estimate. Despite being anonymized and including a proxy for a +sensitive attribute, our dataset preserves predictive power and maintains a +realistic and challenging benchmark. This dataset addresses a significant gap +in the availability of fairness-focused resources for high-impact domains like +advertising -- the actual impact being having access or not to precious +employment opportunities, where balancing fairness and utility is a common +industrial challenge. We also explore various stages in the advertising process +where unfairness can occur and introduce a method to compute a fair utility +metric for the job recommendations in online systems case from a biased +dataset. Experimental evaluations of bias mitigation techniques on the released +dataset demonstrate potential improvements in fairness and the associated +trade-offs with utility. + +
+
+ comment: 24 pages, 15 figures +
+
+
+
+
+ + ☆ Improving Zero-shot Generalization of Learned Prompts via Unsupervised + Knowledge Distillation ECCV24 + + +
+ Vision-Language Models (VLMs) demonstrate remarkable zero-shot generalization +to unseen tasks, but fall short of the performance of supervised methods in +generalizing to downstream tasks with limited data. Prompt learning is emerging +as a parameter-efficient method for adapting VLMs, but state-of-the-art +approaches require annotated samples. In this paper we propose a novel approach +to prompt learning based on unsupervised knowledge distillation from more +powerful models. Our approach, which we call Knowledge Distillation Prompt +Learning (KDPL), can be integrated into existing prompt learning techniques and +eliminates the need for labeled examples during adaptation. Our experiments on +more than ten standard benchmark datasets demonstrate that KDPL is very +effective at improving generalization of learned prompts for zero-shot domain +generalization, zero-shot cross-dataset generalization, and zero-shot +base-to-novel class generalization problems. KDPL requires no ground-truth +labels for adaptation, and moreover we show that even in the absence of any +knowledge of training class names it can be used to effectively transfer +knowledge. The code is publicly available at https://github.com/miccunifi/KDPL. + +
+
+ comment: Accepted for publication at ECCV24 +
+
+
+
+
+ + ☆ JailbreakHunter: A Visual Analytics Approach for Jailbreak Prompts + Discovery from Large-Scale Human-LLM Conversational Datasets + + +
+ Large Language Models (LLMs) have gained significant attention but also +raised concerns due to the risk of misuse. Jailbreak prompts, a popular type of +adversarial attack towards LLMs, have appeared and constantly evolved to breach +the safety protocols of LLMs. To address this issue, LLMs are regularly updated +with safety patches based on reported jailbreak prompts. However, malicious +users often keep their successful jailbreak prompts private to exploit LLMs. To +uncover these private jailbreak prompts, extensive analysis of large-scale +conversational datasets is necessary to identify prompts that still manage to +bypass the system's defenses. This task is highly challenging due to the +immense volume of conversation data, diverse characteristics of jailbreak +prompts, and their presence in complex multi-turn conversations. To tackle +these challenges, we introduce JailbreakHunter, a visual analytics approach for +identifying jailbreak prompts in large-scale human-LLM conversational datasets. +We have designed a workflow with three analysis levels: group-level, +conversation-level, and turn-level. Group-level analysis enables users to grasp +the distribution of conversations and identify suspicious conversations using +multiple criteria, such as similarity with reported jailbreak prompts in +previous research and attack success rates. Conversation-level analysis +facilitates the understanding of the progress of conversations and helps +discover jailbreak prompts within their conversation contexts. Turn-level +analysis allows users to explore the semantic similarity and token overlap +between a singleturn prompt and the reported jailbreak prompts, aiding in the +identification of new jailbreak strategies. The effectiveness and usability of +the system were verified through multiple case studies and expert interviews. + +
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ On the Client Preference of LLM Fine-tuning in Federated Learning + + +
+ Reinforcement learning with human feedback (RLHF) fine-tunes a pretrained +large language model (LLM) using preference datasets, enabling the LLM to +generate outputs that align with human preferences. Given the sensitive nature +of these preference datasets held by various clients, there is a need to +implement RLHF within a federated learning (FL) framework, where clients are +reluctant to share their data due to privacy concerns. To address this, we +introduce a feasible framework in which clients collaboratively train a binary +selector with their preference datasets using our proposed FedBis. With a +well-trained selector, we can further enhance the LLM that generates +human-preferred completions. Meanwhile, we propose a novel algorithm, +FedBiscuit, that trains multiple selectors by organizing clients into balanced +and disjoint clusters based on their preferences. Compared to the FedBis, +FedBiscuit demonstrates superior performance in simulating human preferences +for pairwise completions. Our extensive experiments on federated human +preference datasets -- marking the first benchmark to address heterogeneous +data partitioning among clients -- demonstrate that FedBiscuit outperforms +FedBis and even surpasses traditional centralized training. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ LoRA-Guard: Parameter-Efficient Guardrail Adaptation for Content + Moderation of Large Language Models + + +
+ Guardrails have emerged as an alternative to safety alignment for content +moderation of large language models (LLMs). Existing model-based guardrails +have not been designed for resource-constrained computational portable devices, +such as mobile phones, more and more of which are running LLM-based +applications locally. We introduce LoRA-Guard, a parameter-efficient guardrail +adaptation method that relies on knowledge sharing between LLMs and guardrail +models. LoRA-Guard extracts language features from the LLMs and adapts them for +the content moderation task using low-rank adapters, while a dual-path design +prevents any performance degradation on the generative task. We show that +LoRA-Guard outperforms existing approaches with 100-1000x lower parameter +overhead while maintaining accuracy, enabling on-device content moderation. + +
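+ A toy sketch of the low-rank adapter mechanics that this kind of guardrail adaptation builds on; where features are tapped from the chat model, the dual-path design, and the actual moderation head are specified in the paper, so the dimensions, names, and head below are illustrative assumptions only.
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """A frozen linear layer plus a trainable low-rank update (LoRA-style)."""
+    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)               # keep the backbone weights frozen
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+
+hidden = 768                                      # hypothetical feature width
+adapted = LoRALinear(nn.Linear(hidden, hidden), rank=8)
+guard_head = nn.Linear(hidden, 2)                 # harmful vs. not-harmful logits
+features = torch.randn(4, hidden)                 # features taken from the chat LLM
+logits = guard_head(torch.relu(adapted(features)))
+```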
+
+
+
+
+ + ☆ Semantically Rich Local Dataset Generation for Explainable AI in + Genomics + + +
+ Black box deep learning models trained on genomic sequences excel at +predicting the outcomes of different gene regulatory mechanisms. Therefore, +interpreting these models may provide novel insights into the underlying +biology, supporting downstream biomedical applications. Due to their +complexity, interpretable surrogate models can only be built for local +explanations (e.g., a single instance). However, accomplishing this requires +generating a dataset in the neighborhood of the input, which must maintain +syntactic similarity to the original data while introducing semantic +variability in the model's predictions. This task is challenging due to the +complex sequence-to-function relationship of DNA. + We propose using Genetic Programming to generate datasets by evolving +perturbations in sequences that contribute to their semantic diversity. Our +custom, domain-guided individual representation effectively constrains +syntactic similarity, and we provide two alternative fitness functions that +promote diversity with no computational effort. Applied to the RNA splicing +domain, our approach quickly achieves good diversity and significantly +outperforms a random baseline in exploring the search space, as shown by our +proof-of-concept, short RNA sequence. Furthermore, we assess its +generalizability and demonstrate scalability to larger sequences, resulting in +a $\approx$30\% improvement over the baseline. + +
+
+
+
+
+ + ☆ IM-MoCo: Self-supervised MRI Motion Correction using Motion-Guided + Implicit Neural Representations MICCAI 2024 + + +
+ Motion artifacts in Magnetic Resonance Imaging (MRI) arise due to relatively +long acquisition times and can compromise the clinical utility of acquired +images. Traditional motion correction methods often fail to address severe +motion, leading to distorted and unreliable results. Deep Learning (DL) +alleviated such pitfalls through generalization with the cost of vanishing +structures and hallucinations, making it challenging to apply in the medical +field where hallucinated structures can tremendously impact the diagnostic +outcome. In this work, we present an instance-wise motion correction pipeline +that leverages motion-guided Implicit Neural Representations (INRs) to mitigate +the impact of motion artifacts while retaining anatomical structure. Our method +is evaluated using the NYU fastMRI dataset with different degrees of simulated +motion severity. For the correction alone, we can improve over state-of-the-art +image reconstruction methods by $+5\%$ SSIM, $+5\:db$ PSNR, and $+14\%$ +HaarPSI. Clinical relevance is demonstrated by a subsequent experiment, where +our method improves classification outcomes by at least $+1.5$ accuracy +percentage points compared to motion-corrupted images. + +
+
+ comment: Submitted to MICCAI 2024 (Before peer review version) +
+
+
+
+
+ + ☆ Towards a Scalable Reference-Free Evaluation of Generative Models + + +
+ While standard evaluation scores for generative models are mostly +reference-based, a reference-dependent assessment of generative models could be +generally difficult due to the unavailability of applicable reference datasets. +Recently, the reference-free entropy scores, VENDI and RKE, have been proposed +to evaluate the diversity of generated data. However, estimating these scores +from data leads to significant computational costs for large-scale generative +models. In this work, we leverage the random Fourier features framework to +reduce the computational price and propose the Fourier-based Kernel Entropy +Approximation (FKEA) method. We utilize FKEA's approximated eigenspectrum of +the kernel matrix to efficiently estimate the mentioned entropy scores. +Furthermore, we show the application of FKEA's proxy eigenvectors to reveal the +method's identified modes in evaluating the diversity of produced samples. We +provide a stochastic implementation of the FKEA assessment algorithm with a +complexity $O(n)$ linearly growing with sample size $n$. We extensively +evaluate FKEA's numerical performance in application to standard image, text, +and video datasets. Our empirical results indicate the method's scalability and +interpretability applied to large-scale generative models. The codebase is +available at https://github.com/aziksh-ospanov/FKEA. + +
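+ A rough sketch of the random-Fourier-feature idea for a VENDI-style diversity score under a Gaussian kernel, with our own (arbitrary) kernel width and feature count: the eigenvalues of the trace-normalized kernel matrix are estimated from the covariance of the feature map, so the cost grows only linearly with the number of samples. This is not the FKEA implementation.
+```python
+import numpy as np
+
+def rff_diversity_score(X, sigma=1.0, n_features=512, seed=0):
+    """Approximate exp(entropy of kernel-matrix eigenvalues) via random
+    Fourier features for the Gaussian kernel (a simplified sketch)."""
+    rng = np.random.default_rng(seed)
+    n, d = X.shape
+    W = rng.normal(scale=1.0 / sigma, size=(d, n_features))
+    b = rng.uniform(0.0, 2.0 * np.pi, size=n_features)
+    Phi = np.sqrt(2.0 / n_features) * np.cos(X @ W + b)         # (n, D) feature map
+    C = (Phi.T @ Phi) / n                  # shares nonzero eigenvalues with K / n
+    lam = np.clip(np.linalg.eigvalsh(C), 0.0, None)
+    lam = lam / lam.sum()
+    entropy = -(lam[lam > 0] * np.log(lam[lam > 0])).sum()
+    return float(np.exp(entropy))          # effective number of distinct modes
+
+diverse = np.random.default_rng(1).normal(size=(2000, 8))
+collapsed = np.repeat(np.random.default_rng(2).normal(size=(5, 8)), 400, axis=0)
+print(rff_diversity_score(diverse), rff_diversity_score(collapsed))   # diverse >> collapsed
+```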
+
+
+
+
+ + ☆ ObfuscaTune: Obfuscated Offsite Fine-tuning and Inference of Proprietary + LLMs on Private Datasets + + +
+ This work addresses the timely yet underexplored problem of performing +inference and finetuning of a proprietary LLM owned by a model provider entity +on the confidential/private data of another data owner entity, in a way that +ensures the confidentiality of both the model and the data. Hereby, the +finetuning is conducted offsite, i.e., on the computation infrastructure of a +third-party cloud provider. We tackle this problem by proposing ObfuscaTune, a +novel, efficient and fully utility-preserving approach that combines a simple +yet effective obfuscation technique with an efficient usage of confidential +computing (only 5% of the model parameters are placed on TEE). We empirically +demonstrate the effectiveness of ObfuscaTune by validating it on GPT-2 models +with different sizes on four NLP benchmark datasets. Finally, we compare to a +na\"ive version of our approach to highlight the necessity of using random +matrices with low condition numbers in our approach to reduce errors induced by +the obfuscation. + +
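+ The role of the condition number can be illustrated with a small numerical round-trip experiment (our own toy sketch, not the paper's protocol; the obfuscation here is reduced to a single random linear map applied to one weight block):
+
+import numpy as np
+
+def random_matrix_with_condition(n, cond, seed=0):
+    # Orthogonal bases with prescribed singular values give an exact condition number.
+    rng = np.random.default_rng(seed)
+    q, _ = np.linalg.qr(rng.normal(size=(n, n)))
+    u, _ = np.linalg.qr(rng.normal(size=(n, n)))
+    s = np.logspace(0.0, np.log10(cond), n)
+    return q @ np.diag(s) @ u.T
+
+w = np.random.randn(256, 256)                        # stand-in for a protected weight block
+for cond in (2.0, 1e6):
+    r = random_matrix_with_condition(256, cond)
+    obfuscated = (r @ w).astype(np.float32)          # obfuscated copy leaves the TEE
+    recovered = np.linalg.solve(r, obfuscated.astype(np.float64))
+    print(f"cond={cond:g}  max round-trip error={np.abs(recovered - w).max():.2e}")
+# Larger condition numbers amplify the numerical error introduced by obfuscation,
+# which is why well-conditioned random matrices matter.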
+
+ comment: Preprint +
+
+
+
+
+ + ☆ IncogniText: Privacy-enhancing Conditional Text Anonymization via + LLM-based Private Attribute Randomization + + +
+ In this work, we address the problem of text anonymization where the goal is +to prevent adversaries from correctly inferring private attributes of the +author, while keeping the text utility, i.e., meaning and semantics. We propose +IncogniText, a technique that anonymizes the text to mislead a potential +adversary into predicting a wrong private attribute value. Our empirical +evaluation shows a reduction of private attribute leakage by more than 90%. +Finally, we demonstrate the maturity of IncogniText for real-world applications +by distilling its anonymization capability into a set of LoRA parameters +associated with an on-device model. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ PII-Compass: Guiding LLM training data extraction prompts towards the + target PII via grounding ACL 2024 + + +
+ The latest and most impactful advances in large models stem from their +increased size. Unfortunately, this translates into an improved memorization +capacity, raising data privacy concerns. Specifically, it has been shown that +models can output personally identifiable information (PII) contained in their +training data. However, reported PII extraction performance varies widely, and +there is no consensus on the optimal methodology to evaluate this risk, +resulting in underestimating realistic adversaries. In this work, we +empirically demonstrate that it is possible to improve the extractability of +PII by over ten-fold by grounding the prefix of the manually constructed +extraction prompt with in-domain data. Our approach, PII-Compass, achieves +phone number extraction rates of 0.92%, 3.9%, and 6.86% with 1, 128, and 2308 +queries, respectively, i.e., the phone number of 1 person in 15 is extractable. + +
+
+ comment: Accepted at ACL 2024 +
+
+
+
+
+ + ☆ The More the Merrier? Navigating Accuracy vs. Energy Efficiency Design + Trade-Offs in Ensemble Learning Systems + + +
+ Background: Machine learning (ML) model composition is a popular technique to +mitigate shortcomings of a single ML model and to design more effective +ML-enabled systems. While ensemble learning, i.e., forwarding the same request +to several models and fusing their predictions, has been studied extensively +for accuracy, we have insufficient knowledge about how to design +energy-efficient ensembles. Objective: We therefore analyzed three types of +design decisions for ensemble learning regarding a potential trade-off between +accuracy and energy consumption: a) ensemble size, i.e., the number of models +in the ensemble, b) fusion methods (majority voting vs. a meta-model), and c) +partitioning methods (whole-dataset vs. subset-based training). Methods: By +combining four popular ML algorithms for classification in different ensembles, +we conducted a full factorial experiment with 11 ensembles x 4 datasets x 2 +fusion methods x 2 partitioning methods (176 combinations). For each +combination, we measured accuracy (F1-score) and energy consumption in J (for +both training and inference). Results: While a larger ensemble size +significantly increased energy consumption (size 2 ensembles consumed 37.49% +less energy than size 3 ensembles, which in turn consumed 26.96% less energy +than the size 4 ensembles), it did not significantly increase accuracy. +Furthermore, majority voting outperformed meta-model fusion both in terms of +accuracy (Cohen's d of 0.38) and energy consumption (Cohen's d of 0.92). +Lastly, subset-based training led to significantly lower energy consumption +(Cohen's d of 0.91), while training on the whole dataset did not increase +accuracy significantly. Conclusions: From a Green AI perspective, we recommend +designing ensembles of small size (2 or maximum 3 models), using subset-based +training, majority voting, and energy-efficient ML algorithms like decision +trees, Naive Bayes, or KNN. + +
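+ For readers who want to try the two fusion methods being compared, a minimal scikit-learn sketch (the dataset and member models are illustrative, not the study's full factorial setup) looks like this:
+
+from sklearn.datasets import load_breast_cancer
+from sklearn.ensemble import StackingClassifier, VotingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+
+X, y = load_breast_cancer(return_X_y=True)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+members = [("dt", DecisionTreeClassifier()), ("nb", GaussianNB()),
+           ("knn", KNeighborsClassifier())]
+
+# a) majority voting: fuse predictions by counting votes
+voting = VotingClassifier(members, voting="hard").fit(X_tr, y_tr)
+
+# b) meta-model fusion (stacking): a learner is trained on the members' outputs
+stacking = StackingClassifier(members, final_estimator=LogisticRegression(),
+                              cv=3).fit(X_tr, y_tr)
+
+print("voting  :", voting.score(X_te, y_te))
+print("stacking:", stacking.score(X_te, y_te))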
+
+ comment: Currently under review at a journal +
+
+
+
+
+ + ☆ SFC: Achieve Accurate Fast Convolution under Low-precision Arithmetic ICML 2024 + + +
+ Fast convolution algorithms, including Winograd and FFT, can efficiently +accelerate convolution operations in deep models. However, these algorithms +depend on high-precision arithmetic to maintain inference accuracy, which +conflicts with model quantization. To resolve this conflict and further +improve the efficiency of quantized convolution, we propose SFC, a new algebraic +transform for fast convolution that extends the Discrete Fourier Transform +(DFT) with symbolic computing, in which only additions are required to perform +the transformation at specific transform points, avoiding the calculation of +irrational numbers and reducing the precision requirement. Additionally, we +enhance convolution efficiency by introducing correction terms to convert +invalid circular convolution outputs of the Fourier method into effective ones. +The numerical error analysis is presented for the first time in this type of +work and proves that our algorithms can provide a 3.68x multiplication +reduction for 3x3 convolution, while the Winograd algorithm only achieves a +2.25x reduction with similarly low numerical errors. Experiments carried out on +benchmarks and FPGA show that our new algorithms can further improve the +computation efficiency of quantized models while maintaining accuracy, +surpassing both the quantization-alone method and existing works on fast +convolution quantization. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ☆ The Shortcomings of Force-from-Motion in Robot Learning + + +
+ Robotic manipulation requires accurate motion and physical interaction +control. However, current robot learning approaches focus on motion-centric +action spaces that do not explicitly give the policy control over the +interaction. In this paper, we discuss the repercussions of this choice and +argue for more interaction-explicit action spaces in robot learning. + +
+
+
+
+
+ + ☆ Self-supervised Vision Transformer are Scalable Generative Models for + Domain Generalization MICCAI 2024 + + +
+ Despite notable advancements, the integration of deep learning (DL) +techniques into impactful clinical applications, particularly in the realm of +digital histopathology, has been hindered by challenges associated with +achieving robust generalization across diverse imaging domains and +characteristics. Traditional mitigation strategies in this field such as data +augmentation and stain color normalization have proven insufficient in +addressing this limitation, necessitating the exploration of alternative +methodologies. To this end, we propose a novel generative method for domain +generalization in histopathology images. Our method employs a generative, +self-supervised Vision Transformer to dynamically extract characteristics of +image patches and seamlessly infuse them into the original images, thereby +creating novel, synthetic images with diverse attributes. By enriching the +dataset with such synthesized images, we aim to enhance its holistic nature, +facilitating improved generalization of DL models to unseen domains. Extensive +experiments conducted on two distinct histopathology datasets demonstrate the +effectiveness of our proposed approach, outperforming the state of the art +substantially, on the Camelyon17-wilds challenge dataset (+2%) and on a second +epithelium-stroma dataset (+26%). Furthermore, we emphasize our method's +ability to readily scale with increasingly available unlabeled data samples and +more complex, higher parametric architectures. Source code is available at +https://github.com/sdoerrich97/vits-are-generative-models . + +
+
+ comment: Accepted at MICCAI 2024. This is the submitted manuscript with added + link to github repo and funding acknowledgements. No further post submission + improvements or corrections were integrated. Final version not published yet +
+
+
+
+
+ + ☆ GPTQT: Quantize Large Language Models Twice to Push the Efficiency + + +
+ Due to their large size, generative Large Language Models (LLMs) require +significant computing and storage resources. This paper introduces a new +post-training quantization method, GPTQT, to reduce memory usage and enhance +processing speed by expressing the weights of LLMs in 3-bit/2-bit. Practice has +shown that minimizing the quantization error of weights is ineffective, leading +to overfitting. Therefore, GPTQT employs a progressive two-step approach: +initially quantizing weights using linear quantization to a relatively high +bit width, followed by converting the obtained integer weights to lower-bit binary coding. A +re-exploration strategy is proposed to optimize the initial scaling factor. During +inference, these steps are merged into pure binary coding, enabling efficient +computation. Testing across various models and datasets confirms GPTQT's +effectiveness. Compared to the strong 3-bit quantization baseline, GPTQT +further reduces perplexity by 4.01 on opt-66B and increases speed by 1.24 times +on opt-30b. The results on Llama2 show that GPTQT is currently the best binary +coding quantization method for such LLMs. + +
+
+ comment: Accepted by 11th IEEE International Conference on Cybernetics and + Intelligent Systems +
+
+
+
+
+ + ☆ Joint Optimization of Resource Allocation and Data Selection for Fast + and Cost-Efficient Federated Edge Learning + + +
+ Deploying federated learning at the wireless edge introduces federated edge +learning (FEEL). Given FEEL's limited communication resources and potential +mislabeled data on devices, improper resource allocation or data selection can +hurt convergence speed and increase training costs. Thus, to realize an +efficient FEEL system, this paper emphasizes jointly optimizing resource +allocation and data selection. Specifically, in this work, through rigorously +modeling the training process and deriving an upper bound on FEEL's one-round +convergence rate, we establish a problem of joint resource allocation and data +selection, which, unfortunately, cannot be solved directly. Toward this end, we +equivalently transform the original problem into a solvable form via a variable +substitution and then break it into two subproblems, that is, the resource +allocation problem and the data selection problem. The two subproblems are +mixed-integer non-convex and integer non-convex problems, respectively, and +achieving their optimal solutions is a challenging task. Based on the matching +theory and applying the convex-concave procedure and gradient projection +methods, we devise a low-complexity suboptimal algorithm for the two +subproblems, respectively. Finally, the superiority of our proposed scheme of +joint resource allocation and data selection is validated by numerical results. + +
+
+
+
+
+ + ☆ ShiftAddAug: Augment Multiplication-Free Tiny Neural Network with Hybrid + Computation CVPR + + +
+ Operators devoid of multiplication, such as Shift and Add, have gained +prominence for their compatibility with hardware. However, neural networks +(NNs) employing these operators typically exhibit lower accuracy compared to +conventional NNs with identical structures. ShiftAddAug uses costly +multiplication to augment efficient but less powerful multiplication-free +operators, improving performance without any inference overhead. It puts a +ShiftAdd tiny NN into a large multiplicative model and encourages it to be +trained as a sub-model to obtain additional supervision. In order to solve the +weight discrepancy problem between hybrid operators, a new weight sharing +method is proposed. Additionally, a novel two-stage neural architecture search +is used to obtain better augmentation effects for smaller but stronger +multiplication-free tiny neural networks. The superiority of ShiftAddAug is +validated through experiments in image classification and semantic +segmentation, consistently delivering noteworthy enhancements. Remarkably, it +secures up to a 4.95% increase in accuracy on CIFAR-100 compared to its +directly trained counterparts, even surpassing the performance of +multiplicative NNs. + +
+
+ comment: Accepted by 2024 CVPR Workshop : Efficient Deep Learning for Computer + Vision +
+
+
+
+
+ + ☆ Knowledge Composition using Task Vectors with Learned Anisotropic + Scaling + + +
+ Pre-trained models produce strong generic representations that can be adapted +via fine-tuning. The learned weight difference relative to the pre-trained +model, known as a task vector, characterises the direction and stride of +fine-tuning. The significance of task vectors is such that simple arithmetic +operations on them can be used to combine diverse representations from +different domains. This paper builds on these properties of task vectors and +aims to answer (1) whether components of task vectors, particularly parameter +blocks, exhibit similar characteristics, and (2) how such blocks can be used to +enhance knowledge composition and transfer. To this end, we introduce aTLAS, an +algorithm that linearly combines parameter blocks with different learned +coefficients, resulting in anisotropic scaling at the task vector level. We +show that such linear combinations explicitly exploit the low intrinsic +dimensionality of pre-trained models, with only a few coefficients being the +learnable parameters. Furthermore, composition of parameter blocks leverages +the already learned representations, thereby reducing the dependency on large +amounts of data. We demonstrate the effectiveness of our method in task +arithmetic, few-shot recognition and test-time adaptation, with supervised or +unsupervised objectives. In particular, we show that (1) learned anisotropic +scaling allows task vectors to be more disentangled, causing less interference +in composition; (2) task vector composition excels with scarce or no labeled +data and is less prone to domain shift, thus leading to better +generalisability; (3) mixing the most informative parameter blocks across +different task vectors prior to training can reduce the memory footprint and +improve the flexibility of knowledge transfer. Moreover, we show the potential +of aTLAS as a PEFT method, particularly with less data, and demonstrate +its scalability. + +
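+ A minimal PyTorch sketch of the core composition step, with one learnable coefficient per parameter block of each task vector (variable names and the toy objective are ours, not the authors' code):
+
+import torch
+
+def compose(pretrained, task_vectors, coeffs):
+    # pretrained / task_vectors: dicts mapping block name -> tensor.
+    # coeffs[t][name]: learnable scalar for task vector t and parameter block name.
+    merged = {}
+    for name, w0 in pretrained.items():
+        merged[name] = w0 + sum(c[name] * tv[name]
+                                for c, tv in zip(coeffs, task_vectors))
+    return merged
+
+# Toy example with two "blocks" and two task vectors.
+pretrained = {"layer1": torch.randn(4, 4), "layer2": torch.randn(4)}
+task_vectors = [{k: torch.randn_like(v) for k, v in pretrained.items()}
+                for _ in range(2)]
+coeffs = [{k: torch.zeros((), requires_grad=True) for k in pretrained}
+          for _ in task_vectors]
+
+# Only the per-block coefficients are optimized; the backbone stays frozen.
+params = [c for per_tv in coeffs for c in per_tv.values()]
+opt = torch.optim.Adam(params, lr=1e-2)
+for _ in range(10):
+    merged = compose(pretrained, task_vectors, coeffs)
+    loss = sum(m.pow(2).mean() for m in merged.values())  # placeholder objective
+    opt.zero_grad(); loss.backward(); opt.step()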
+
+
+
+
+ + ☆ Membership Inference Attacks Against Time-Series Models + + +
+ Analyzing time-series data that may contain personal information, +particularly in the medical field, presents serious privacy concerns. Sensitive +health data from patients is often used to train machine-learning models for +diagnostics and ongoing care. Assessing the privacy risk of such models is +crucial to making knowledgeable decisions on whether to use a model in +production, share it with third parties, or deploy it in patients' homes. +Membership Inference Attacks (MIA) are a key method for this kind of +evaluation; however, time-series prediction models have not been thoroughly +studied in this context. We explore existing MIA techniques on time-series +models, and introduce new features, focusing on the seasonality and trend +components of the data. Seasonality is estimated using a multivariate Fourier +transform, and a low-degree polynomial is used to approximate trends. We +applied these techniques to various types of time-series models, using datasets +from the health domain. Our results demonstrate that these new features enhance +the effectiveness of MIAs in identifying membership, improving the +understanding of privacy risks in medical data applications. + +
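+ A rough sketch of the two feature extractors described above, trend via a low-degree polynomial fit and seasonality via the Fourier transform (the function names and feature layout are ours, not the paper's exact attack pipeline):
+
+import numpy as np
+
+def trend_coefficients(series, degree=2):
+    # Low-degree polynomial approximation of the trend component.
+    t = np.arange(len(series))
+    return np.polyfit(t, series, degree)
+
+def seasonal_profile(series, top_k=3):
+    # Dominant frequencies and amplitudes from the (real) Fourier transform.
+    spectrum = np.fft.rfft(series - series.mean())
+    amplitude = np.abs(spectrum)
+    top = np.argsort(amplitude)[-top_k:]
+    return np.concatenate([top, amplitude[top]])
+
+def mia_features(multivariate_series):
+    # Concatenate per-channel trend and seasonality features for the attack model.
+    feats = []
+    for channel in np.atleast_2d(multivariate_series):
+        feats.append(trend_coefficients(channel))
+        feats.append(seasonal_profile(channel))
+    return np.concatenate(feats)
+
+x = np.sin(np.linspace(0, 20 * np.pi, 500)) + 0.01 * np.arange(500)
+print(mia_features(x).shape)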
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ A Self-Supervised Task for Fault Detection in Satellite Multivariate + Time Series SP + + +
+ In the space sector, due to environmental conditions and restricted +accessibility, robust fault detection methods are imperative for ensuring +mission success and safeguarding valuable assets. This work proposes a novel +approach leveraging Physics-Informed Real NVP neural networks, renowned for +their ability to model complex and high-dimensional distributions, augmented +with a self-supervised task based on sensors' data permutation. It focuses on +enhancing fault detection within the satellite multivariate time series. The +experiments involve various configurations, including pre-training with +self-supervision, multi-task learning, and standalone self-supervised training. +Results indicate significant performance improvements across all settings. In +particular, employing only the self-supervised loss yields the best overall +results, suggesting its efficacy in guiding the network to extract relevant +features for fault detection. This study presents a promising direction for +improving fault detection in space systems and warrants further exploration in +other datasets and applications. + +
+
+ comment: SPAICE: AI in and for Space, 2024 +
+
+
+
+
+ + ☆ Early-Stage Anomaly Detection: A Study of Model Performance on Complete + vs. Partial Flows + + +
+ This study investigates the efficacy of machine learning models, specifically +Random Forest, in anomaly detection systems when trained on complete flow +records and tested on partial flow data. We explore the performance disparity +that arises when models are applied to incomplete data typical in real-world, +real-time network environments. Our findings demonstrate a significant decline +in model performance, with precision and recall dropping by up to 30\% under +certain conditions when models trained on complete flows are tested against +partial flows. Conversely, models trained and tested on consistently complete +or partial datasets maintain robustness, highlighting the importance of dataset +consistency in training. The study reveals that a minimum of 7 packets in the +test set is required for maintaining reliable detection rates. These results +underscore the need for tailored training strategies that can effectively adapt +to the dynamics of partial data, enhancing the practical applicability of +anomaly detection systems in operational settings. + +
+
+ comment: 9 pages, 5 tables, 2 figures +
+
+
+
+
+ + ☆ Safe Unlearning: A Surprisingly Effective and Generalizable Solution to + Defend Against Jailbreak Attacks + + +
+ LLMs are known to be vulnerable to jailbreak attacks, even after safety +alignment. An important observation is that, while different types of jailbreak +attacks can generate significantly different queries, they mostly result in +similar responses that are rooted in the same harmful knowledge (e.g., detailed +steps to make a bomb). Therefore, we conjecture that directly unlearning the +harmful knowledge in the LLM can be a more effective way to defend against +jailbreak attacks than the mainstream supervised fine-tuning (SFT) based +approaches. Our extensive experiments confirmed our insight and suggested +surprising generalizability of our unlearning-based approach: using only 20 raw +harmful questions \emph{without} any jailbreak prompt during training, our +solution reduced the Attack Success Rate (ASR) in Vicuna-7B on +\emph{out-of-distribution} (OOD) harmful questions wrapped with various complex +jailbreak prompts from 82.6\% to 7.7\%. This significantly outperforms +Llama2-7B-Chat, which is fine-tuned on about 0.1M safety alignment samples but +still has an ASR of 21.9\% even with the help of an additional safety system +prompt. Further analysis reveals that the generalization ability of our +solution stems from the intrinsic relatedness among harmful responses across +harmful questions (e.g., response patterns, shared steps and actions, and +similarity among their learned representations in the LLM). Our code is +available at \url{https://github.com/thu-coai/SafeUnlearning}. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Multi-Attention Integrated Deep Learning Frameworks for Enhanced Breast + Cancer Segmentation and Identification + + +
+ Breast cancer poses a profound threat to lives globally, claiming numerous +lives each year. Therefore, timely detection is crucial for early intervention +and improved chances of survival. Accurately diagnosing and classifying breast +tumors using ultrasound images is a persistent challenge in medicine, demanding +cutting-edge solutions for improved treatment strategies. This research +introduces multiattention-enhanced deep learning (DL) frameworks designed for +the classification and segmentation of breast cancer tumors from ultrasound +images. A spatial channel attention mechanism is proposed for segmenting tumors +from ultrasound images, utilizing a novel LinkNet DL framework with an +InceptionResNet backbone. Following this, the paper proposes a deep +convolutional neural network with an integrated multi-attention framework +(DCNNIMAF) to classify the segmented tumor as benign, malignant, or normal. +From experimental results, it is observed that the segmentation model has +recorded an accuracy of 98.1%, with a minimal loss of 0.6%. It has also +achieved high Intersection over Union (IoU) and Dice Coefficient scores of +96.9% and 97.2%, respectively. Similarly, the classification model has attained +an accuracy of 99.2%, with a low loss of 0.31%. Furthermore, the classification +framework has achieved outstanding F1-Score, precision, and recall values of +99.1%, 99.3%, and 99.1%, respectively. By offering a robust framework for early +detection and accurate classification of breast cancer, this proposed work +significantly advances the field of medical image analysis, potentially +improving diagnostic precision and patient outcomes. + +
+
+ comment: 32 pages, 18 figures, 6 tables +
+
+
+
+
+ + ☆ LANE: Logic Alignment of Non-tuning Large Language Models and Online + Recommendation Systems for Explainable Reason Generation + + +
+ The explainability of recommendation systems is crucial for enhancing user +trust and satisfaction. Leveraging large language models (LLMs) offers new +opportunities for comprehensive recommendation logic generation. However, in +existing related studies, fine-tuning LLM models for recommendation tasks +incurs high computational costs and alignment issues with existing systems, +limiting the application potential of proven proprietary/closed-source LLM +models, such as GPT-4. In this work, our proposed effective strategy LANE +aligns LLMs with online recommendation systems without additional LLMs tuning, +reducing costs and improving explainability. This innovative approach addresses +key challenges in integrating language models with recommendation systems while +fully utilizing the capabilities of powerful proprietary models. Specifically, +our strategy operates through several key components: semantic embedding, user +multi-preference extraction using zero-shot prompting, semantic alignment, and +explainable recommendation generation using Chain of Thought (CoT) prompting. +By embedding item titles instead of IDs and utilizing multi-head attention +mechanisms, our approach aligns the semantic features of user preferences with +those of candidate items, ensuring coherent and user-aligned recommendations. +Sufficient experimental results including performance comparison, questionnaire +voting, and visualization cases prove that our method can not only ensure +recommendation performance, but also provide easy-to-understand and reasonable +recommendation logic. + +
+
+
+
+
+ + ☆ Convergence of Implicit Gradient Descent for Training Two-Layer + Physics-Informed Neural Networks + + +
+ The choice of optimization algorithm is crucial when training physics-informed neural +networks (PINNs); unsuitable methods may lead to poor solutions. Compared with +the common gradient descent algorithm, implicit gradient descent (IGD) +handles some multi-scale problems better. In this paper, we provide +a convergence analysis of implicit gradient descent for training +over-parametrized two-layer PINNs. We first demonstrate the positive +definiteness of Gram matrices for general smooth activation functions, such as the +sigmoid, softplus, and tanh functions. Then the +over-parameterization allows us to show that the randomly initialized IGD +converges to a globally optimal solution at a linear convergence rate. Moreover, +due to the different training dynamics, the learning rate of IGD can be chosen +independently of the sample size and the least eigenvalue of the Gram matrix. + +
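+ The implicit update analyzed here, $\theta_{k+1} = \theta_k - \eta \nabla L(\theta_{k+1})$, can be approximated with a simple fixed-point iteration; the sketch below is our own toy illustration on a quadratic loss, not the paper's PINN setup:
+
+import torch
+
+def igd_step(theta, loss_fn, lr=0.5, inner_iters=20):
+    # Implicit gradient descent: solve theta_next = theta - lr * grad(theta_next)
+    # by fixed-point iteration, starting from the current iterate.
+    theta_next = theta.detach().clone()
+    for _ in range(inner_iters):
+        theta_next.requires_grad_(True)
+        grad, = torch.autograd.grad(loss_fn(theta_next), theta_next)
+        theta_next = (theta - lr * grad).detach()
+    return theta_next
+
+loss = lambda th: 0.5 * (th ** 2).sum()      # toy quadratic loss
+theta = torch.tensor([5.0])
+for _ in range(10):
+    theta = igd_step(theta, loss)
+print(theta)   # approaches 0 even with a comparatively large step size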
+
+
+
+
+ + ☆ Representation learning with CGAN for casual inference + + +
+ Conditional Generative Adversarial Nets (CGANs) are often used to improve +conditional image generation performance. However, there is little research on +representation learning with CGANs for causal inference. This paper proposes a +new method for finding representation learning functions by adopting the +adversarial idea. We apply the CGAN framework and theoretically demonstrate the +feasibility of finding a suitable representation function in the context of two +distributions being balanced. The theoretical result shows that when two +distributions are balanced, the ideal representation function can be found and +thus used in further research. + +
+
+ comment: Proceedings of the 3rd International Conference on Signal Processing + and Machine Learning +
+
+
+
+
+ + ☆ Effect of a Process Mining based Pre-processing Step in Prediction of + the Critical Health Outcomes + + +
+ Predicting critical health outcomes such as patient mortality and hospital +readmission is essential for improving survivability. However, healthcare +datasets have many concurrences that create complexities, leading to poor +predictions. Consequently, pre-processing the data is crucial to improve its +quality. In this study, we use an existing pre-processing algorithm, +concatenation, to improve data quality by decreasing the complexity of +datasets. Sixteen healthcare datasets were extracted from two databases - MIMIC +III and University of Illinois Hospital - converted to the event logs, they +were then fed into the concatenation algorithm. The pre-processed event logs +were then fed to the Split Miner (SM) algorithm to produce a process model. +Process model quality was evaluated before and after concatenation using the +following metrics: fitness, precision, F-Measure, and complexity. The +pre-processed event logs were also used as inputs to the Decay Replay Mining +(DREAM) algorithm to predict critical outcomes. We compared predicted results +before and after applying the concatenation algorithm using Area Under the +Curve (AUC) and Confidence Intervals (CI). Results indicated that the +concatenation algorithm improved the quality of the process models and +predictions of the critical health outcomes. + +
+
+
+
+
+ + ☆ Efficient Training of Language Models with Compact and Consistent Next + Token Distributions ACL 2024 + + +
+ Maximizing the likelihood of the next token is an established, statistically +sound objective for pre-training language models. In this paper we show that we +can train better models faster by pre-aggregating the corpus with a collapsed +$n$-gram distribution. Previous studies have proposed corpus-level $n$-gram +statistics as a regularizer; however, the construction and querying of such +$n$-grams, if done naively, prove to be costly and significantly impede +training speed, thereby limiting their application in modern large language +model pre-training. + We introduce an alternative compact representation of the next token +distribution that, in expectation, aligns with the complete $n$-gram +distribution while markedly reducing variance across mini-batches compared to +the standard next-token loss. Empirically, we demonstrate that both the +$n$-gram regularized model and our approximation yield substantial improvements +in model quality and convergence rate compared to existing methods. +Furthermore, our approximation facilitates scalability of gains to larger +datasets and models compared to the straightforward $n$-gram regularization +method. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ☆ Data Overfitting for On-Device Super-Resolution with Dynamic Algorithm + and Compiler Co-Design ECCV2024 + + +
+ Deep neural networks (DNNs) are frequently employed in a variety of computer +vision applications. Nowadays, an emerging trend in the current video +distribution system is to take advantage of DNN's overfitting properties to +perform video resolution upscaling. By splitting videos into chunks and +applying a super-resolution (SR) model to overfit each chunk, this scheme of SR +models plus video chunks is able to replace traditional video transmission to +enhance video quality and transmission efficiency. However, many models and +chunks are needed to guarantee high performance, which leads to tremendous +overhead on model switching and memory footprints at the user end. To resolve +such problems, we propose a Dynamic Deep neural network assisted by a +Content-Aware data processing pipeline to reduce the model number down to one +(Dy-DCA), which helps promote performance while conserving computational +resources. Additionally, to achieve real acceleration on the user end, we +designed a framework that optimizes dynamic features (e.g., dynamic shapes, +sizes, and control flow) in Dy-DCA to enable a series of compilation +optimizations, including fused code generation, static execution planning, etc. +By employing such techniques, our method achieves better PSNR and real-time +performance (33 FPS) on an off-the-shelf mobile phone. Meanwhile, assisted by +our compilation optimization, we achieve a 1.7$\times$ speedup while saving up +to 1.61$\times$ memory consumption. Code available in +https://github.com/coulsonlee/Dy-DCA-ECCV2024. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ SPLITZ: Certifiable Robustness via Split Lipschitz Randomized Smoothing + + +
+ Certifiable robustness gives the guarantee that small perturbations around an +input to a classifier will not change the prediction. There are two approaches +to provide certifiable robustness to adversarial examples: a) explicitly +training classifiers with small Lipschitz constants, and b) randomized +smoothing, which adds random noise to the input to create a smooth classifier. +We propose \textit{SPLITZ}, a practical and novel approach which leverages the +synergistic benefits of both the above ideas into a single framework. Our main +idea is to \textit{split} a classifier into two halves, constrain the Lipschitz +constant of the first half, and smooth the second half via randomization. +Motivation for \textit{SPLITZ} comes from the observation that many standard +deep networks exhibit heterogeneity in Lipschitz constants across layers. +\textit{SPLITZ} can exploit this heterogeneity while inheriting the scalability +of randomized smoothing. We present a principled approach to train +\textit{SPLITZ} and provide theoretical analysis to derive certified robustness +guarantees during inference. We present a comprehensive comparison of +robustness-accuracy tradeoffs and show that \textit{SPLITZ} consistently +improves upon existing state-of-the-art approaches on the MNIST and CIFAR-10 +datasets. For instance, with an $\ell_2$ norm perturbation budget of +$\epsilon=1$, \textit{SPLITZ} achieves $\textbf{43.2\%}$ top-1 test +accuracy on the CIFAR-10 dataset, compared to the state-of-the-art top-1 test accuracy of +$\textbf{39.8\%}$. + +
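+ A schematic of the split, with a spectrally normalized (Lipschitz-constrained) first half and randomized smoothing applied to its output (a sketch under our own architectural assumptions, not the released code or the certification procedure):
+
+import torch
+import torch.nn as nn
+from torch.nn.utils import spectral_norm
+
+# First half: spectral normalization bounds each layer's Lipschitz constant by ~1.
+first_half = nn.Sequential(
+    spectral_norm(nn.Linear(784, 256)), nn.ReLU(),
+    spectral_norm(nn.Linear(256, 128)), nn.ReLU(),
+)
+# Second half: an ordinary classifier head, smoothed at inference time.
+second_half = nn.Sequential(nn.Linear(128, 10))
+
+def smoothed_predict(x, sigma=0.5, num_samples=1000):
+    # Randomized smoothing applied to the intermediate representation.
+    with torch.no_grad():
+        z = first_half(x)                                  # (B, 128)
+        noise = sigma * torch.randn(num_samples, *z.shape) # (N, B, 128)
+        logits = second_half(z.unsqueeze(0) + noise)       # (N, B, 10)
+        votes = logits.argmax(dim=-1)                      # (N, B)
+        return votes.mode(dim=0).values                    # majority class per input
+
+x = torch.randn(4, 784)
+print(smoothed_predict(x))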
+
+
+
+
+ + ☆ Croppable Knowledge Graph Embedding + + +
+ Knowledge Graph Embedding (KGE) is a common method for Knowledge Graphs (KGs) +to serve various artificial intelligence tasks. The suitable dimensions of the +embeddings depend on the storage and computing conditions of the specific +application scenarios. Once a new dimension is required, a new KGE model needs +to be trained from scratch, which greatly increases the training cost and +limits the efficiency and flexibility of KGE in serving various scenarios. In +this work, we propose a novel KGE training framework MED, through which we +can train once to get a croppable KGE model applicable to multiple scenarios +with different dimensional requirements; sub-models of the required dimensions +can be cropped out of it and used directly without any additional training. In +MED, we propose a mutual learning mechanism to improve the low-dimensional +sub-models' performance and make the high-dimensional sub-models retain the +capacity that low-dimensional sub-models have, an evolutionary improvement +mechanism to promote the high-dimensional sub-models to master the knowledge +that the low-dimensional sub-models cannot learn, and a dynamic loss weight to +balance the multiple losses adaptively. Experiments on 3 KGE models over 4 +standard KG completion datasets, 3 real application scenarios over a real-world +large-scale KG, and experiments extending MED to the language model BERT +show the effectiveness, high efficiency, and flexible extensibility of MED. + +
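+ If, as we assume here for illustration, a sub-model corresponds to the leading dimensions of the full embedding table, cropping after a single MED-style training run reduces to slicing (a toy sketch, not the training framework itself):
+
+import torch
+
+full = torch.nn.Embedding(10_000, 512)       # entity table trained once at full dimension
+
+def crop(embedding, dim):
+    # Sub-model of the requested dimension: take the leading dims, no retraining.
+    sub = torch.nn.Embedding(embedding.num_embeddings, dim)
+    sub.weight.data.copy_(embedding.weight.data[:, :dim])
+    return sub
+
+for d in (32, 128, 512):
+    print(d, crop(full, d).weight.shape)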
+
+
+
+
+ + ☆ Foster Adaptivity and Balance in Learning with Noisy Labels ECCV + + +
+ Label noise is ubiquitous in real-world scenarios, posing a practical +challenge to supervised models due to its effect in hurting the generalization +performance of deep neural networks. Existing methods primarily employ the +sample selection paradigm and usually rely on dataset-dependent prior knowledge +(\eg, a pre-defined threshold) to cope with label noise, inevitably degrading +the adaptivity. Moreover, existing methods tend to neglect the class balance in +selecting samples, leading to biased model performance. To this end, we propose +a simple yet effective approach named \textbf{SED} to deal with label noise in +a \textbf{S}elf-adaptiv\textbf{E} and class-balance\textbf{D} manner. +Specifically, we first design a novel sample selection strategy to empower +self-adaptivity and class balance when identifying clean and noisy data. A +mean-teacher model is then employed to correct labels of noisy samples. +Subsequently, we propose a self-adaptive and class-balanced sample re-weighting +mechanism to assign different weights to detected noisy samples. Finally, we +additionally employ consistency regularization on selected clean samples to +improve model generalization performance. Extensive experimental results on +synthetic and real-world datasets demonstrate the effectiveness and superiority +of our proposed method. The source code has been made available at +https://github.com/NUST-Machine-Intelligence-Laboratory/SED. + +
+
+ comment: accepted by the European Conference on Computer Vision (ECCV), 2024 +
+
+
+
+
+ + ☆ MLKD-BERT: Multi-level Knowledge Distillation for Pre-trained Language + Models + + +
+ Knowledge distillation is an effective technique for pre-trained language +model compression. Although existing knowledge distillation methods perform +well for the most typical model BERT, they could be further improved in two +aspects: the relation-level knowledge could be further explored to improve +model performance; and the setting of student attention head number could be +more flexible to decrease inference time. Therefore, we are motivated to +propose a novel knowledge distillation method MLKD-BERT to distill multi-level +knowledge in teacher-student framework. Extensive experiments on GLUE benchmark +and extractive question answering tasks demonstrate that our method outperforms +state-of-the-art knowledge distillation methods on BERT. In addition, MLKD-BERT +can flexibly set student attention head number, allowing for substantial +inference time decrease with little performance drop. + +
+
+
+
+
+ + ☆ Automatic gradient descent with generalized Newton's method + + +
+ We propose the generalized Newton's method (GeN) -- a Hessian-informed +approach that applies to any optimizer such as SGD and Adam, and covers the +Newton-Raphson method as a sub-case. Our method automatically and dynamically +selects the learning rate that accelerates the convergence, without the +intensive tuning of the learning rate scheduler. In practice, our method is +easily implementable, since it only requires additional forward passes with +almost zero computational overhead (in terms of training time and memory cost) +when the overhead is amortized over many iterations. We present extensive +experiments on language and vision tasks (e.g., GPT and ResNet) to showcase that +GeN optimizers match the state-of-the-art performance, which was achieved with +carefully tuned learning rate schedulers. Code to be released at +\url{https://github.com/ShiyunXu/AutoGeN}. + +
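+ The flavour of a Hessian-informed, forward-pass-only learning-rate rule can be sketched as follows (our own finite-difference illustration of the idea; the paper derives the exact estimator):
+
+import torch
+
+def hessian_informed_lr(model_loss, params, direction, probe=1e-3):
+    # Probe the loss along the proposed update direction with two extra
+    # forward passes and fit a one-dimensional quadratic to pick the step size.
+    def loss_at(t):
+        with torch.no_grad():
+            shifted = [p - t * d for p, d in zip(params, direction)]
+        return float(model_loss(shifted))
+
+    l0, l1, l2 = loss_at(0.0), loss_at(probe), loss_at(2 * probe)
+    curvature = (l2 - 2 * l1 + l0) / probe ** 2   # second finite difference
+    slope = (l1 - l0) / probe                     # first finite difference
+    if curvature <= 0:
+        return probe                              # non-convex probe: fall back to a tiny step
+    return max(-slope / curvature, 0.0)           # argmin of the fitted quadratic
+
+# Toy check on f(w) = 0.5 * ||w||^2, where the best step along -grad is 1.0.
+w = [torch.tensor([3.0, -2.0])]
+grad = [w[0].clone()]
+lr = hessian_informed_lr(lambda ps: 0.5 * (ps[0] ** 2).sum(), w, grad)
+print(lr)  # approximately 1.0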
+
+
+
+
+ + ☆ Large language models, physics-based modeling, experimental + measurements: the trinity of data-scarce learning of polymer properties + + +
+ Large language models (LLMs) bear promise as a fast and accurate material +modeling paradigm for evaluation, analysis, and design. Their vast number of +trainable parameters necessitates a wealth of data to achieve accuracy and +mitigate overfitting. However, experimental measurements are often limited and +costly to obtain in sufficient quantities for finetuning. To this end, we +present a physics-based training pipeline that tackles the pathology of data +scarcity. The core enabler is a physics-based modeling framework that generates +a multitude of synthetic data to align the LLM to a physically consistent +initial state before finetuning. Our framework features a two-phase training +strategy: (1) utilizing the abundant but less accurate synthetic data +for supervised pretraining, and (2) finetuning the phase-1 model with limited +experimental data. We empirically demonstrate that supervised pretraining is +vital to obtaining accurate finetuned LLMs, through the lens of learning polymer +flammability metrics where cone calorimeter data is sparse. + +
+
+
+
+
+ + ☆ SF-GNN: Self Filter for Message Lossless Propagation in Deep Graph + Neural Network + + +
+ Graph Neural Network (GNN), with the main idea of encoding graph structure +information of graphs by propagation and aggregation, has developed rapidly. It +achieved excellent performance in representation learning of multiple types of +graphs such as homogeneous graphs, heterogeneous graphs, and more complex +graphs like knowledge graphs. However, merely stacking GNN layers may not +improve the model's performance and can even be detrimental. For the phenomenon +of performance degradation in deep GNNs, we propose a new perspective. Unlike +the popular explanations of over-smoothing or over-squashing, we think the +issue arises from the interference of low-quality node representations during +message propagation. We introduce a simple and general method, SF-GNN, to +address this problem. In SF-GNN, we define two representations for each node, +one is the node representation that represents the feature of the node itself, +and the other is the message representation specifically for propagating +messages to neighbor nodes. A self-filter module evaluates the quality of the +node representation and decides whether to integrate it into the message +propagation based on this quality assessment. Experiments on node +classification tasks for both homogeneous and heterogeneous graphs, as well as +link prediction tasks on knowledge graphs, demonstrate that our method can be +applied to various GNN models and outperforms state-of-the-art baseline methods +in addressing deep GNN degradation. + +
+
+
+
+
+ + ☆ Multi-Scenario Combination Based on Multi-Agent Reinforcement Learning + to Optimize the Advertising Recommendation System + + +
+ This paper explores multi-scenario optimization on large platforms using +multi-agent reinforcement learning (MARL). We address this by treating +scenarios like search, recommendation, and advertising as a cooperative, +partially observable multi-agent decision problem. We introduce the Multi-Agent +Recurrent Deterministic Policy Gradient (MARDPG) algorithm, which aligns +different scenarios under a shared objective and allows for strategy +communication to boost overall performance. Our results show marked +improvements in metrics such as click-through rate (CTR), conversion rate, and +total sales, confirming our method's efficacy in practical settings. + +
+
+ comment: Accepted by 2024 5th International Conference on Artificial + Intelligence and Electromechanical Automation IEEE (ISBN: 979-8-3503-6617-4) +
+
+
+
+
+ + ☆ Differential Encoding for Improved Representation Learning over Graphs + + +
+ Combining the message-passing paradigm with the global attention mechanism +has emerged as an effective framework for learning over graphs. The +message-passing paradigm and the global attention mechanism fundamentally +generate node embeddings based on information aggregated from a node's local +neighborhood or from the whole graph. The most basic and commonly used +aggregation approach is to take the sum of information from a node's local +neighbourhood or from the whole graph. However, it is unknown if the dominant +information is from a node itself or from the node's neighbours (or the rest of +the graph nodes). Therefore, information is lost at each layer of +embedding generation, and this loss can accumulate and become +more severe as more layers are used in the model. In this paper, we present +a differential encoding method to address this information loss. The +idea of our method is to encode the differential representation between the +information from a node's neighbours (or the rest of the graph nodes) and that +from the node itself. The obtained differential encoding is then combined with +the original aggregated local or global representation to generate the updated +node embedding. By integrating differential encodings, the representational +ability of generated node embeddings is improved. The differential encoding +method is empirically evaluated on different graph tasks on seven benchmark +datasets. The results show that it is a general method that improves the +message-passing update and the global attention update, advancing the +state-of-the-art performance for graph representation learning on these +datasets. + +
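+ A minimal message-passing layer with the differential term, sketched in PyTorch on a dense adjacency matrix (module and variable names are ours, not the paper's implementation):
+
+import torch
+import torch.nn as nn
+
+class DifferentialEncodingLayer(nn.Module):
+    # Adds an encoding of (aggregated neighbour info - node's own state)
+    # to the plain sum aggregation before the node update.
+    def __init__(self, dim):
+        super().__init__()
+        self.diff_mlp = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())
+        self.update = nn.Linear(2 * dim, dim)
+
+    def forward(self, h, adj):
+        # h: (N, dim) node states, adj: (N, N) dense adjacency (toy setting).
+        neighbour_sum = adj @ h                       # plain sum aggregation
+        diff = self.diff_mlp(neighbour_sum - h)       # differential encoding
+        return torch.relu(self.update(torch.cat([neighbour_sum + diff, h], dim=-1)))
+
+h = torch.randn(5, 16)
+adj = (torch.rand(5, 5) > 0.5).float()
+layer = DifferentialEncodingLayer(16)
+print(layer(h, adj).shape)   # torch.Size([5, 16])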
+
+
+
+
+ + ♻ ☆ Dual Latent State Learning: Exploiting Regional Network Similarities for + QoS Prediction + + +
+ Individual objects, whether users or services, within a specific region often +exhibit similar network states due to their shared origin from the same city or +autonomous system (AS). Despite this regional network similarity, many existing +techniques overlook its potential, resulting in subpar performance arising from +challenges such as data sparsity and label imbalance. In this paper, we +introduce the regional-based dual latent state learning network(R2SL), a novel +deep learning framework designed to overcome the pitfalls of traditional +individual object-based prediction techniques in Quality of Service (QoS) +prediction. Unlike its predecessors, R2SL captures the nuances of regional +network behavior by deriving two distinct regional network latent states: the +city-network latent state and the AS-network latent state. These states are +constructed utilizing aggregated data from common regions rather than +individual object data. Furthermore, R2SL adopts an enhanced Huber loss +function that adjusts its linear loss component, providing a remedy for +prevalent label imbalance issues. To cap off the prediction process, a +multi-scale perception network is leveraged to interpret the integrated feature +map, a fusion of regional network latent features and other pertinent +information, ultimately accomplishing the QoS prediction. Through rigorous +testing on real-world QoS datasets, R2SL demonstrates superior performance +compared to prevailing state-of-the-art methods. Our R2SL approach ushers in an +innovative avenue for precise QoS predictions by fully harnessing the regional +network similarities inherent in objects. + +
+
+
+
+
+ + ♻ ☆ Found in the Middle: Calibrating Positional Attention Bias Improves Long + Context Utilization ACL + + +
+ Large language models (LLMs), even when specifically trained to process long +input contexts, struggle to capture relevant information located in the middle +of their input. This phenomenon is known as the lost-in-the-middle +problem. In this work, we make three contributions. First, we set out to +understand the factors that cause this phenomenon. In doing so, we establish a +connection between lost-in-the-middle and LLMs' intrinsic attention bias: LLMs +exhibit a U-shaped attention bias where the tokens at the beginning and at the +end of the input receive higher attention, regardless of their relevance. +Second, we mitigate this positional bias through a calibration mechanism, +found-in-the-middle, that allows the model to attend to contexts faithfully +according to their relevance, even when they are in the middle. Third, +we show that found-in-the-middle not only achieves better performance in locating +relevant information within a long context, but also eventually leads to +improved retrieval-augmented generation (RAG) performance across various tasks, +outperforming existing methods by up to 15 percentage points. These findings +open up future directions in understanding LLM attention bias and its potential +consequences. + +
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Large-scale Pre-trained Models are Surprisingly Strong in Incremental + Novel Class Discovery ICPR 2024 + + +
+ Discovering novel concepts in unlabelled datasets and in a continuous manner +is an important desideratum of lifelong learners. In the literature such +problems have been partially addressed under very restricted settings, where +novel classes are learned by jointly accessing a related labelled set (e.g., +NCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD). +In this work we challenge the status quo in class-iNCD and propose a learning +paradigm where class discovery occurs continuously and truly unsupervisedly, +without needing any related labelled set. In detail, we propose to exploit the +richer priors from strong self-supervised pre-trained models (PTM). To this +end, we propose simple baselines, composed of a frozen PTM backbone and a +learnable linear classifier, that are not only simple to implement but also +resilient under longer learning scenarios. We conduct extensive empirical +evaluation on a multitude of benchmarks and show the effectiveness of our +proposed baselines when compared with sophisticated state-of-the-art methods. +The code is open source. + +
+
+ comment: Accepted as a conference paper to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Naturalistic Music Decoding from EEG Data via Latent Diffusion Models + + +
+ In this article, we explore the potential of using latent diffusion models, a +family of powerful generative models, for the task of reconstructing +naturalistic music from electroencephalogram (EEG) recordings. Unlike simpler +music with limited timbres, such as MIDI-generated tunes or monophonic pieces, +the focus here is on intricate music featuring a diverse array of instruments, +voices, and effects, rich in harmonics and timbre. This study represents an +initial foray into achieving general music reconstruction of high-quality using +non-invasive EEG data, employing an end-to-end training approach directly on +raw data without the need for manual pre-processing and channel selection. We +train our models on the public NMED-T dataset and perform quantitative +evaluation proposing neural embedding-based metrics. We additionally perform +song classification based on the generated tracks. Our work contributes to the +ongoing research in neural decoding and brain-computer interfaces, offering +insights into the feasibility of using EEG data for complex auditory +information reconstruction. + +
+
+
+
+
+ + ♻ ☆ An AI Architecture with the Capability to Explain Recognition Results + + +
+ Explainability is needed to establish confidence in machine learning results. +Some explainable methods take a post hoc approach to explain the weights of +machine learning models, others highlight areas of the input contributing to +decisions. These methods do not adequately explain decisions, in plain terms. +Explainable property-based systems have been shown to provide explanations in +plain terms, however, they have not performed as well as leading unexplainable +machine learning methods. This research focuses on the importance of metrics to +explainability and contributes two methods yielding performance gains. The +first method introduces a combination of explainable and unexplainable flows, +proposing a metric to characterize explainability of a decision. The second +method compares classic metrics for estimating the effectiveness of neural +networks in the system, posing a new metric as the leading performer. Results +from the new methods and examples from handwritten datasets are presented. + +
+
+
+
+
+ + ♻ ☆ Adam-mini: Use Fewer Learning Rates To Gain More + + +
+ We propose Adam-mini, an optimizer that achieves on-par or better performance +than AdamW with 45% to 50% less memory footprint. Adam-mini reduces memory by +cutting down the learning rate resources in Adam (i.e., $1/\sqrt{v}$). We find +that $\geq$ 90% of these learning rates in $v$ could be harmlessly removed if +we (1) carefully partition the parameters into blocks following our proposed +principle on Hessian structure; (2) assign a single but good learning rate to +each parameter block. We further find that, for each of these parameter blocks, +there exists a single high-quality learning rate that can outperform Adam, +provided that sufficient resources are available to search it out. We then +provide one cost-effective way to find good learning rates and propose +Adam-mini. Empirically, we verify that Adam-mini performs on par or better than +AdamW on various language models sized from 125M to 7B for pre-training, +supervised fine-tuning, and RLHF. The reduced memory footprint of Adam-mini +also alleviates communication overheads among GPUs and CPUs, thereby increasing +throughput. For instance, Adam-mini achieves 49.6% higher throughput than AdamW +when pre-training Llama2-7B on $2\times$ A800-80GB GPUs, which saves 33% +wall-clock time for pre-training. + +
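+ The block-wise learning-rate idea can be sketched as follows; for simplicity each parameter tensor stands in for one block, whereas the paper partitions further according to Hessian structure (a toy re-implementation of the described rule, not the released optimizer):
+
+import torch
+
+class BlockwiseAdamSketch:
+    # Keeps one second-moment scalar per parameter block instead of one per weight.
+    def __init__(self, named_params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
+        self.groups = [(n, p) for n, p in named_params if p.requires_grad]
+        self.lr, self.b1, self.b2, self.eps = lr, *betas, eps
+        self.m = {n: torch.zeros_like(p) for n, p in self.groups}
+        self.v = {n: 0.0 for n, _ in self.groups}     # single scalar per block
+        self.t = 0
+
+    @torch.no_grad()
+    def step(self):
+        self.t += 1
+        for n, p in self.groups:
+            g = p.grad
+            self.m[n].mul_(self.b1).add_(g, alpha=1 - self.b1)
+            self.v[n] = self.b2 * self.v[n] + (1 - self.b2) * g.pow(2).mean().item()
+            m_hat = self.m[n] / (1 - self.b1 ** self.t)
+            v_hat = self.v[n] / (1 - self.b2 ** self.t)
+            p.add_(m_hat / (v_hat ** 0.5 + self.eps), alpha=-self.lr)
+
+# usage: opt = BlockwiseAdamSketch(model.named_parameters()); loss.backward(); opt.step()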
+
+
+
+
+ + ♻ ☆ CaLMQA: Exploring culturally specific long-form question answering + across 23 languages + + +
+ Large language models (LLMs) are used for long-form question answering +(LFQA), which requires them to generate paragraph-length answers to complex +questions. While LFQA has been well-studied in English, this research has not +been extended to other languages. To bridge this gap, we introduce CaLMQA, a +collection of 1.5K complex culturally specific questions spanning 23 languages +and 51 culturally agnostic questions translated from English into 22 other +languages. We define culturally specific questions as those uniquely or more +likely to be asked by people from cultures associated with the question's +language. We collect naturally-occurring questions from community web forums +and hire native speakers to write questions to cover under-resourced, +rarely-studied languages such as Fijian and Kirundi. Our dataset contains +diverse, complex questions that reflect cultural topics (e.g. traditions, laws, +news) and the language usage of native speakers. We automatically evaluate a +suite of open- and closed-source models on CaLMQA by detecting incorrect +language and token repetitions in answers, and observe that the quality of +LLM-generated answers degrades significantly for some low-resource languages. +Lastly, we perform human evaluation on a subset of models and languages. Manual +evaluation reveals that model performance is significantly worse for culturally +specific questions than for culturally agnostic questions. Our findings +highlight the need for further research in non-English LFQA and provide an +evaluation framework. + +
+
+ comment: 39 pages, 17 figures. Code and data available at + https://github.com/2015aroras/CaLMQA. Revised argument in section 4, results + unchanged +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Offline Multi-agent Reinforcement Learning with + Safety Constraints + + +
+ In recent advancements in Multi-agent Reinforcement Learning (MARL), its +application has extended to various safety-critical scenarios. However, most +methods focus on online learning, which presents substantial risks when +deployed in real-world settings. Addressing this challenge, we introduce an +innovative framework integrating diffusion models within the MARL paradigm. +This approach notably enhances the safety of actions taken by multiple agents +through risk mitigation while modeling coordinated action. Our framework is +grounded in the Centralized Training with Decentralized Execution (CTDE) +architecture, augmented by a Diffusion Model for prediction trajectory +generation. Additionally, we incorporate a specialized algorithm to further +ensure operational safety. We evaluate our model against baselines on the DSRL +benchmark. Experiment results demonstrate that our model not only adheres to +stringent safety constraints but also achieves superior performance compared to +existing methodologies. This underscores the potential of our approach in +advancing the safety and efficacy of MARL in real-world applications. + +
+
+
+
+
+ + ♻ ☆ Unveiling and Controlling Anomalous Attention Distribution in + Transformers + + +
+ With the advent of large models based on the Transformer architecture, +researchers have observed an anomalous phenomenon in the Attention +mechanism--there is a very high attention on the first element, which is +prevalent across Transformer-based models. Understanding this behavior is crucial for +the development of techniques focusing on attention distribution, such as +Key-Value (KV) Cache compression and infinite extrapolation; however, its +latent cause remains unknown. In this paper, we analyze this phenomenon +from the perspective of the waiver phenomenon, which involves reducing the internal +values of certain elements in the sequence, allowing them to absorb excess +attention without affecting their contribution to information. In specific +models, due to differences in positional encoding and attention patterns, we +have found that the selection of waiver elements by the model can be +categorized into two methods: positional-encoding-based and +feature-distribution-within-elements-based. + +
+
+
+
+
+ + ♻ ☆ YZS-model: A Predictive Model for Organic Drug Solubility Based on Graph + Convolutional Networks and Transformer-Attention + + +
+ The accurate prediction of drug molecule solubility is essential for +determining therapeutic effectiveness and safety, influencing the drug's +ADME processes. Traditional solubility prediction techniques often fail to +capture the complex nature of molecular structures, leading to notable +deviations between predictions and actual results. For example, in a discussion +of advanced drug-like compound structures, Lusci highlighted issues in +capturing crucial cyclic structural information in molecules with ring +structures. To overcome this issue, our research introduces a novel deep +learning framework combining attention-based transformers, Long Short-Term +Memory (LSTM) networks, and Graph Convolutional Networks (GCN), aimed at +enhancing the precision of solubility predictions. Utilizing a training set of +9,943 compounds and testing on an anticancer compound dataset, our method +achieved a correlation coefficient ($R^2$) of 0.55 and a Root Mean Square Error +(RMSE) of 0.59, which outperforms the benchmark models' scores of 0.52 ($R^2$) +and 0.61 (RMSE). Importantly, in an additional independent test, our model +significantly outperformed the baseline with an RMSE of 1.05 compared to 1.28, +a relative accuracy improvement of 45.9%. This research not only demonstrates +the vast potential of deep learning for improving solubility prediction +accuracy but also offers novel insights for drug design and selection in the +future. Continued efforts will be directed towards optimizing the model +architecture and extending its application to better support the drug +development process, underscoring the pivotal role of deep learning in drug +discovery. + +
+
+ comment: 18 pages, 12 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Backstepping Neural Operators for $2\times 2$ Hyperbolic PDEs + + +
+ Deep neural network approximation of nonlinear operators, commonly referred +to as DeepONet, has proven capable of approximating PDE backstepping designs in +which a single Goursat-form PDE governs a single feedback gain function. In +boundary control of coupled PDEs, coupled Goursat-form PDEs govern two or more +gain kernels-a PDE structure unaddressed thus far with DeepONet. In this paper, +we explore the subject of approximating systems of gain kernel PDEs for +hyperbolic PDE plants by considering a simple counter-convecting $2\times 2$ +coupled system in whose control a $2\times 2$ kernel PDE system in Goursat form +arises. Engineering applications include oil drilling, the Saint-Venant model +of shallow water waves, and the Aw-Rascle-Zhang model of stop-and-go +instability in congested traffic flow. We establish the continuity of the +mapping from a total of five plant PDE functional coefficients to the kernel +PDE solutions, prove the existence of an arbitrarily close DeepONet +approximation to the kernel PDEs, and ensure that the DeepONet-approximated +gains guarantee stabilization when replacing the exact backstepping gain +kernels. Taking into account anti-collocated boundary actuation and sensing, +our $L^2$-Globally-exponentially stabilizing (GES) approximate gain +kernel-based output feedback design implies the deep learning of both the +controller's and the observer's gains. Moreover, the encoding of the +output-feedback law into DeepONet ensures semi-global practical exponential +stability (SG-PES). The DeepONet operator speeds up the computation of the +controller gains by multiple orders of magnitude. Its theoretically proven +stabilizing capability is demonstrated through simulations. + +
+
+
+
+
+ + ♻ ☆ VITS : Variational Inference Thompson Sampling for contextual bandits + + +
+ In this paper, we introduce and analyze a variant of the Thompson sampling
+(TS) algorithm for contextual bandits. At each round, traditional TS requires
+samples from the current posterior distribution, which is usually intractable.
+To circumvent this issue, approximate inference techniques can be used to
+provide samples whose distribution is close to the posterior. However, current
+approximate techniques either yield poor estimation (Laplace approximation) or
+are computationally expensive (MCMC methods, ensemble sampling, ...). To
+address this, we propose a new algorithm, Variational Inference Thompson
+Sampling (VITS), based on Gaussian Variational Inference. This scheme provides
+powerful posterior approximations which are easy to sample from, and is
+computationally efficient, making it an ideal choice for TS. In addition, we
+show that VITS achieves a sub-linear regret bound of the same order in the
+dimension and number of rounds as traditional TS for linear contextual
+bandits. Finally, we demonstrate experimentally the effectiveness of VITS on
+both synthetic and real-world datasets.
+
+
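+
+ For orientation only, a toy Thompson-sampling loop for a linear contextual
+bandit in which each arm's posterior is kept Gaussian; in VITS this Gaussian
+would be fitted by variational inference rather than the conjugate update
+sketched here, and all sizes below are arbitrary:
+ <pre>
+import numpy as np
+
+rng = np.random.default_rng(0)
+d, n_arms, T, sigma2 = 5, 3, 2000, 0.25      # toy problem sizes
+theta_true = rng.normal(size=(n_arms, d))
+
+# per-arm Gaussian "posterior" q_a = N(mu_a, prec_a^{-1}), prior N(0, I)
+prec = np.stack([np.eye(d) for _ in range(n_arms)])
+b = np.zeros((n_arms, d))
+mu = np.zeros((n_arms, d))
+
+for t in range(T):
+    x = rng.normal(size=d)                                    # context
+    samples = [rng.multivariate_normal(mu[a], np.linalg.inv(prec[a]))
+               for a in range(n_arms)]                        # theta_a ~ q_a
+    a = int(np.argmax([x @ s for s in samples]))              # act on samples
+    r = x @ theta_true[a] + rng.normal(scale=np.sqrt(sigma2)) # observe reward
+    prec[a] += np.outer(x, x) / sigma2       # conjugate update; VITS would
+    b[a] += r * x / sigma2                   # instead fit this Gaussian by
+    mu[a] = np.linalg.solve(prec[a], b[a])   # variational inference
+ </pre>
+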
+
+
+
+
+ + ♻ ☆ Are demographically invariant models and representations in medical + imaging fair? + + +
+ Medical imaging models have been shown to encode information about patient +demographics such as age, race, and sex in their latent representation, raising +concerns about their potential for discrimination. Here, we ask whether +requiring models not to encode demographic attributes is desirable. We point +out that marginal and class-conditional representation invariance imply the +standard group fairness notions of demographic parity and equalized odds, +respectively. In addition, however, they require matching the risk +distributions, thus potentially equalizing away important group differences. +Enforcing the traditional fairness notions directly instead does not entail +these strong constraints. Moreover, representationally invariant models may +still take demographic attributes into account for deriving predictions, +implying unequal treatment - in fact, achieving representation invariance may +require doing so. In theory, this can be prevented using counterfactual notions +of (individual) fairness or invariance. We caution, however, that properly +defining medical image counterfactuals with respect to demographic attributes +is fraught with challenges. Finally, we posit that encoding demographic +attributes may even be advantageous if it enables learning a task-specific +encoding of demographic features that does not rely on social constructs such +as 'race' and 'gender.' We conclude that demographically invariant +representations are neither necessary nor sufficient for fairness in medical +imaging. Models may need to encode demographic attributes, lending further +urgency to calls for comprehensive model fairness assessments in terms of +predictive performance across diverse patient groups. + +
+
+
+
+
+ + ♻ ☆ Estimating Treatment Effects under Recommender Interference: A + Structured Neural Networks Approach + + +
+ Recommender systems are essential for content-sharing platforms by curating +personalized content. To evaluate updates to recommender systems targeting +content creators, platforms frequently rely on creator-side randomized +experiments. The treatment effect measures the change in outcomes when a new +algorithm is implemented compared to the status quo. We show that the standard +difference-in-means estimator can lead to biased estimates due to recommender +interference that arises when treated and control creators compete for +exposure. We propose a "recommender choice model" that describes which item +gets exposed from a pool containing both treated and control items. By +combining a structural choice model with neural networks, this framework +directly models the interference pathway while accounting for rich +viewer-content heterogeneity. We construct a debiased estimator of the +treatment effect and prove it is $\sqrt n$-consistent and asymptotically normal +with potentially correlated samples. We validate our estimator's empirical +performance with a field experiment on Weixin short-video platform. In addition +to the standard creator-side experiment, we conduct a costly double-sided +randomization design to obtain a benchmark estimate free from interference +bias. We show that the proposed estimator yields results comparable to the +benchmark, whereas the standard difference-in-means estimator can exhibit +significant bias and even produce reversed signs. + +
+
+
+
+
+ + ♻ ☆ Malign Overfitting: Interpolation Can Provably Preclude Invariance + + +
+ Learned classifiers should often possess certain invariance properties meant +to encourage fairness, robustness, or out-of-distribution generalization. +However, multiple recent works empirically demonstrate that common +invariance-inducing regularizers are ineffective in the over-parameterized +regime, in which classifiers perfectly fit (i.e. interpolate) the training +data. This suggests that the phenomenon of "benign overfitting", in which +models generalize well despite interpolating, might not favorably extend to +settings in which robustness or fairness are desirable. + In this work we provide a theoretical justification for these observations. +We prove that -- even in the simplest of settings -- any interpolating learning +rule (with arbitrarily small margin) will not satisfy these invariance +properties. We then propose and analyze an algorithm that -- in the same +setting -- successfully learns a non-interpolating classifier that is provably +invariant. We validate our theoretical observations on simulated data and the +Waterbirds dataset. + +
+
+
+
+
+ + ♻ ☆ Deconvolving Complex Neuronal Networks into Interpretable Task-Specific + Connectomes + + +
+ Task-specific functional MRI (fMRI) images provide excellent modalities for +studying the neuronal basis of cognitive processes. We use fMRI data to +formulate and solve the problem of deconvolving task-specific aggregate +neuronal networks into a set of basic building blocks called canonical +networks, to use these networks for functional characterization, and to +characterize the physiological basis of these responses by mapping them to +regions of the brain. Our results show excellent task-specificity of canonical +networks, i.e., the expression of a small number of canonical networks can be +used to accurately predict tasks; generalizability across cohorts, i.e., +canonical networks are conserved across diverse populations, studies, and +acquisition protocols; and that canonical networks have strong anatomical and +physiological basis. From a methods perspective, the problem of identifying +these canonical networks poses challenges rooted in the high dimensionality, +small sample size, acquisition variability, and noise. Our deconvolution +technique is based on non-negative matrix factorization (NMF) that identifies +canonical networks as factors of a suitably constructed matrix. We demonstrate +that our method scales to large datasets, yields stable and accurate factors, +and is robust to noise. + +
+
+ comment: 9 pages, 5 figures +
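+
+ A rough, hypothetical illustration of the deconvolution step: non-negative
+matrix factorization of a scans-by-edges connectivity matrix yields canonical
+networks (rows of H) and their per-scan expression (rows of W); the matrix
+sizes and rank below are placeholders:
+ <pre>
+import numpy as np
+from sklearn.decomposition import NMF
+
+rng = np.random.default_rng(0)
+X = np.abs(rng.normal(size=(200, 4950)))   # e.g. 200 scans x 4,950 edge weights
+
+k = 10                                     # hypothetical number of networks
+nmf = NMF(n_components=k, init="nndsvda", max_iter=500, random_state=0)
+W = nmf.fit_transform(X)                   # (200, k): expression per scan
+H = nmf.components_                        # (k, 4950): canonical networks
+# W can feed a task classifier; rows of H map back to pairs of brain regions.
+ </pre>
+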
+
+
+
+
+ + ♻ ☆ A Computational Framework for Solving Wasserstein Lagrangian Flows + + +
+ The dynamical formulation of the optimal transport can be extended through +various choices of the underlying geometry (kinetic energy), and the +regularization of density paths (potential energy). These combinations yield +different variational problems (Lagrangians), encompassing many variations of +the optimal transport problem such as the Schr\"odinger bridge, unbalanced +optimal transport, and optimal transport with physical constraints, among +others. In general, the optimal density path is unknown, and solving these +variational problems can be computationally challenging. We propose a novel +deep learning based framework approaching all of these problems from a unified +perspective. Leveraging the dual formulation of the Lagrangians, our method +does not require simulating or backpropagating through the trajectories of the +learned dynamics, and does not need access to optimal couplings. We showcase +the versatility of the proposed framework by outperforming previous approaches +for the single-cell trajectory inference, where incorporating prior knowledge +into the dynamics is crucial for correct predictions. + +
+
+
+
+
+ + ♻ ☆ Multi-domain improves out-of-distribution and data-limited scenarios for + medical image analysis + + +
+ Current machine learning methods for medical image analysis primarily focus
+on developing models tailored for their specific tasks, utilizing data within
+their target domain. These specialized models tend to be data-hungry and often
+exhibit limitations in generalizing to out-of-distribution samples. In this
+work, we show that employing models that incorporate multiple domains instead
+of specialized ones significantly alleviates the limitations observed in
+specialized models. We refer to this approach as a multi-domain model and
+compare its performance to that of specialized models. For this, we introduce
+the incorporation of diverse medical image domains, including different
+imaging modalities like X-ray, MRI, CT, and ultrasound images, as well as
+various viewpoints such as axial, coronal, and sagittal views. Our findings
+underscore the superior generalization capabilities of multi-domain models,
+particularly in scenarios characterized by limited data availability and
+out-of-distribution samples, both frequently encountered in healthcare
+applications. The integration of diverse data allows multi-domain models to
+utilize information across domains, enhancing the overall outcomes
+substantially. To illustrate, for organ recognition, a multi-domain model can
+enhance accuracy by up to 8% compared to conventional specialized models.
+
+
+
+
+
+
+ + ♻ ☆ MG-Verilog: Multi-grained Dataset Towards Enhanced LLM-assisted Verilog + Generation + + +
+ Large Language Models (LLMs) have recently shown promise in streamlining +hardware design processes by encapsulating vast amounts of domain-specific +data. In addition, they allow users to interact with the design processes +through natural language instructions, thus making hardware design more +accessible to developers. However, effectively leveraging LLMs in hardware +design necessitates providing domain-specific data during inference (e.g., +through in-context learning), fine-tuning, or pre-training. Unfortunately, +existing publicly available hardware datasets are often limited in size, +complexity, or detail, which hinders the effectiveness of LLMs in hardware +design tasks. To address this issue, we first propose a set of criteria for +creating high-quality hardware datasets that can effectively enhance +LLM-assisted hardware design. Based on these criteria, we propose a +Multi-Grained-Verilog (MG-Verilog) dataset, which encompasses descriptions at +various levels of detail and corresponding code samples. To benefit the broader +hardware design community, we have developed an open-source infrastructure that +facilitates easy access, integration, and extension of the dataset to meet +specific project needs. Furthermore, to fully exploit the potential of the +MG-Verilog dataset, which varies in complexity and detail, we introduce a +balanced fine-tuning scheme. This scheme serves as a unique use case to +leverage the diverse levels of detail provided by the dataset. Extensive +experiments demonstrate that the proposed dataset and fine-tuning scheme +consistently improve the performance of LLMs in hardware design tasks. + +
+
+ comment: Accepted in ISLAD 2024 +
+
+
+
+
+ + ♻ ☆ Discovering Nuclear Models from Symbolic Machine Learning + + +
+ Numerous phenomenological nuclear models have been proposed to describe +specific observables within different regions of the nuclear chart. However, +developing a unified model that describes the complex behavior of all nuclei +remains an open challenge. Here, we explore whether novel symbolic Machine +Learning (ML) can rediscover traditional nuclear physics models or identify +alternatives with improved simplicity, fidelity, and predictive power. To +address this challenge, we developed a Multi-objective Iterated Symbolic +Regression approach that handles symbolic regressions over multiple target +observables, accounts for experimental uncertainties and is robust against +high-dimensional problems. As a proof of principle, we applied this method to +describe the nuclear binding energies and charge radii of light and medium mass +nuclei. Our approach identified simple analytical relationships based on the +number of protons and neutrons, providing interpretable models with precision +comparable to state-of-the-art nuclear models. Additionally, we integrated this +ML-discovered model with an existing complementary model to estimate the limits +of nuclear stability. These results highlight the potential of symbolic ML to +develop accurate nuclear models and guide our description of complex many-body +problems. + +
+
+
+
+
+ + ♻ ☆ LLMs can learn self-restraint through iterative self-reflection + + +
+ In order to be deployed safely, Large Language Models (LLMs) must be capable
+of dynamically adapting their behavior based on their level of knowledge and
+uncertainty associated with specific topics. This adaptive behavior, which we
+refer to as self-restraint, is non-trivial to teach since it depends on the
+internal knowledge of an LLM. By default, LLMs are trained to maximize the
+next token likelihood, which does not teach the model to modulate its answer
+based on its level of uncertainty. In order to learn self-restraint, we devise
+a utility function that can encourage the model to produce responses only when
+it is confident in them. This utility function can be used to score
+generations of different lengths as well as abstention. To optimize this
+function, we introduce ReSearch, a process of "self-reflection" consisting of
+iterative self-prompting and self-evaluation. We use the ReSearch algorithm to
+generate synthetic data on which we finetune our models. Compared to their
+original versions, our resulting models generate fewer \emph{hallucinations}
+overall at no additional inference cost, for both known and unknown topics, as
+the model learns to selectively restrain itself. In addition, our method
+elegantly incorporates the ability to abstain by augmenting the samples
+generated by the model during the search procedure with an answer expressing
+abstention.
+
+
+
+
+
+
+ + ♻ ☆ Jamba: A Hybrid Transformer-Mamba Language Model + + +
+ We present Jamba, a new base large language model based on a novel hybrid +Transformer-Mamba mixture-of-experts (MoE) architecture. Specifically, Jamba +interleaves blocks of Transformer and Mamba layers, enjoying the benefits of +both model families. MoE is added in some of these layers to increase model +capacity while keeping active parameter usage manageable. This flexible +architecture allows resource- and objective-specific configurations. In the +particular configuration we have implemented, we end up with a powerful model +that fits in a single 80GB GPU. Built at large scale, Jamba provides high +throughput and small memory footprint compared to vanilla Transformers, and at +the same time state-of-the-art performance on standard language model +benchmarks and long-context evaluations. Remarkably, the model presents strong +results for up to 256K tokens context length. We study various architectural +decisions, such as how to combine Transformer and Mamba layers, and how to mix +experts, and show that some of them are crucial in large scale modeling. We +also describe several interesting properties of these architectures which the +training and evaluation of Jamba have revealed, and plan to release checkpoints +from various ablation runs, to encourage further exploration of this novel +architecture. We make the weights of our implementation of Jamba publicly +available under a permissive license. + +
+
+ comment: Webpage: https://www.ai21.com/jamba +
+
+
+
+
+ + ♻ ☆ Towards Efficient and Optimal Covariance-Adaptive Algorithms for + Combinatorial Semi-Bandits + + +
+ We address the problem of stochastic combinatorial semi-bandits, where a
+player selects among $P$ actions from the power set of a set containing $d$
+base items. Adaptivity to the problem's structure is essential in order to
+obtain optimal regret upper bounds. As estimating the coefficients of a
+covariance matrix can be manageable in practice, leveraging them should
+improve the regret. We design ``optimistic'' covariance-adaptive algorithms
+relying on online estimations of the covariance structure, called OLSUCBC and
+COSV (the latter using only the variances). Both yield improved gap-free
+regret. Although COSV can be slightly suboptimal, it improves on computational
+complexity by taking inspiration from Thompson Sampling approaches. It is the
+first sampling-based algorithm satisfying a $\sqrt{T}$ gap-free regret (up to
+poly-logs). We also show that in some cases, our approach efficiently
+leverages the semi-bandit feedback and outperforms bandit feedback approaches,
+not only in exponential regimes where $P\gg d$ but also when $P\leq d$, which
+is not covered by existing analyses.
+
+
+
+
+
+
+ + ♻ ☆ Fredformer: Frequency Debiased Transformer for Time Series Forecasting KDD2024 + + +
+ The Transformer model has shown leading performance in time series +forecasting. Nevertheless, in some complex scenarios, it tends to learn +low-frequency features in the data and overlook high-frequency features, +showing a frequency bias. This bias prevents the model from accurately +capturing important high-frequency data features. In this paper, we undertook +empirical analyses to understand this bias and discovered that frequency bias +results from the model disproportionately focusing on frequency features with +higher energy. Based on our analysis, we formulate this bias and propose +Fredformer, a Transformer-based framework designed to mitigate frequency bias +by learning features equally across different frequency bands. This approach +prevents the model from overlooking lower amplitude features important for +accurate forecasting. Extensive experiments show the effectiveness of our +proposed approach, which can outperform other baselines in different real-world +time-series datasets. Furthermore, we introduce a lightweight variant of the +Fredformer with an attention matrix approximation, which achieves comparable +performance but with much fewer parameters and lower computation costs. The +code is available at: https://github.com/chenzRG/Fredformer + +
+
+ comment: This paper has been accepted by SIGKDD2024 +
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling
+complex tasks across diverse domains, but they also raise privacy concerns
+when fine-tuned on sensitive data due to potential memorization. While
+differential privacy (DP) offers a promising solution by ensuring models are
+'almost indistinguishable' with or without any particular privacy unit,
+current evaluations on LLMs mostly treat each example (text record) as the
+privacy unit. This leads to uneven user privacy guarantees when contributions
+per user vary. We therefore study user-level DP, motivated by applications
+where it is necessary to ensure uniform privacy protection across users. We
+present a systematic evaluation of user-level DP for LLM fine-tuning on
+natural language generation tasks. Focusing on two mechanisms for achieving
+user-level DP guarantees, Group Privacy and User-wise DP-SGD, we investigate
+design choices like data selection strategies and parameter tuning for the
+best privacy-utility tradeoff.
+
+
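+
+ A hedged sketch of the user-wise DP-SGD idea mentioned above: each sampled
+user's whole gradient contribution is clipped as one unit before noise is
+added, so the guarantee attaches to users rather than individual records.
+Function and argument names are ours, not the paper's:
+ <pre>
+import torch
+
+def user_level_dp_step(model, user_batches, loss_fn,
+                       clip_norm=1.0, noise_mult=1.0, lr=1e-3):
+    """One update where each user's whole contribution is clipped as a unit."""
+    params = [p for p in model.parameters() if p.requires_grad]
+    summed = [torch.zeros_like(p) for p in params]
+    for batch in user_batches:            # one batch per sampled user
+        model.zero_grad()
+        loss_fn(model, batch).backward()
+        grads = [p.grad.detach().clone() if p.grad is not None
+                 else torch.zeros_like(p) for p in params]
+        norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
+        scale = (clip_norm / (norm + 1e-12)).clamp(max=1.0)
+        for s, g in zip(summed, grads):
+            s.add_(g * scale)             # clipped per-user gradient sum
+    with torch.no_grad():
+        for p, s in zip(params, summed):
+            noise = torch.randn_like(s) * noise_mult * clip_norm
+            p.add_(-lr * (s + noise) / len(user_batches))
+ </pre>
+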
+
+
+
+
+ + ♻ ☆ Noise Contrastive Alignment of Language Models with Explicit Rewards + + +
+ User intentions are typically formalized as evaluation rewards to be +maximized when fine-tuning language models (LMs). Existing alignment methods, +such as Direct Preference Optimization (DPO), are mainly tailored for pairwise +preference data where rewards are implicitly defined rather than explicitly +given. In this paper, we introduce a general framework for LM alignment, +leveraging Noise Contrastive Estimation (NCE) to bridge the gap in handling +reward datasets explicitly annotated with scalar evaluations. Our framework +comprises two parallel algorithms, NCA and InfoNCA, both enabling the direct +extraction of an LM policy from reward data as well as preference data. +Notably, we show that the DPO loss is a special case of our proposed InfoNCA +objective under pairwise preference settings, thereby integrating and extending +current alignment theories. By comparing NCA and InfoNCA, we demonstrate that +the well-observed decreasing-likelihood trend of DPO/InfoNCA is caused by their +focus on adjusting relative likelihood across different responses. In contrast, +NCA optimizes the absolute likelihood for each response, thereby effectively +preventing the chosen likelihood from decreasing. We evaluate our methods in +both reward and preference settings with Mistral-8*7B and 7B models. +Experiments suggest that InfoNCA/NCA surpasses various preference baselines +when reward datasets are available. We also find NCA significantly outperforms +DPO in complex reasoning tasks like math and coding. + +
+
+
+
+
+ + ♻ ☆ Smaug: Fixing Failure Modes of Preference Optimisation with DPO-Positive + + +
+ Direct Preference Optimisation (DPO) is effective at significantly improving +the performance of large language models (LLMs) on downstream tasks such as +reasoning, summarisation, and alignment. Using pairs of preferred and +dispreferred data, DPO models the relative probability of picking one response +over another. In this work, first we show theoretically that the standard DPO +loss can lead to a reduction of the model's likelihood of the preferred +examples, as long as the relative probability between the preferred and +dispreferred classes increases. We then show empirically that this phenomenon +occurs when fine-tuning LLMs on common datasets, especially datasets in which +the edit distance between pairs of completions is low. Using these insights, we +design DPO-Positive (DPOP), a new loss function and training procedure which +avoids this failure mode. Surprisingly, we find that DPOP outperforms DPO and +other fine-tuning procedures across a wide variety of datasets and downstream +tasks, including datasets with high edit distances between completions. +Furthermore, we find that the DPOP-tuned model outperforms the DPO-tuned model +(all else equal) on benchmarks independent of the fine-tuning data, such as +MT-Bench. Finally, using DPOP, we create and open-source Smaug-34B and +Smaug-72B, with the latter becoming the first open-source LLM to surpass an +average accuracy of 80% on the HuggingFace Open LLM Leaderboard. + +
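+
+ As a hedged, simplified reading of the idea (not necessarily the paper's
+exact formulation), a DPO-style loss can be augmented with a term that is
+positive only when the preferred completion's log-likelihood falls below its
+reference value; beta and lam are illustrative hyperparameters:
+ <pre>
+import torch
+import torch.nn.functional as F
+
+def dpo_positive_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
+                      beta=0.1, lam=50.0):
+    # logp_* : summed log-probs of the preferred (w) / dispreferred (l)
+    # completions under the policy; ref_logp_* under the frozen reference.
+    ratio_w = logp_w - ref_logp_w
+    ratio_l = logp_l - ref_logp_l
+    # penalty is positive only when the preferred completion became less
+    # likely than it was under the reference model
+    penalty = torch.clamp(ref_logp_w - logp_w, min=0.0)
+    return -F.logsigmoid(beta * (ratio_w - ratio_l - lam * penalty)).mean()
+ </pre>
+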
+
+
+
+
+ + ♻ ☆ NeuraLUT: Hiding Neural Network Density in Boolean Synthesizable + Functions + + +
+ Field-Programmable Gate Array (FPGA) accelerators have proven successful in +handling latency- and resource-critical deep neural network (DNN) inference +tasks. Among the most computationally intensive operations in a neural network +(NN) is the dot product between the feature and weight vectors. Thus, some +previous FPGA acceleration works have proposed mapping neurons with quantized +inputs and outputs directly to lookup tables (LUTs) for hardware +implementation. In these works, the boundaries of the neurons coincide with the +boundaries of the LUTs. We propose relaxing these boundaries and mapping entire +sub-networks to a single LUT. As the sub-networks are absorbed within the LUT, +the NN topology and precision within a partition do not affect the size of the +lookup tables generated. Therefore, we utilize fully connected layers with +floating-point precision inside each partition, which benefit from being +universal function approximators, but with rigid sparsity and quantization +enforced between partitions, where the NN topology becomes exposed to the +circuit topology. Although cheap to implement, this approach can lead to very +deep NNs, and so to tackle challenges like vanishing gradients, we also +introduce skip connections inside the partitions. The resulting methodology can +be seen as training DNNs with a specific FPGA hardware-inspired sparsity +pattern that allows them to be mapped to much shallower circuit-level networks, +thereby significantly improving latency. We validate our proposed method on a +known latency-critical task, jet substructure tagging, and on the classical +computer vision task, digit classification using MNIST. Our approach allows for +greater function expressivity within the LUTs compared to existing work, +leading to up to $4.3\times$ lower latency NNs for the same accuracy. + +
+
+
+
+
+ + ♻ ☆ UFRec: Integrating Uniformity and Frequency to Enhance Sequential + Recommendations + + +
+ Effective representation learning in sequential recommendation systems is +pivotal for precisely capturing user interaction patterns and enhancing +recommendation accuracy. Nonetheless, current methodologies largely focus on +item-to-item transitions, frequently overlooking the time intervals between +interactions, which are integral to understanding behavior pattern shifts. +Moreover, critical interaction attributes like item frequency are often +neglected. Our research indicates that sequences with more consistent time +intervals and items with higher interaction frequency result in superior +predictive performance. In contrast, sequences with non-uniform intervals +contribute to user interest drift, and infrequently interacted items are +challenging to model due to sparse data, posing unique challenges that existing +methods fail to adequately address. In this study, we introduce UFRec, an +innovative bidirectional enhancement method for sequential recommendations. +UFRec harnesses sequence uniformity and item frequency to boost performance, +particularly improving the representation of non-uniform sequences and +less-frequent items. These two components synergistically enhance each other, +driving holistic performance optimization in intricate sequential +recommendation scenarios. Additionally, we introduce a multidimensional time +module to further augment adaptability. To the best of our knowledge, UFRec is +the pioneering method to exploit the properties of uniformity and frequency for +feature augmentation. Through comparisons with eleven state-of-the-art models +across four datasets, we demonstrate that UFRec significantly surpasses current +leading models. + +
+
+ comment: 15 pages, 8 figures, for source code, see + https://github.com/Linxi000/UniRec +
+
+
+
+
+ + ♻ ☆ PWM: Policy Learning with Large World Models + + +
+ Reinforcement Learning (RL) has achieved impressive results on complex tasks +but struggles in multi-task settings with different embodiments. World models +offer scalability by learning a simulation of the environment, yet they often +rely on inefficient gradient-free optimization methods. We introduce Policy +learning with large World Models (PWM), a novel model-based RL algorithm that +learns continuous control policies from large multi-task world models. By +pre-training the world model on offline data and using it for first-order +gradient policy learning, PWM effectively solves tasks with up to 152 action +dimensions and outperforms methods using ground-truth dynamics. Additionally, +PWM scales to an 80-task setting, achieving up to 27% higher rewards than +existing baselines without the need for expensive online planning. +Visualizations and code available at https://www.imgeorgiev.com/pwm + +
+
+ comment: Visualizations and code available at https://www.imgeorgiev.com/pwm +
+
+
+
+
+ + ♻ ☆ Explanations Based on Item Response Theory (eXirt): A Model-Specific + Method to Explain Tree-Ensemble Model in Trust Perspective + + +
+ In recent years, XAI researchers have been formalizing proposals and
+developing new methods to explain black box models, yet there is no general
+consensus in the community on which method to use, and the choice is often
+driven largely by a method's popularity. Methods such as Ciu, Dalex, Eli5,
+Lofo, Shap and Skater emerged with the proposal to explain black box models
+through global rankings of feature relevance, which, based on different
+methodologies, generate global explanations that indicate how the model's
+inputs explain its predictions. In this context, 41 datasets, 4 tree-ensemble
+algorithms (Light Gradient Boosting, CatBoost, Random Forest, and Gradient
+Boosting), and 6 XAI methods were used to support the launch of a new XAI
+method, called eXirt, based on Item Response Theory (IRT) and aimed at
+tree-ensemble black box models that use tabular data for binary
+classification problems. In the first set of analyses, the 164 global feature
+relevance ranks of eXirt were compared with 984 ranks of the other XAI methods
+present in the literature, seeking to highlight their similarities and
+differences. In a second analysis, explanations exclusive to eXirt, based on
+explanation-by-example, were presented to help in understanding model trust.
+Thus, it was verified that eXirt is able to generate global explanations of
+tree-ensemble models and also local explanations of model instances through
+IRT, showing how this consolidated theory can be used in machine learning to
+obtain explainable and reliable models.
+
+
+
+ comment: 59 pages, 16 figures, 3 equations, 6 tables
+
+
+
+
+
+ + ♻ ☆ ArchesWeather: An efficient AI weather forecasting model at 1.5° + resolution ICML 2024 + + +
+ One of the guiding principles for designing AI-based weather forecasting +systems is to embed physical constraints as inductive priors in the neural +network architecture. A popular prior is locality, where the atmospheric data +is processed with local neural interactions, like 3D convolutions or 3D local +attention windows as in Pangu-Weather. On the other hand, some works have shown +great success in weather forecasting without this locality principle, at the +cost of a much higher parameter count. In this paper, we show that the 3D local +processing in Pangu-Weather is computationally sub-optimal. We design +ArchesWeather, a transformer model that combines 2D attention with a +column-wise attention-based feature interaction module, and demonstrate that +this design improves forecasting skill. + ArchesWeather is trained at 1.5{\deg} resolution and 24h lead time, with a +training budget of a few GPU-days and a lower inference cost than competing +methods. An ensemble of four of our models shows better RMSE scores than the +IFS HRES and is competitive with the 1.4{\deg} 50-members NeuralGCM ensemble +for one to three days ahead forecasting. Our code and models are publicly +available at https://github.com/gcouairon/ArchesWeather. + +
+
+ comment: Accepted at the Machine Learning for Earth System Modeling Workshop + at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Thermodynamics-informed super-resolution of scarce temporal dynamics + data + + +
+ We present a method to increase the resolution of measurements of a physical
+system and subsequently predict its time evolution using thermodynamics-aware
+neural networks. Our method uses adversarial autoencoders, which reduce the
+dimensionality of the full order model to a set of latent variables that are
+enforced to match a prior, for example a normal distribution. Adversarial
+autoencoders are seen as generative models, and they can be trained to
+generate high-resolution samples from low-resolution inputs, meaning they can
+address the so-called super-resolution problem. Then, a second neural network
+is trained to learn the physical structure of the latent variables and predict
+their temporal evolution. This neural network is known as a
+structure-preserving neural network. It learns the metriplectic structure of
+the system and applies a physical bias to ensure that the first and second
+principles of thermodynamics are fulfilled. The integrated trajectories are
+decoded to their original dimensionality, as well as to the higher-dimensional
+space produced by the adversarial autoencoder, and they are compared to the
+ground truth solution. The method is tested with two examples of flow over a
+cylinder, where the fluid properties are varied between the two examples.
+
+
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Semantic Segmentation via Marginal Contextual + Information + + +
+ We present a novel confidence refinement scheme that enhances pseudo labels +in semi-supervised semantic segmentation. Unlike existing methods, which filter +pixels with low-confidence predictions in isolation, our approach leverages the +spatial correlation of labels in segmentation maps by grouping neighboring +pixels and considering their pseudo labels collectively. With this contextual +information, our method, named S4MC, increases the amount of unlabeled data +used during training while maintaining the quality of the pseudo labels, all +with negligible computational overhead. Through extensive experiments on +standard benchmarks, we demonstrate that S4MC outperforms existing +state-of-the-art semi-supervised learning approaches, offering a promising +solution for reducing the cost of acquiring dense annotations. For example, +S4MC achieves a 1.39 mIoU improvement over the prior art on PASCAL VOC 12 with +366 annotated images. The code to reproduce our experiments is available at +https://s4mcontext.github.io/ + +
+
+ comment: Published at TMLR +
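+
+ A much-simplified sketch of the underlying idea (S4MC's actual refinement
+uses marginal contextual probabilities; this stand-in merely averages class
+probabilities over a spatial window before thresholding the pseudo labels):
+ <pre>
+import torch
+import torch.nn.functional as F
+
+def refine_and_mask(probs, threshold=0.9, kernel=3):
+    # probs: (N, C, H, W) softmax outputs on unlabeled images
+    smoothed = F.avg_pool2d(probs, kernel, stride=1, padding=kernel // 2)
+    conf, pseudo = smoothed.max(dim=1)       # contextual confidence + label
+    return pseudo, conf >= threshold         # pseudo labels and keep-mask
+ </pre>
+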
+
+
+
+
+ + ♻ ☆ Online Variational Sequential Monte Carlo + + +
+ Being the most classical generative model for serial data, state-space models +(SSM) are fundamental in AI and statistical machine learning. In SSM, any form +of parameter learning or latent state inference typically involves the +computation of complex latent-state posteriors. In this work, we build upon the +variational sequential Monte Carlo (VSMC) method, which provides +computationally efficient and accurate model parameter estimation and Bayesian +latent-state inference by combining particle methods and variational inference. +While standard VSMC operates in the offline mode, by re-processing repeatedly a +given batch of data, we distribute the approximation of the gradient of the +VSMC surrogate ELBO in time using stochastic approximation, allowing for online +learning in the presence of streams of data. This results in an algorithm, +online VSMC, that is capable of performing efficiently, entirely on-the-fly, +both parameter estimation and particle proposal adaptation. In addition, we +provide rigorous theoretical results describing the algorithm's convergence +properties as the number of data tends to infinity as well as numerical +illustrations of its excellent convergence properties and usefulness also in +batch-processing settings. + +
+
+ comment: In this version there are better explanatory figures for the + simulations in Section 5, and some text improvements/typos fixed +
+
+
+
+
+ + ♻ ☆ Spectral Estimators for Structured Generalized Linear Models via + Approximate Message Passing + + +
+ We consider the problem of parameter estimation in a high-dimensional +generalized linear model. Spectral methods obtained via the principal +eigenvector of a suitable data-dependent matrix provide a simple yet +surprisingly effective solution. However, despite their wide use, a rigorous +performance characterization, as well as a principled way to preprocess the +data, are available only for unstructured (i.i.d.\ Gaussian and Haar +orthogonal) designs. In contrast, real-world data matrices are highly +structured and exhibit non-trivial correlations. To address the problem, we +consider correlated Gaussian designs capturing the anisotropic nature of the +features via a covariance matrix $\Sigma$. Our main result is a precise +asymptotic characterization of the performance of spectral estimators. This +allows us to identify the optimal preprocessing that minimizes the number of +samples needed for parameter estimation. Surprisingly, such preprocessing is +universal across a broad set of designs, which partly addresses a conjecture on +optimal spectral estimators for rotationally invariant models. Our principled +approach vastly improves upon previous heuristic methods, including for designs +common in computational imaging and genetics. The proposed methodology, based +on approximate message passing, is broadly applicable and opens the way to the +precise characterization of spiked matrices and of the corresponding spectral +methods in a variety of settings. + +
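+
+ To make the setting concrete, a generic spectral estimator builds a
+data-dependent matrix from preprocessed responses and returns its principal
+eigenvector; the preprocessing T below is a naive illustrative choice, not the
+optimal one derived in the paper:
+ <pre>
+import numpy as np
+
+def spectral_estimate(X, y, T=lambda y: y):
+    n = X.shape[0]
+    D = (X * T(y)[:, None]).T @ X / n      # (1/n) sum_i T(y_i) x_i x_i^T
+    _, eigvecs = np.linalg.eigh(D)
+    return eigvecs[:, -1]                  # principal eigenvector
+
+# toy phase-retrieval-style check: y_i = (x_i . beta)^2 + noise
+rng = np.random.default_rng(0)
+n, d = 5000, 20
+beta = rng.normal(size=d); beta /= np.linalg.norm(beta)
+X = rng.normal(size=(n, d))
+y = (X @ beta) ** 2 + 0.1 * rng.normal(size=n)
+print("|cos| with true beta:", abs(spectral_estimate(X, y) @ beta))
+ </pre>
+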
+
+
+
+
+ + ♻ ☆ When Benchmarks are Targets: Revealing the Sensitivity of Large Language + Model Leaderboards ACL 2024 + + +
+ Large Language Model (LLM) leaderboards based on benchmark rankings are +regularly used to guide practitioners in model selection. Often, the published +leaderboard rankings are taken at face value - we show this is a (potentially +costly) mistake. Under existing leaderboards, the relative performance of LLMs +is highly sensitive to (often minute) details. We show that for popular +multiple-choice question benchmarks (e.g., MMLU), minor perturbations to the +benchmark, such as changing the order of choices or the method of answer +selection, result in changes in rankings up to 8 positions. We explain this +phenomenon by conducting systematic experiments over three broad categories of +benchmark perturbations and identifying the sources of this behavior. Our +analysis results in several best-practice recommendations, including the +advantage of a hybrid scoring method for answer selection. Our study highlights +the dangers of relying on simple benchmark evaluations and charts the path for +more robust evaluation schemes on the existing benchmarks. The code for this +paper is available at +https://github.com/National-Center-for-AI-Saudi-Arabia/lm-evaluation-harness. + +
+
+ comment: updated with ACL 2024 camera ready version +
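+
+ As a sketch of the kind of perturbation studied, one can shuffle the order of
+answer choices and check whether a model's prediction survives; score_fn is a
+placeholder for whichever likelihood- or generation-based answer-selection
+method is in use:
+ <pre>
+import random
+
+def prediction_is_stable(question, choices, correct_idx, score_fn,
+                         n_perm=5, seed=0):
+    """Re-score an MCQ item under shuffled choice orders."""
+    rng = random.Random(seed)
+    hits = []
+    for _ in range(n_perm):
+        order = list(range(len(choices)))
+        rng.shuffle(order)
+        scores = [score_fn(question, choices[i]) for i in order]
+        best = max(range(len(order)), key=lambda j: scores[j])
+        hits.append(order[best] == correct_idx)   # correct under this order?
+    return all(h == hits[0] for h in hits), hits
+ </pre>
+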
+
+
+
+
+ + ♻ ☆ Meta-Learning Based Optimization for Large Scale Wireless Systems + + +
+ Optimization algorithms for wireless systems play a fundamental role in +improving their performance and efficiency. However, it is known that the +complexity of conventional optimization algorithms in the literature often +exponentially increases with the number of transmit antennas and communication +users in the wireless system. Therefore, in the large scale regime, the +astronomically large complexity of these optimization algorithms prohibits +their use and prevents assessing large scale wireless systems performance under +optimized conditions. To overcome this limitation, this work proposes instead +the use of an unsupervised meta-learning based approach to directly perform +non-convex optimization at significantly reduced complexity. To demonstrate the +effectiveness of the proposed meta-learning based solution, the sum-rate (SR) +maximization problem for the following three emerging 6G technologies is +contemplated: hierarchical rate-splitting multiple access (H-RSMA), integrated +sensing and communication (ISAC), and beyond-diagonal reconfigurable +intelligent surfaces (BD-RIS). Through numerical results, it is demonstrated +that the proposed meta-learning based optimization framework is able to +successfully optimize the performance and also reveal unknown aspects of the +operation in the large scale regime for the considered three 6G technologies. + +
+
+
+
+
+ + ♻ ☆ Mixture-of-Experts for Open Set Domain Adaptation: A Dual-Space + Detection Approach + + +
+ Open Set Domain Adaptation (OSDA) aims to cope with the distribution and +label shifts between the source and target domains simultaneously, performing +accurate classification for known classes while identifying unknown class +samples in the target domain. Most existing OSDA approaches, depending on the +final image feature space of deep models, require manually-tuned thresholds, +and may easily misclassify unknown samples as known classes. Mixture-of-Experts +(MoE) could be a remedy. Within a MoE, different experts handle distinct input +features, producing unique expert routing patterns for various classes in a +routing feature space. As a result, unknown class samples may display different +expert routing patterns to known classes. In this paper, we propose Dual-Space +Detection, which exploits the inconsistencies between the image feature space +and the routing feature space to detect unknown class samples without any +threshold. Graph Router is further introduced to better make use of the spatial +information among image patches. Experiments on three different datasets +validated the effectiveness and superiority of our approach. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Explainable AI for Comparative Analysis of Intrusion Detection Models + + +
+ Explainable Artificial Intelligence (XAI) has become a widely discussed
+topic, and the related technologies facilitate a better understanding of
+conventional black-box models such as Random Forests and Neural Networks.
+However, domain-specific applications of XAI are still insufficient. To fill
+this gap, this research applies various machine learning models to the tasks
+of binary and multi-class classification for intrusion detection from network
+traffic, using occlusion sensitivity on the same dataset. The models evaluated
+include Linear Regression, Logistic Regression, Linear Support Vector Machine
+(SVM), K-Nearest Neighbors (KNN), Random Forest, Decision Trees, and
+Multi-Layer Perceptrons (MLP). We trained all models to an accuracy of 90\% on
+the UNSW-NB15 Dataset. We found that most classifiers leverage fewer than
+three critical features to achieve such accuracies, indicating that effective
+feature engineering could actually be far more important for intrusion
+detection than applying complicated models. We also discover that Random
+Forest provides the best performance in terms of accuracy, time efficiency and
+robustness. Data and code available at
+https://github.com/pcwhy/XML-IntrusionDetection.git
+
+
+
+ comment: Submitted to IEEE MeditCom 2024 - WS-05 +
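+
+ A small self-contained sketch of occlusion sensitivity for tabular features
+(synthetic data stands in for UNSW-NB15): each feature is occluded in turn by
+replacing it with its training mean, and the resulting accuracy drop ranks how
+critical the feature is:
+ <pre>
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+
+X, y = make_classification(n_samples=2000, n_features=10,
+                           n_informative=3, random_state=0)
+Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
+clf = RandomForestClassifier(random_state=0).fit(Xtr, ytr)
+base_acc = clf.score(Xte, yte)
+
+drops = []
+for j in range(X.shape[1]):
+    X_occ = Xte.copy()
+    X_occ[:, j] = Xtr[:, j].mean()         # occlude feature j
+    drops.append(base_acc - clf.score(X_occ, yte))
+print(sorted(enumerate(drops), key=lambda t: -t[1])[:3])  # most critical ones
+ </pre>
+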
+
+
+
+
+ + ♻ ☆ ChatGPT Code Detection: Techniques for Uncovering the Source of Code + + +
+ In recent times, large language models (LLMs) have made significant strides
+in generating computer code, blurring the lines between code created by humans
+and code produced by artificial intelligence (AI). As these technologies
+evolve rapidly, it is crucial to explore how they influence code generation,
+especially given the risk of misuse in areas like higher education. This paper
+explores this issue by using advanced classification techniques to
+differentiate between code written by humans and that generated by ChatGPT, a
+type of LLM. We employ a new approach that combines powerful embedding
+features (black-box) with supervised learning algorithms - including Deep
+Neural Networks, Random Forests, and Extreme Gradient Boosting - to achieve
+this differentiation with an impressive accuracy of 98%. For the successful
+combinations, we also examine their model calibration, showing that some of
+the models are extremely well calibrated. Additionally, we present white-box
+features and an interpretable Bayes classifier to elucidate critical
+differences between the code sources, enhancing the explainability and
+transparency of our approach. Both approaches work well but provide at most
+85-88% accuracy. We also show that untrained humans solve the same task no
+better than random guessing. This study is crucial in understanding and
+mitigating the potential risks associated with using AI in code generation,
+particularly in the context of higher education, software development, and
+competitive programming.
+
+
+
+ comment: Accepted for publication in MDPI AI Journal +
+
+
+
+
+ + ♻ ☆ Federated Continual Learning Goes Online: Leveraging Uncertainty for + Modality-Agnostic Class-Incremental Learning + + +
+ Given its ability to model more realistic and dynamic problems, Federated
+Continual Learning (FCL) has been increasingly investigated recently. A
+well-known problem encountered in this setting is the so-called catastrophic
+forgetting, whereby the learning model is inclined to focus on more recent
+tasks while forgetting previously learned knowledge. The majority of current
+approaches in FCL propose generative solutions to this problem. However, this
+setting requires multiple training epochs over the data, implying an offline
+setting where datasets are stored locally and remain unchanged over time.
+Furthermore, the proposed solutions are tailored solely for vision tasks. To
+overcome these limitations, we propose a new modality-agnostic approach to
+deal with the online scenario where new data arrive in streams of mini-batches
+that can only be processed once. To solve catastrophic forgetting, we propose
+an uncertainty-aware memory-based approach. In particular, we suggest using an
+estimator based on the Bregman Information (BI) to compute the model's
+variance at the sample level. Through measures of predictive uncertainty, we
+retrieve samples with specific characteristics, and - by retraining the model
+on such samples - we demonstrate the potential of this approach to reduce the
+forgetting effect in realistic settings.
+
+
+
+
+
+
+ + ♻ ☆ ENOT: Expectile Regularization for Fast and Accurate Training of Neural + Optimal Transport + + +
+ We present a new approach to the Neural Optimal Transport (NOT) training
+procedure, capable of accurately and efficiently estimating the optimal
+transportation plan via specific regularization on dual Kantorovich
+potentials. The main bottleneck of existing NOT solvers is associated with the
+procedure of finding a near-exact approximation of the conjugate operator
+(i.e., the c-transform), which is done either by optimizing over non-convex
+max-min objectives or by the computationally intensive fine-tuning of the
+initial approximated prediction. We resolve both issues by proposing a new,
+theoretically justified loss in the form of expectile regularisation, which
+enforces binding conditions on the learning process of the dual potentials.
+Such a regularization provides an upper bound estimate over the distribution
+of possible conjugate potentials and makes the learning stable, completely
+eliminating the need for additional extensive fine-tuning. The proposed
+method, called Expectile-Regularised Neural Optimal Transport (ENOT),
+outperforms previous state-of-the-art approaches on the established
+Wasserstein-2 benchmark tasks by a large margin (up to a 3-fold improvement in
+quality and up to a 10-fold improvement in runtime). Moreover, we showcase the
+performance of ENOT for varying cost functions on different tasks such as
+image generation, demonstrating the robustness of the proposed algorithm. The
+OTT-JAX library includes our implementation of the ENOT algorithm:
+https://ott-jax.readthedocs.io/en/latest/tutorials/ENOT.html
+
+
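+
+ For readers unfamiliar with expectiles, a minimal sketch of the asymmetric
+squared loss that expectile regularisation builds on (the value of tau and how
+the term enters ENOT's objective are illustrative assumptions, not the paper's
+exact setup):
+ <pre>
+import torch
+
+def expectile_loss(pred, target, tau=0.99):
+    # asymmetric squared error: over- and under-shooting are weighted
+    # differently, so tau close to 1 pushes pred toward an upper envelope
+    diff = target - pred
+    weight = torch.where(diff > 0,
+                         torch.full_like(diff, tau),
+                         torch.full_like(diff, 1.0 - tau))
+    return (weight * diff.pow(2)).mean()
+ </pre>
+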
+
+
+
+
+ + ♻ ☆ MLEM: Generative and Contrastive Learning as Distinct Modalities for + Event Sequences + + +
+ This study explores the application of self-supervised learning techniques
+to event sequences, a key modality in various applications such as banking,
+e-commerce, and healthcare. However, there is limited research on
+self-supervised learning for event sequences, and methods from other domains
+like images, texts, and speech may not easily transfer. To determine the most
+suitable approach, we conduct a detailed comparative analysis of previously
+identified best-performing methods. We find that neither the contrastive nor
+the generative method is superior. Our assessment includes classifying event
+sequences, predicting the next event, and evaluating embedding quality. These
+results further highlight the potential benefits of combining both methods.
+Given the lack of research on hybrid models in this domain, we initially adapt
+the baseline model from another domain. However, upon observing its
+underperformance, we develop a novel method called the Multimodal-Learning
+Event Model (MLEM). MLEM treats contrastive learning and generative modeling
+as distinct yet complementary modalities, aligning their embeddings. The
+results of our study demonstrate that combining contrastive and generative
+approaches into one procedure with MLEM achieves superior performance across
+multiple metrics.
+
+
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Multi-Agent Probabilistic Ensembles with Trajectory Sampling for + Connected Autonomous Vehicles + + +
+ Autonomous Vehicles (AVs) have attracted significant attention in recent
+years, and Reinforcement Learning (RL) has shown remarkable performance in
+improving the autonomy of vehicles. In that regard, the widely adopted
+Model-Free RL (MFRL) promises to solve decision-making tasks in connected AVs
+(CAVs), contingent on the availability of a significant amount of data samples
+for training. Nevertheless, this might be infeasible in practice and possibly
+lead to learning instability. In contrast, Model-Based RL (MBRL) manifests
+itself in sample-efficient learning, but the asymptotic performance of MBRL
+might lag behind state-of-the-art MFRL algorithms. Furthermore, most studies
+for CAVs are limited to the decision-making of a single AV only, thus limiting
+performance due to the absence of communication. In this study, we address the
+decision-making problem of multiple CAVs with limited communications and
+propose a decentralized Multi-Agent Probabilistic Ensembles with Trajectory
+Sampling algorithm, MA-PETS. In particular, in order to better capture the
+uncertainty of the unknown environment, MA-PETS leverages Probabilistic
+Ensemble (PE) neural networks to learn from communicated samples among
+neighboring CAVs. Afterwards, MA-PETS develops Trajectory Sampling (TS)-based
+model-predictive control for decision-making. On this basis, we derive the
+multi-agent group regret bound affected by the number of agents within the
+communication range and mathematically validate that incorporating effective
+information exchange among agents into the multi-agent learning scheme
+contributes to reducing the group regret bound in the worst case. Finally, we
+empirically demonstrate the superiority of MA-PETS in terms of sample
+efficiency compared with MFRL.
+
+
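+
+ To fix ideas, a condensed sketch of the trajectory-sampling planning step
+each agent would run on top of a probabilistic ensemble (the ensemble itself,
+the learned reward, and the exchange of samples between CAVs are all
+abstracted away as placeholder callables):
+ <pre>
+import numpy as np
+
+def ts_plan(state, ensemble, reward_fn, horizon=10,
+            n_candidates=200, act_dim=2, rng=None):
+    """Pick the first action of the best randomly sampled action sequence."""
+    rng = rng or np.random.default_rng()
+    plans = rng.uniform(-1.0, 1.0, size=(n_candidates, horizon, act_dim))
+    returns = np.zeros(n_candidates)
+    for i, plan in enumerate(plans):
+        s = np.array(state, dtype=float)
+        for a in plan:
+            model = ensemble[rng.integers(len(ensemble))]  # random member
+            s = model(s, a)                                # sampled next state
+            returns[i] += reward_fn(s, a)
+    return plans[np.argmax(returns)][0]
+ </pre>
+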
+
+
+
+
+ + ♻ ☆ Explainable AI for Safe and Trustworthy Autonomous Driving: A Systematic + Review + + +
+ Artificial Intelligence (AI) shows promising applications for the perception +and planning tasks in autonomous driving (AD) due to its superior performance +compared to conventional methods. However, inscrutable AI systems exacerbate +the existing challenge of safety assurance of AD. One way to mitigate this +challenge is to utilize explainable AI (XAI) techniques. To this end, we +present the first comprehensive systematic literature review of explainable +methods for safe and trustworthy AD. We begin by analyzing the requirements for +AI in the context of AD, focusing on three key aspects: data, model, and +agency. We find that XAI is fundamental to meeting these requirements. Based on +this, we explain the sources of explanations in AI and describe a taxonomy of +XAI. We then identify five key contributions of XAI for safe and trustworthy AI +in AD, which are interpretable design, interpretable surrogate models, +interpretable monitoring, auxiliary explanations, and interpretable validation. +Finally, we propose a modular framework called SafeX to integrate these +contributions, enabling explanation delivery to users while simultaneously +ensuring the safety of AI models. + +
+
+
+
+
+ + ♻ ☆ Expressivity of Graph Neural Networks Through the Lens of Adversarial + Robustness ICML + + +
+ We perform the first adversarial robustness study into Graph Neural Networks +(GNNs) that are provably more powerful than traditional Message Passing Neural +Networks (MPNNs). In particular, we use adversarial robustness as a tool to +uncover a significant gap between their theoretically possible and empirically +achieved expressive power. To do so, we focus on the ability of GNNs to count +specific subgraph patterns, which is an established measure of expressivity, +and extend the concept of adversarial robustness to this task. Based on this, +we develop efficient adversarial attacks for subgraph counting and show that +more powerful GNNs fail to generalize even to small perturbations to the +graph's structure. Expanding on this, we show that such architectures also fail +to count substructures on out-of-distribution graphs. + +
+
+ comment: Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$ + International Conference on Machine Learning (ICML) +
+
+
+
+
+ + ♻ ☆ Nuisances via Negativa: Adjusting for Spurious Correlations via Data + Augmentation + + +
+ In prediction tasks, there exist features that are related to the label in +the same way across different settings for that task; these are semantic +features or semantics. Features with varying relationships to the label are +nuisances. For example, in detecting cows from natural images, the shape of the +head is semantic but because images of cows often have grass backgrounds but +not always, the background is a nuisance. Models that exploit nuisance-label +relationships face performance degradation when these relationships change. +Building models robust to such changes requires additional knowledge beyond +samples of the features and labels. For example, existing work uses annotations +of nuisances or assumes ERM-trained models depend on nuisances. Approaches to +integrate new kinds of additional knowledge enlarge the settings where robust +models can be built. We develop an approach to use knowledge about the +semantics by corrupting them in data, and then using the corrupted data to +produce models which identify correlations between nuisances and the label. +Once these correlations are identified, they can be used to adjust for where +nuisances drive predictions. We study semantic corruptions in powering +different spurious-correlation avoiding methods on multiple out-of-distribution +(OOD) tasks like classifying waterbirds, natural language inference (NLI), and +detecting cardiomegaly in chest X-rays. + +
+
+
+
+
+ + ♻ ☆ RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and + Efficiency Assessment of Medical Image Segmentation Models + + +
+ Deep learning techniques hold immense promise for advancing medical image +analysis, particularly in tasks like image segmentation, where precise +annotation of regions or volumes of interest within medical images is crucial +but manually laborious and prone to interobserver and intraobserver biases. As +such, deep learning approaches could provide automated solutions for such +applications. However, the potential of these techniques is often undermined by +challenges in reproducibility and generalizability, which are key barriers to +their clinical adoption. This paper introduces the RIDGE checklist, a +comprehensive framework designed to assess the Reproducibility, Integrity, +Dependability, Generalizability, and Efficiency of deep learning-based medical +image segmentation models. The RIDGE checklist is not just a tool for +evaluation but also a guideline for researchers striving to improve the quality +and transparency of their work. By adhering to the principles outlined in the +RIDGE checklist, researchers can ensure that their developed segmentation +models are robust, scientifically valid, and applicable in a clinical setting. + +
+
+ comment: 24 pages, 1 Figure, 2 Table +
+
+
+
+
+ + ♻ ☆ How Deep Neural Networks Learn Compositional Data: The Random Hierarchy + Model + + +
+ Deep learning algorithms demonstrate a surprising ability to learn +high-dimensional tasks from limited examples. This is commonly attributed to +the depth of neural networks, enabling them to build a hierarchy of abstract, +low-dimensional data representations. However, how many training examples are +required to learn such representations remains unknown. To quantitatively study +this question, we introduce the Random Hierarchy Model: a family of synthetic +tasks inspired by the hierarchical structure of language and images. The model +is a classification task where each class corresponds to a group of high-level +features, chosen among several equivalent groups associated with the same +class. In turn, each feature corresponds to a group of sub-features chosen +among several equivalent ones and so on, following a hierarchy of composition +rules. We find that deep networks learn the task by developing internal +representations invariant to exchanging equivalent groups. Moreover, the number +of data required corresponds to the point where correlations between low-level +features and classes become detectable. Overall, our results indicate how deep +networks overcome the curse of dimensionality by building invariant +representations, and provide an estimate of the number of data required to +learn a hierarchical task. + +
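+
+<p>The following is a toy generator written in the spirit of the Random
+Hierarchy Model described above; the number of classes, vocabulary size,
+branching factor, and depth are arbitrary illustrative choices rather than
+the paper's settings.</p>
+
+<pre>
+# Toy hierarchical-composition generator: each class expands into one of
+# several equivalent groups of high-level features, and each feature expands
+# recursively into one of several equivalent groups of sub-features.
+import random
+
+NUM_CLASSES = 4        # number of labels
+NUM_FEATURES = 8       # feature vocabulary size at every level
+GROUPS_PER_SYMBOL = 3  # equivalent groups (synonymous productions) per symbol
+GROUP_SIZE = 2         # children produced by each group
+DEPTH = 3              # levels of composition rules
+
+random.seed(0)
+
+# Fixed production rules per level: symbol -> list of equivalent groups.
+rules = [
+    {s: [tuple(random.randrange(NUM_FEATURES) for _ in range(GROUP_SIZE))
+         for _ in range(GROUPS_PER_SYMBOL)]
+     for s in range(NUM_CLASSES if level == 0 else NUM_FEATURES)}
+    for level in range(DEPTH)
+]
+
+def sample(symbol, level=0):
+    """Expand a symbol down the hierarchy and return the leaf-level input."""
+    if level == DEPTH:
+        return [symbol]
+    group = random.choice(rules[level][symbol])  # pick one equivalent group
+    return [leaf for child in group for leaf in sample(child, level + 1)]
+
+label = random.randrange(NUM_CLASSES)
+print(label, sample(label))  # one (label, low-level input) pair from the task
+</pre>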
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Systematic Performance Analysis of Deep Perceptual Loss Networks: + Breaking Transfer Learning Conventions + + +
+ In recent years, deep perceptual loss has been widely and successfully used +to train machine learning models for many computer vision tasks, including +image synthesis, segmentation, and autoencoding. Deep perceptual loss is a type +of loss function for images that computes the error between two images as the +distance between deep features extracted from a neural network. Most +applications of the loss use pretrained networks called loss networks for deep +feature extraction. However, despite increasingly widespread use, the effects +of loss network implementation on the trained models have not been studied. + This work rectifies this through a systematic evaluation of the effect of +different pretrained loss networks on four different application areas. +Specifically, the work evaluates 14 different pretrained architectures with +four different feature extraction layers. The evaluation reveals that VGG +networks without batch normalization have the best performance and that the +choice of feature extraction layer is at least as important as the choice of +architecture. The analysis also reveals that deep perceptual loss does not +adhere to the transfer learning conventions that better ImageNet accuracy +implies better downstream performance and that feature extraction from the +later layers provides better performance. + +
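+
+<p>As a concrete illustration of the loss family studied above, the sketch
+below computes a deep perceptual loss with torchvision's pretrained VGG16 as
+the loss network; the chosen feature-extraction layer and the MSE distance
+are illustrative assumptions, not the exact configurations evaluated in the
+paper.</p>
+
+<pre>
+# Deep perceptual loss: the error between two images is the distance between
+# deep features extracted by a frozen, pretrained "loss network" (VGG16 here).
+import torch
+import torch.nn as nn
+from torchvision.models import vgg16, VGG16_Weights
+
+class PerceptualLoss(nn.Module):
+    def __init__(self, feature_layer=16):
+        super().__init__()
+        features = vgg16(weights=VGG16_Weights.IMAGENET1K_V1).features
+        # Keep layers up to and including the chosen feature-extraction layer.
+        self.extractor = nn.Sequential(*list(features)[:feature_layer + 1]).eval()
+        for p in self.extractor.parameters():
+            p.requires_grad_(False)  # the loss network stays frozen
+
+    def forward(self, prediction, target):
+        # Compare deep features rather than raw pixels.
+        return nn.functional.mse_loss(self.extractor(prediction),
+                                      self.extractor(target))
+
+loss_fn = PerceptualLoss()
+pred = torch.rand(1, 3, 224, 224)
+ref = torch.rand(1, 3, 224, 224)
+print(loss_fn(pred, ref).item())
+</pre>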
+
+
+
+
+ + ♻ ☆ A primer on synthetic health data + + +
+ Recent advances in deep generative models have greatly expanded the potential +to create realistic synthetic health datasets. These synthetic datasets aim to +preserve the characteristics, patterns, and overall scientific conclusions +derived from sensitive health datasets without disclosing patient identity or +sensitive information. Thus, synthetic data can facilitate safe data sharing +that supports a range of initiatives including the development of new +predictive models, advanced health IT platforms, and general project ideation +and hypothesis development. However, many questions and challenges remain, +including how to consistently evaluate a synthetic dataset's similarity and +predictive utility in comparison to the original real dataset and risk to +privacy when shared. Additional regulatory and governance issues have not been +widely addressed. In this primer, we map the state of synthetic health data, +including generation and evaluation methods and tools, existing examples of +deployment, the regulatory and ethical landscape, access and governance +options, and opportunities for further development. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Detection of Perfect and Partial Input-Dependent + Symmetries + + +
+ Group equivariance can overly constrain models if the symmetries in the group +differ from those observed in data. While common methods address this by +determining the appropriate level of symmetry at the dataset level, they are +limited to supervised settings and ignore scenarios in which multiple levels of +symmetry co-exist in the same dataset. In this paper, we propose a method able +to detect the level of symmetry of each input without the need for labels. Our +framework is general enough to accommodate different families of both +continuous and discrete symmetry distributions, such as arbitrary unimodal, +symmetric distributions and discrete groups. We validate the effectiveness of +our approach on synthetic datasets with different per-class levels of +symmetries, and demonstrate practical applications such as the detection of +out-of-distribution symmetries. Our code is publicly available at +https://github.com/aurban0/ssl-sym. + +
+
+
+
+
+ + ♻ ☆ Fairness-aware Federated Minimax Optimization with Convergence Guarantee + + +
+ Federated learning (FL) has garnered considerable attention due to its +privacy-preserving feature. Nonetheless, the lack of freedom in managing user +data can lead to group fairness issues, where models are biased towards +sensitive factors such as race or gender. To tackle this issue, this paper +proposes a novel algorithm, fair federated averaging with augmented Lagrangian +method (FFALM), designed explicitly to address group fairness issues in FL. +Specifically, we impose a fairness constraint on the training objective and +solve the minimax reformulation of the constrained optimization problem. Then, +we derive the theoretical upper bound for the convergence rate of FFALM. The +effectiveness of FFALM in improving fairness is shown empirically on CelebA and +UTKFace datasets in the presence of severe statistical heterogeneity. + +
+
+
+
+
+ + ♻ ☆ Meerkat: Audio-Visual Large Language Model for Grounding in Space and + Time ECCV 2024 + + +
+ Leveraging Large Language Models' remarkable proficiency in text-based tasks, +recent works on Multi-modal LLMs (MLLMs) extend them to other modalities like +vision and audio. However, the progress in these directions has been mostly +focused on tasks that only require a coarse-grained understanding of the +audio-visual semantics. We present Meerkat, an audio-visual LLM equipped with a +fine-grained understanding of image and audio both spatially and temporally. +With a new modality alignment module based on optimal transport and a +cross-attention module that enforces audio-visual consistency, Meerkat can +tackle challenging tasks such as audio referred image grounding, image guided +audio temporal localization, and audio-visual fact-checking. Moreover, we +carefully curate a large dataset AVFIT that comprises 3M instruction tuning +samples collected from open-source datasets, and introduce MeerkatBench that +unifies five challenging audio-visual tasks. We achieve state-of-the-art +performance on all these downstream tasks with a relative improvement of up to +37.12%. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ ULLER: A Unified Language for Learning and Reasoning + + +
+ The field of neuro-symbolic artificial intelligence (NeSy), which combines +learning and reasoning, has recently experienced significant growth. There now +are a wide variety of NeSy frameworks, each with its own specific language for +expressing background knowledge and how to relate it to neural networks. This +heterogeneity hinders accessibility for newcomers and makes comparing different +NeSy frameworks challenging. We propose a unified language for NeSy, which we +call ULLER, a Unified Language for LEarning and Reasoning. ULLER encompasses a +wide variety of settings, while ensuring that knowledge described in it can be +used in existing NeSy systems. ULLER has a neuro-symbolic first-order syntax +for which we provide example semantics including classical, fuzzy, and +probabilistic logics. We believe ULLER is a first step towards making NeSy +research more accessible and comparable, paving the way for libraries that +streamline training and evaluation across a multitude of semantics, knowledge +bases, and NeSy systems. + +
+
+ comment: Pre-review version. Final version accepted at NeSy 2024 +
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion Sampling with Optimized Time Steps CVPR 2024 + + +
+ Diffusion probabilistic models (DPMs) have shown remarkable performance in
+high-resolution image synthesis, but their sampling efficiency still leaves
+much to be desired due to the typically large number of sampling steps.
+Recent advancements in high-order numerical ODE solvers for DPMs have enabled
+the generation of high-quality images with far fewer sampling steps. While
+this is a significant development, most sampling methods still employ uniform
+time steps, which is not optimal when using a small number of steps. To
+address this issue, we propose a general framework for designing an
+optimization problem that seeks more appropriate time steps for a specific
+numerical ODE solver for DPMs. This optimization problem aims to minimize the
+distance between the ground-truth solution to the ODE and an approximate
+solution corresponding to the numerical solver. It can be efficiently solved
+using the constrained trust region method, taking less than $15$ seconds. Our
+extensive experiments on both unconditional and conditional sampling using
+pixel- and latent-space DPMs demonstrate that, when combined with the
+state-of-the-art sampling method UniPC, our optimized time steps
+significantly improve image generation performance in terms of FID scores for
+datasets such as CIFAR-10 and ImageNet, compared to using uniform time steps.
+
+</p>
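+
+<p>To make the idea concrete, the toy example below optimizes non-uniform
+time steps for a coarse Euler solver so that it tracks a dense reference
+solution of a stand-in ODE; the ODE, the solver, and the Nelder-Mead
+optimizer are simplifications rather than the DPM solvers and constrained
+trust-region setup used in the paper.</p>
+
+<pre>
+# Optimize the placement of a few solver time steps so the coarse solution
+# stays close to a dense "ground-truth" solution of the same ODE.
+import numpy as np
+from scipy.optimize import minimize
+
+def f(x, t):
+    return -x / (1.0 + t)  # stand-in for a probability-flow ODE drift
+
+def euler(x0, ts):
+    x = x0
+    for t0, t1 in zip(ts[:-1], ts[1:]):
+        x = x + (t1 - t0) * f(x, t0)
+    return x
+
+T, K, x0 = 5.0, 6, 1.0
+reference = euler(x0, np.linspace(0.0, T, 2000))  # dense reference solution
+
+def objective(interior):
+    # Keep the endpoints fixed and the interior steps sorted (monotone).
+    ts = np.concatenate(([0.0], np.sort(interior), [T]))
+    return (euler(x0, ts) - reference) ** 2
+
+uniform = np.linspace(0.0, T, K + 1)[1:-1]
+result = minimize(objective, uniform, method="Nelder-Mead")
+print("error with uniform steps:  ", objective(uniform))
+print("error with optimized steps:", result.fun)
+</pre>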
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Learning and Forgetting Unsafe Examples in Large Language Models ICML 24 + + +
+ As the number of large language models (LLMs) released to the public grows, +there is a pressing need to understand the safety implications associated with +these models learning from third-party custom finetuning data. We explore the +behavior of LLMs finetuned on noisy custom data containing unsafe content, +represented by datasets that contain biases, toxicity, and harmfulness, finding +that while aligned LLMs can readily learn this unsafe content, they also tend +to forget it more significantly than other examples when subsequently finetuned +on safer content. Drawing inspiration from the discrepancies in forgetting, we +introduce the "ForgetFilter" algorithm, which filters unsafe data based on how +strong the model's forgetting signal is for that data. We demonstrate that the +ForgetFilter algorithm ensures safety in customized finetuning without +compromising downstream task performance, unlike sequential safety finetuning. +ForgetFilter outperforms alternative strategies like replay and moral +self-correction in curbing LLMs' ability to assimilate unsafe content during +custom finetuning, e.g. 75% lower than not applying any safety measures and 62% +lower than using self-correction in toxicity score. + +
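+
+<p>A minimal numeric sketch of the filtering rule described above: each
+custom finetuning example is scored by how much its loss increases after a
+later round of safety finetuning, and the most-forgotten examples are
+dropped. The loss values and the retention quantile below are placeholders,
+not values from the paper.</p>
+
+<pre>
+# Filter custom finetuning data by the strength of the model's forgetting
+# signal, measured as the per-example loss increase after safety finetuning.
+import numpy as np
+
+rng = np.random.default_rng(0)
+examples = [f"example_{i}" for i in range(10)]
+
+loss_after_custom_ft = rng.uniform(0.2, 0.6, size=len(examples))
+# Unsafe examples tend to show a larger loss increase after safety finetuning.
+loss_after_safety_ft = loss_after_custom_ft + rng.uniform(0.0, 1.0, size=len(examples))
+
+forgetting_signal = loss_after_safety_ft - loss_after_custom_ft
+threshold = np.quantile(forgetting_signal, 0.7)  # keep the least-forgotten 70%
+
+kept = [ex for ex, s in zip(examples, forgetting_signal) if s <= threshold]
+print("kept for final custom finetuning:", kept)
+</pre>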
+
+ comment: accepted by ICML 24 +
+
+
+
+
+ + ♻ ☆ PCL-Indexability and Whittle Index for Restless Bandits with General + Observation Models + + +
+ In this paper, we consider a general observation model for restless
+multi-armed bandit problems. The operation of the player needs to be based on
+a certain feedback mechanism that is error-prone due to resource constraints
+or environmental or intrinsic noises. By establishing a general probabilistic
+model for the dynamics of feedback/observation, we formulate the problem as a
+restless bandit with a countable belief state space starting from an
+arbitrary initial belief (a priori information). We apply the achievable
+region method with partial conservation law (PCL) to the infinite-state
+problem and analyze its indexability and priority index (Whittle index).
+Finally, we propose an approximation process that transforms the problem into
+one to which the AG algorithm of Ni\~no-Mora and Bertsimas for finite-state
+problems can be applied. Numerical experiments show that our algorithm has
+excellent performance.
+
+</p>
+
+
+
+
+ + ♻ ☆ ProSparse: Introducing and Enhancing Intrinsic Activation Sparsity + within Large Language Models + + +
+ Activation sparsity refers to the existence of considerable +weakly-contributed elements among activation outputs. As a prevalent property +of the models using the ReLU activation function, activation sparsity has been +proven a promising paradigm to boost model inference efficiency. Nevertheless, +most large language models (LLMs) adopt activation functions without intrinsic +activation sparsity (e.g., GELU and Swish). Some recent efforts have explored +introducing ReLU or its variants as the substitutive activation function to +help LLMs achieve activation sparsity and inference acceleration, but few can +simultaneously obtain high sparsity and comparable model performance. This +paper introduces a simple and effective sparsification method named "ProSparse" +to push LLMs for higher activation sparsity while maintaining comparable +performance. Specifically, after substituting the activation function of LLMs +with ReLU, ProSparse adopts progressive sparsity regularization with a factor +smoothly increasing along the multi-stage sine curves. This can enhance +activation sparsity and mitigate performance degradation by avoiding radical +shifts in activation distributions. With ProSparse, we obtain high sparsity of +89.32% for LLaMA2-7B, 88.80% for LLaMA2-13B, and 87.89% for end-size +MiniCPM-1B, respectively, achieving comparable performance to their original +Swish-activated versions. These present the most sparsely activated models +among open-source LLaMA versions and competitive end-size models, considerably +surpassing ReluLLaMA-7B (66.98%) and ReluLLaMA-13B (71.56%). Our inference +acceleration experiments further demonstrate the significant practical +acceleration potential of LLMs with higher activation sparsity, obtaining up to +4.52$\times$ inference speedup. + +
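+
+<p>The snippet below sketches one way a regularization factor could rise
+smoothly along multi-stage sine curves, as described above; the stage
+boundaries, target factors, and half-sine ramp are assumptions for
+illustration and may differ from the paper's actual schedule.</p>
+
+<pre>
+# Progressive sparsity-regularization schedule: within each stage the factor
+# ramps along a half-sine curve from the previous stage's target to this one's.
+import math
+
+def progressive_factor(step, stage_ends, stage_targets):
+    """Return the regularization factor at a given training step."""
+    prev_end, prev_target = 0, 0.0
+    for end, target in zip(stage_ends, stage_targets):
+        if step < end:
+            progress = (step - prev_end) / (end - prev_end)
+            ramp = math.sin(0.5 * math.pi * progress)  # smooth 0 -> 1 ramp
+            return prev_target + (target - prev_target) * ramp
+        prev_end, prev_target = end, target
+    return stage_targets[-1]
+
+stage_ends = [1000, 3000, 6000]     # training steps at which each stage ends
+stage_targets = [1e-5, 5e-5, 2e-4]  # regularization factor reached per stage
+
+for step in (0, 500, 1000, 2000, 4500, 8000):
+    print(step, progressive_factor(step, stage_ends, stage_targets))
+</pre>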
+
+ comment: 19 pages, 4 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Tuning-Free Alignment of Diffusion Models with Direct Noise Optimization + + +
+ In this work, we focus on the alignment problem of diffusion models with a +continuous reward function, which represents specific objectives for downstream +tasks, such as improving human preference. The central goal of the alignment +problem is to adjust the distribution learned by diffusion models such that the +generated samples maximize the target reward function. We propose a novel +alignment approach, named Direct Noise Optimization (DNO), that optimizes the +injected noise during the sampling process of diffusion models. By design, DNO +is tuning-free and prompt-agnostic, as the alignment occurs in an online +fashion during generation. We rigorously study the theoretical properties of +DNO and also propose variants to deal with non-differentiable reward functions. +Furthermore, we identify that naive implementation of DNO occasionally suffers +from the out-of-distribution reward hacking problem, where optimized samples +have high rewards but are no longer in the support of the pretrained +distribution. To remedy this issue, we leverage classical high-dimensional +statistics theory and propose to augment the DNO loss with certain probability +regularization. We conduct extensive experiments on several popular reward +functions trained on human feedback data and demonstrate that the proposed DNO +approach achieves state-of-the-art reward scores as well as high image quality, +all within a reasonable time budget for generation. + +
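+
+<p>Below is a toy sketch of the noise-optimization loop described above: the
+injected noise is the optimization variable, a differentiable sampler maps it
+to a sample, and gradient ascent maximizes a reward while a simple penalty
+keeps the noise near a standard normal. The sampler, reward, and penalty are
+stand-ins, not the paper's diffusion models, reward models, or probability
+regularization.</p>
+
+<pre>
+# Direct optimization of injected noise: ascend the reward of the generated
+# sample while regularizing the noise toward the standard normal prior.
+import torch
+
+def sampler(noise):
+    # Stand-in for a differentiable diffusion sampling pass.
+    return torch.tanh(noise) * 2.0
+
+def reward(sample):
+    # Stand-in for a learned reward model (e.g., a human-preference score).
+    return -(sample - 1.0).pow(2).mean()
+
+noise = torch.randn(4, 8, requires_grad=True)
+optimizer = torch.optim.Adam([noise], lr=0.05)
+reg_weight = 0.1
+
+for step in range(200):
+    optimizer.zero_grad()
+    r = reward(sampler(noise))
+    reg = noise.pow(2).mean()       # keep the noise close to N(0, I)
+    loss = -r + reg_weight * reg    # maximize reward, minimize drift
+    loss.backward()
+    optimizer.step()
+
+print("final reward:", reward(sampler(noise)).item())
+</pre>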
+
+
+
+
+ + ♻ ☆ Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural + Network Performance + + +
+ We propose the Swish-T family, an enhancement of the existing non-monotonic +activation function Swish. Swish-T is defined by adding a Tanh bias to the +original Swish function. This modification creates a family of Swish-T +variants, each designed to excel in different tasks, showcasing specific +advantages depending on the application context. The Tanh bias allows for +broader acceptance of negative values during initial training stages, offering +a smoother non-monotonic curve than the original Swish. We ultimately propose +the Swish-T$_{\textbf{C}}$ function, while Swish-T and Swish-T$_{\textbf{B}}$, +byproducts of Swish-T$_{\textbf{C}}$, also demonstrate satisfactory +performance. Furthermore, our ablation study shows that using +Swish-T$_{\textbf{C}}$ as a non-parametric function can still achieve high +performance. The superiority of the Swish-T family has been empirically +demonstrated across various models and benchmark datasets, including MNIST, +Fashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at +https://github.com/ictseoyoungmin/Swish-T-pytorch. + +
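+
+<p>As a rough illustration only, the activation below adds a scaled Tanh bias
+to the original Swish, which is one plausible reading of the description
+above; the exact parameterization of Swish-T and its B/C variants is defined
+in the paper, and the alpha and beta parameters here are assumptions.</p>
+
+<pre>
+# Illustrative "Swish plus Tanh bias" activation; not the paper's exact form.
+import torch
+import torch.nn as nn
+
+class SwishTanhBias(nn.Module):
+    def __init__(self, alpha=0.5, beta=1.0):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.tensor(alpha))  # weight of the Tanh bias
+        self.beta = nn.Parameter(torch.tensor(beta))    # Swish slope parameter
+
+    def forward(self, x):
+        swish = x * torch.sigmoid(self.beta * x)    # original Swish
+        return swish + self.alpha * torch.tanh(x)   # additive Tanh bias
+
+act = SwishTanhBias()
+print(act(torch.linspace(-3.0, 3.0, 7)))
+</pre>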
+
+ comment: 11 pages, 6 figures Revised the derivative of the sigmoid function + from 1-sigmoid to sigmoid(1-sigmoid) for correctness.Updated related + equations in Section 3.2. Conclusions to Conclusion in Section 6 +
+
+
+
+
+ + ♻ ☆ Gradient Projection For Continual Parameter-Efficient Tuning + + +
+ Parameter-efficient tunings (PETs) have demonstrated impressive performance +and promising perspectives in training large models, while they are still +confronted with a common problem: the trade-off between learning new content +and protecting old knowledge, e.g., zero-shot generalization ability, and +cross-modal hallucination. In this paper, we reformulate Adapter, LoRA, +Prefix-tuning, and Prompt-tuning from the perspective of gradient projection, +and firstly propose a unified framework called Parameter Efficient Gradient +Projection (PEGP). We introduce orthogonal gradient projection into different +PET paradigms and theoretically demonstrate that the orthogonal condition for +the gradient can effectively resist forgetting even for large-scale models. It +therefore modifies the gradient towards the direction that has less impact on +the old feature space, with less extra memory space and training time. We +extensively evaluate our method with different backbones, including ViT and +CLIP, on diverse datasets, and experiments comprehensively demonstrate its +efficiency in reducing forgetting in class, online class, domain, task, and +multi-modality continual settings. The project page is available at +https://dmcv-ecnu-pegp.github.io/. + +
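+
+<p>The sketch below shows the core orthogonal gradient projection step the
+abstract refers to: the component of a new-task gradient lying in the
+subspace spanned by old feature directions is removed before the update. How
+the subspace basis is obtained (here, an SVD of stored old activations) and
+which parameters the projection is applied to are simplifying assumptions.</p>
+
+<pre>
+# Project a gradient onto the orthogonal complement of the old feature space
+# so that updates for new tasks disturb previously learned features less.
+import torch
+
+torch.manual_seed(0)
+old_features = torch.randn(256, 64)      # stored activations from old tasks
+U, S, _ = torch.linalg.svd(old_features.T, full_matrices=False)
+basis = U[:, :16]                        # top-16 old feature directions (64 x 16)
+
+grad = torch.randn(64)                   # gradient w.r.t. one weight row
+projected = grad - basis @ (basis.T @ grad)  # strip the old-subspace component
+
+print("overlap with old subspace before:", (basis.T @ grad).norm().item())
+print("overlap with old subspace after: ", (basis.T @ projected).norm().item())
+</pre>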
+
+
+
+
+ + ♻ ☆ DistiLLM: Towards Streamlined Distillation for Large Language Models ICML 2024 + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to +a smaller student model, reducing its inference cost and memory footprint while +preserving model capabilities. However, current KD methods for auto-regressive +sequence models (e.g., large language models) suffer from missing a +standardized objective function. Moreover, the recent use of student-generated +outputs to address training-inference mismatches has significantly escalated +computational costs. To tackle these issues, we introduce DistiLLM, a more +effective and efficient KD framework for auto-regressive language models. +DistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence +loss, where we unveil and leverage its theoretical properties, and (2) an +adaptive off-policy approach designed to enhance the efficiency in utilizing +student-generated outputs. Extensive experiments, including +instruction-following tasks, demonstrate the effectiveness of DistiLLM in +building high-performing student models while achieving up to 4.3$\times$ +speedup compared to recent KD methods. + +
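+
+<p>For reference, the snippet below computes a skew Kullback-Leibler
+divergence of the kind mentioned above, which compares the teacher
+distribution against a mixture of teacher and student distributions so the
+divergence stays finite when the student assigns (near-)zero probability; the
+mixing convention shown is one common form and may differ in detail from the
+paper's definition.</p>
+
+<pre>
+# Skew KL divergence: KL(p || alpha * p + (1 - alpha) * q) over token
+# distributions, averaged across a batch of positions.
+import torch
+
+def skew_kl(p, q, alpha=0.1):
+    mixture = alpha * p + (1.0 - alpha) * q
+    log_ratio = p.clamp_min(1e-12).log() - mixture.clamp_min(1e-12).log()
+    return (p * log_ratio).sum(-1).mean()
+
+teacher = torch.softmax(torch.randn(4, 32000), dim=-1)  # teacher next-token dist.
+student = torch.softmax(torch.randn(4, 32000), dim=-1)  # student next-token dist.
+
+print("plain KL:", skew_kl(teacher, student, alpha=0.0).item())
+print("skew  KL:", skew_kl(teacher, student, alpha=0.1).item())
+</pre>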
+
+ comment: ICML 2024; Code is available at https://github.com/jongwooko/distillm +
+
+
+
+
+ + ♻ ☆ A Fixed-Parameter Tractable Algorithm for Counting Markov Equivalence + Classes with the same Skeleton + + +
+ Causal DAGs (also known as Bayesian networks) are a popular tool for encoding +conditional dependencies between random variables. In a causal DAG, the random +variables are modeled as vertices in the DAG, and it is stipulated that every +random variable is independent of its ancestors conditioned on its parents. It +is possible, however, for two different causal DAGs on the same set of random +variables to encode exactly the same set of conditional dependencies. Such +causal DAGs are said to be Markov equivalent, and equivalence classes of Markov +equivalent DAGs are known as Markov Equivalent Classes (MECs). Beautiful +combinatorial characterizations of MECs have been developed in the past few +decades, and it is known, in particular that all DAGs in the same MEC must have +the same "skeleton" (underlying undirected graph) and v-structures (induced +subgraph of the form $a\rightarrow b \leftarrow c$). + These combinatorial characterizations also suggest several natural +algorithmic questions. One of these is: given an undirected graph $G$ as input, +how many distinct Markov equivalence classes have the skeleton $G$? Much work +has been devoted in the last few years to this and other closely related +problems. However, to the best of our knowledge, a polynomial time algorithm +for the problem remains unknown. + In this paper, we make progress towards this goal by giving a fixed parameter +tractable algorithm for the above problem, with the parameters being the +treewidth and the maximum degree of the input graph $G$. The main technical +ingredient in our work is a construction we refer to as shadow, which lets us +create a "local description" of long-range constraints imposed by the +combinatorial characterizations of MECs. + +
+
+ comment: 75 pages, 2 Figures +
+
+
+
+
+ + ♻ ☆ GLADformer: A Mixed Perspective for Graph-level Anomaly Detection + + +
+ Graph-Level Anomaly Detection (GLAD) aims to distinguish anomalous graphs +within a graph dataset. However, current methods are constrained by their +receptive fields, struggling to learn global features within the graphs. +Moreover, most contemporary methods are based on spatial domain and lack +exploration of spectral characteristics. In this paper, we propose a +multi-perspective hybrid graph-level anomaly detector namely GLADformer, +consisting of two key modules. Specifically, we first design a Graph +Transformer module with global spectrum enhancement, which ensures balanced and +resilient parameter distributions by fusing global features and spectral +distribution characteristics. Furthermore, to uncover local anomalous +attributes, we customize a band-pass spectral GNN message passing module that +further enhances the model's generalization capability. Through comprehensive +experiments on ten real-world datasets from multiple domains, we validate the +effectiveness and robustness of GLADformer. This demonstrates that GLADformer +outperforms current state-of-the-art models in graph-level anomaly detection, +particularly in effectively capturing global anomaly representations and +spectral characteristics. + +
+
+
+
+
+ + ♻ ☆ Learnability in Online Kernel Selection with Memory Constraint via + Data-dependent Regret Analysis + + +
+ Online kernel selection is a fundamental problem of online kernel methods.
+In this paper, we study online kernel selection with memory constraint, in
+which the memory of the kernel selection and online prediction procedures is
+limited to a fixed budget. An essential question is: what is the intrinsic
+relationship among online learnability, memory constraint, and data
+complexity? To answer the question, it is necessary to show the trade-offs
+between regret and memory constraint. Previous work gives a worst-case lower
+bound depending on the data size, and shows that learning is impossible
+within a small memory constraint. In contrast, we present distinct results by
+offering data-dependent upper bounds that rely on two data complexities:
+kernel alignment and the cumulative losses of a competitive hypothesis. We
+propose an algorithmic framework giving data-dependent upper bounds for two
+types of loss functions. For the hinge loss function, our algorithm achieves
+an expected upper bound depending on kernel alignment. For smooth loss
+functions, our algorithm achieves a high-probability upper bound depending on
+the cumulative losses of a competitive hypothesis. We also prove a matching
+lower bound for smooth loss functions. Our results show that if the two data
+complexities are sub-linear, then learning is possible within a small memory
+constraint. Our algorithmic framework depends on a new buffer maintenance
+framework and a reduction from online kernel selection to prediction with
+expert advice. Finally, we empirically verify the prediction performance of
+our algorithms on benchmark datasets.
+
+</p>
+
+
+
+
+ + ♻ ☆ Understanding the Expressive Power and Mechanisms of Transformer for + Sequence Modeling + + +
+ We conduct a systematic study of the approximation properties of Transformer +for sequence modeling with long, sparse and complicated memory. We investigate +the mechanisms through which different components of Transformer, such as the +dot-product self-attention, positional encoding and feed-forward layer, affect +its expressive power, and we study their combined effects through establishing +explicit approximation rates. Our study reveals the roles of critical +parameters in the Transformer, such as the number of layers and the number of +attention heads. These theoretical insights are validated experimentally and +offer natural suggestions for alternative architectures. + +
+
+ comment: 70 pages +
+
+
+
+
+ + ♻ ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +process lacks transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +require annotations or additional training data. The injection of the extracted +knowledge can be achieved by the addition of simple neural modules. We employ +the Convex Polytopic Model (CPM) as a feature extraction tool for DST tasks and +illustrate that the acquired features correlate with syntactic and semantic +patterns in the dialogues. This correlation facilitates a comprehensive +understanding of the linguistic features influencing the DST model's +decision-making process. We benchmark this framework on various DST tasks and +observe a notable improvement in accuracy. + +
+
+ comment: Accepted for publication at IEEE Access +
+
+
+
+
+ + ♻ ☆ One Fits All: Learning Fair Graph Neural Networks for Various Sensitive + Attributes KDD 2024 + + +
+ Recent studies have highlighted fairness issues in Graph Neural Networks +(GNNs), where they produce discriminatory predictions against specific +protected groups categorized by sensitive attributes such as race and age. +While various efforts to enhance GNN fairness have made significant progress, +these approaches are often tailored to specific sensitive attributes. +Consequently, they necessitate retraining the model from scratch to accommodate +changes in the sensitive attribute requirement, resulting in high computational +costs. To gain deeper insights into this issue, we approach the graph fairness +problem from a causal modeling perspective, where we identify the confounding +effect induced by the sensitive attribute as the underlying reason. Motivated +by this observation, we formulate the fairness problem in graphs from an +invariant learning perspective, which aims to learn invariant representations +across environments. Accordingly, we propose a graph fairness framework based +on invariant learning, namely FairINV, which enables the training of fair GNNs +to accommodate various sensitive attributes within a single training session. +Specifically, FairINV incorporates sensitive attribute partition and trains +fair GNNs by eliminating spurious correlations between the label and various +sensitive attributes. Experimental results on several real-world datasets +demonstrate that FairINV significantly outperforms state-of-the-art fairness +approaches, underscoring its effectiveness. Our code is available via: +https://github.com/ZzoomD/FairINV/. + +
+
+ comment: Accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT + + +
+ Generative Pre-trained Transformer (GPT) models have achieved remarkable
+performance on various natural language processing tasks, and have shown
+great potential as backbones for audio-and-text large language models (LLMs).
+Previous mainstream audio-and-text LLMs use discrete audio tokens to
+represent both input and output audio; however, they suffer from performance
+degradation on tasks such as automatic speech recognition, speech-to-text
+translation, and speech enhancement over models using continuous speech
+features. In this paper, we propose LauraGPT, a novel unified audio-and-text
+GPT-based LLM for audio recognition, understanding, and generation. LauraGPT
+is a versatile LLM that can process both audio and text inputs and generate
+outputs in either modality. We propose a novel data representation that
+combines continuous and discrete features for audio: LauraGPT encodes input
+audio into continuous representations using an audio encoder and generates
+output audio from discrete codec codes. We propose a one-step codec vocoder
+to overcome the prediction challenge caused by the multimodal distribution of
+codec tokens. We fine-tune LauraGPT using supervised multi-task learning.
+Extensive experiments show that LauraGPT consistently achieves performance
+comparable or superior to strong baselines on a wide range of audio tasks
+related to content, semantics, paralinguistics, and audio-signal analysis,
+such as automatic speech recognition, speech-to-text translation,
+text-to-speech synthesis, speech enhancement, automated audio captioning,
+speech emotion recognition, and spoken language understanding.
+
+</p>
+
+ comment: 10 pages, work in progress +
+
+
+
+
+ + ♻ ☆ Instance Temperature Knowledge Distillation + + +
+ Knowledge distillation (KD) enhances the performance of a student network by
+allowing it to learn the knowledge transferred from a teacher network
+incrementally. Existing methods dynamically adjust the temperature to enable
+the student network to adapt to the varying learning difficulties at
+different learning stages of KD. KD is a continuous process, but when
+adjusting the temperature, these methods consider only the immediate benefits
+of the operation in the current learning phase and fail to take into account
+its future returns. To address this issue, we formulate the adjustment of
+temperature as a sequential decision-making task and propose a method based
+on reinforcement learning, termed RLKD. Importantly, we design a novel state
+representation to enable the agent to make more informed actions (i.e.,
+instance temperature adjustments). To handle the problem of delayed rewards
+in our method due to the KD setting, we explore an instance reward
+calibration approach. In addition, we devise an efficient exploration
+strategy that enables the agent to learn a valuable instance temperature
+adjustment policy more efficiently. Our framework can serve as a
+plug-and-play technique to be inserted into various KD methods easily, and we
+validate its effectiveness on both image classification and object detection
+tasks. Our project is at https://www.zayx.me/ITKD.github.io/.
+
+</p>
+
+
+
+
+ + ♻ ☆ DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs + + +
+ Dynamic graphs are pervasive in the real world, modeling dynamic relations +between objects across various fields. For dynamic graph modeling, dynamic +graph neural networks (DGNNs) have emerged as a mainstream technique, which are +generally pre-trained on the link prediction task, leaving a significant gap +from the objectives of downstream tasks such as node classification. To bridge +the gap, prompt-based learning has gained traction on graphs. However, existing +efforts focus on static graphs, neglecting the evolution of dynamic graphs. In +this paper, we propose DyGPrompt, a novel pre-training and prompting framework +for dynamic graph modeling. First, we design dual prompts to address the gap in +both task objectives and dynamic variations across pre-training and downstream +tasks. Second, we recognize that node and time features mutually characterize +each other, and propose dual condition-nets to model the evolving node-time +patterns in downstream tasks. Finally, we thoroughly evaluate and analyze +DyGPrompt through extensive experiments on three public datasets. + +
+
+ comment: Under review +
+
+
+
+
+
+
+
+ + Multimedia 13 + +
+
+
+ + ☆ MuDiT & MuSiT: Alignment with Colloquial Expression in + Description-to-Song Generation + + +
+ Amid the rising intersection of generative AI and human artistic processes, +this study probes the critical yet less-explored terrain of alignment in +human-centric automatic song composition. We propose a novel task of Colloquial +Description-to-Song Generation, which focuses on aligning the generated content +with colloquial human expressions. This task is aimed at bridging the gap +between colloquial language understanding and auditory expression within an AI +model, with the ultimate goal of creating songs that accurately satisfy human +auditory expectations and structurally align with musical norms. Current +datasets are limited due to their narrow descriptive scope, semantic gaps and +inaccuracies. To overcome data scarcity in this domain, we present the Caichong +Music Dataset (CaiMD). CaiMD is manually annotated by both professional +musicians and amateurs, offering diverse perspectives and a comprehensive +understanding of colloquial descriptions. Unlike existing datasets pre-set with +expert annotations or auto-generated ones with inherent biases, CaiMD caters +more sufficiently to our purpose of aligning AI-generated music with widespread +user-desired results. Moreover, we propose an innovative single-stage framework +called MuDiT/MuSiT for enabling effective human-machine alignment in song +creation. This framework not only achieves cross-modal comprehension between +colloquial language and auditory music perceptions but also ensures generated +songs align with user-desired results. MuDiT/MuSiT employs one DiT/SiT model +for end-to-end generation of musical components like melody, harmony, rhythm, +vocals, and instrumentation. The approach ensures harmonious sonic cohesiveness +amongst all generated musical components, facilitating better resonance with +human auditory expectations. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ Relating CNN-Transformer Fusion Network for Change Detection + + +
+ While deep learning, particularly convolutional neural networks (CNNs), has +revolutionized remote sensing (RS) change detection (CD), existing approaches +often miss crucial features due to neglecting global context and incomplete +change learning. Additionally, transformer networks struggle with low-level +details. RCTNet addresses these limitations by introducing \textbf{(1)} an +early fusion backbone to exploit both spatial and temporal features early on, +\textbf{(2)} a Cross-Stage Aggregation (CSA) module for enhanced temporal +representation, \textbf{(3)} a Multi-Scale Feature Fusion (MSF) module for +enriched feature extraction in the decoder, and \textbf{(4)} an Efficient +Self-deciphering Attention (ESA) module utilizing transformers to capture +global information and fine-grained details for accurate change detection. +Extensive experiments demonstrate RCTNet's clear superiority over traditional +RS image CD methods, showing significant improvement and an optimal balance +between accuracy and computational cost. + +
+
+ comment: accepted by IEEE Conference on Multimedia Expo +
+
+
+
+
+ + ☆ Design of a UE5-based digital twin platform + + +
+ Because the learning and construction costs of current mainstream 3D scene
+engines are too high, this thesis proposes a digital twin platform design
+scheme based on Unreal Engine 5 (UE5). It aims to provide a universal
+platform construction and design process that effectively reduces the
+learning cost of large-scale scene construction. Taking an actual project of
+one organization as an example, the full cycle of platform construction is
+explained, and the digital twin and data visualization technologies and
+applications based on UE5 are analyzed. By distilling the project
+implementation into a process-based approach, the standardization and
+operability of the process pathway are improved.
+
+</p>
+
+
+
+
+ + ☆ KeyVideoLLM: Towards Large-scale Video Keyframe Selection + + +
+ Recently, with the rise of web videos, managing and understanding large-scale +video datasets has become increasingly important. Video Large Language Models +(VideoLLMs) have emerged in recent years due to their strong video +understanding capabilities. However, training and inference processes for +VideoLLMs demand vast amounts of data, presenting significant challenges to +data management, particularly regarding efficiency, robustness, and +effectiveness. In this work, we present KeyVideoLLM, a text-video frame +similarity-based keyframe selection method designed to manage VideoLLM data +efficiently, robustly, and effectively. Specifically, KeyVideoLLM achieves a +remarkable data compression rate of up to 60.9 times, substantially lowering +disk space requirements, which proves its high efficiency. Additionally, it +maintains a 100% selection success rate across all video formats and scales, +enhances processing speed by up to 200 times compared to existing keyframe +selection methods, and does not require hyperparameter tuning. Beyond its +outstanding efficiency and robustness, KeyVideoLLM further improves model +performance in video question-answering tasks during both training and +inference stages. Notably, it consistently achieved the state-of-the-art (SoTA) +experimental results on diverse datasets. + +
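+
+<p>The sketch below shows the basic text-frame similarity selection step the
+abstract describes: embed the query text and every frame, score frames by
+cosine similarity, and keep the top-k as keyframes. The random vectors stand
+in for real CLIP-style text and image encoders, and k is an arbitrary
+choice.</p>
+
+<pre>
+# Keyframe selection by text-frame similarity: keep the k frames whose
+# embeddings are most similar to the query text embedding.
+import numpy as np
+
+rng = np.random.default_rng(0)
+num_frames, dim, k = 120, 512, 8
+
+frame_embeddings = rng.standard_normal((num_frames, dim))  # stand-in image encoder
+text_embedding = rng.standard_normal(dim)                  # stand-in text encoder
+
+def normalize(x):
+    return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+similarity = normalize(frame_embeddings) @ normalize(text_embedding)
+keyframe_ids = np.argsort(similarity)[-k:][::-1]           # k most relevant frames
+
+print("selected keyframes:", sorted(keyframe_ids.tolist()))
+</pre>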
+
+
+
+
+ + ☆ Differentially Processed Optimized Collaborative Rich Text Editor + + +
+ A collaborative real-time text editor is an application that allows multiple +users to edit a document simultaneously and merge their contributions +automatically. It can be made collaborative by implementing a conflict +resolution algorithm either on the client side (in peer-to-peer collaboration) +or on the server side (when using web sockets and a central server to monitor +state changes). Although web sockets are ideal for real-time text editors, +using multiple collaborative editors on one connection can create problems. +This is because a single web connection cannot monitor which user is +collaborating on which application state, leading to unnecessary network +queries and data being delivered to the wrong state. To address this issue, the +current solution is to open multiple web socket connections, with one web +socket per collaboration application. However, this can add significant +overhead proportional to the number of apps utilized. In this study, we +demonstrate an algorithm that enables using a single web socket for multiple +collaborative applications in a collaborative editor. Our method involves +modifying the socket's code to track which application's shared state is being +worked on and by whom. This allows for the simultaneous collaboration of +multiple states in real-time, with infinite users, without opening a different +socket for each application. Our optimized editor showed an efficiency +improvement of over 96% in access time duration. This approach can be +implemented in other collaborative editors and web applications with similar +architecture to improve performance and eliminate issues arising from network +overload. + +
+
+
+
+
+ + ☆ Contrast then Memorize: Semantic Neighbor Retrieval-Enhanced Inductive + Multimodal Knowledge Graph Completion SIGIR 2024 + + +
+ A large number of studies have emerged for Multimodal Knowledge Graph
+Completion (MKGC) to predict the missing links in MKGs. However, fewer
+studies have been proposed to study inductive MKGC (IMKGC), which involves
+emerging entities unseen during training. Existing inductive approaches focus
+on learning textual entity representations, which neglect the rich semantic
+information in the visual modality. Moreover, they focus on aggregating
+structural neighbors from existing KGs, which are usually limited for
+emerging entities. However, the semantic neighbors are decoupled from the
+topology linkage and usually imply the true target entity. In this paper, we
+propose the IMKGC task and a semantic neighbor retrieval-enhanced IMKGC
+framework CMR, where contrastive learning first brings helpful semantic
+neighbors close, and memorization then supports semantic neighbor retrieval
+to enhance inference. Specifically, we first propose a unified cross-modal
+contrastive learning to simultaneously capture the textual-visual and
+textual-textual correlations of query-entity pairs in a unified
+representation space. The contrastive learning increases the similarity of
+positive query-entity pairs, therefore making the representations of helpful
+semantic neighbors close. Then, we explicitly memorize the knowledge
+representations to support semantic neighbor retrieval. At test time, we
+retrieve the nearest semantic neighbors and interpolate them with the
+query-entity similarity distribution to augment the final prediction.
+Extensive experiments validate the effectiveness of CMR on three inductive
+MKGC datasets. Codes are available at https://github.com/OreOZhao/CMR.
+
+</p>
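+
+<p>The toy snippet below illustrates the test-time step described above:
+retrieve the nearest memorized neighbors of a query and interpolate a
+neighbor-induced distribution over candidate entities with the model's own
+query-entity similarity distribution. The embeddings, temperature, and
+interpolation weight are stand-ins, not the paper's trained components.</p>
+
+<pre>
+# Retrieval-augmented inference: mix a kNN-derived entity distribution with
+# the model's own query-entity similarity distribution.
+import torch
+
+torch.manual_seed(0)
+num_entities, dim, k, lam = 50, 32, 5, 0.3
+
+normalize = torch.nn.functional.normalize
+entity_embs = normalize(torch.randn(num_entities, dim), dim=-1)
+memory_keys = normalize(torch.randn(200, dim), dim=-1)    # memorized query reps
+memory_targets = torch.randint(0, num_entities, (200,))   # their target entities
+query = normalize(torch.randn(dim), dim=-1)
+
+# Model's own query-entity similarity distribution.
+model_dist = torch.softmax(entity_embs @ query / 0.1, dim=-1)
+
+# Distribution induced by the k nearest memorized neighbors.
+sims, idx = torch.topk(memory_keys @ query, k)
+neighbor_dist = torch.zeros(num_entities)
+neighbor_dist.scatter_add_(0, memory_targets[idx], torch.softmax(sims / 0.1, dim=-1))
+
+final_dist = lam * neighbor_dist + (1.0 - lam) * model_dist
+print("predicted entity:", int(final_dist.argmax()))
+</pre>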
+
+ comment: Accepted by SIGIR 2024 +
+
+
+
+
+ + ☆ Game-Based Discovery: Harnessing Mini-Games within Primary Games for + Scientific Data Collection and Problem Solving + + +
+ In the popular video game Batman: Arkham Knight, produced by Rocksteady +Studios and released in 2015, the primary protagonist of the game is Batman, a +vigilante dressed as a bat, fighting crime from the shadows in the fictitious +city of Gotham. The game involves a real-world player who takes up the role of +Batman to solve a peculiar side mission wherein they have to reconstruct the +clean DNA sequence of a human and separate it from mutant DNA to manufacture an +antidote to cure the villain. Although this is undoubtedly a fascinating part +of the game, one that was absent in previous Batman games, it showcases an +interesting notion of using mini-games embedded within primary games to achieve +a particular real-world research objective. Although the DNA data used in this +case was not real, there are multiple such instances in video games where +mini-games have been used for an underlying motive besides entertainment. Based +on popular case studies incorporating a similar method, this study +characterizes the methodology of designing mini-games within primary games for +research purposes into a descriptive framework, highlighting the process's +advantages and limitations. It is concluded that these mini-games not only +facilitate a deeper understanding of complex scientific concepts but also +accelerate data processing and analysis by leveraging crowd-sourced human +intuition and pattern recognition capabilities. This paper argues for +strategically incorporating miniaturized, gamified elements into established +video games that are mainly intended for recreational purposes. + +
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ OpenVNA: A Framework for Analyzing the Behavior of Multimodal Language + Understanding System under Noisy Scenarios ACL 2024 + + +
+ We present OpenVNA, an open-source framework designed for analyzing the
+behavior of multimodal language understanding systems under noisy conditions.
+OpenVNA serves as an intuitive toolkit tailored for researchers, facilitating
+convenient batch-level robustness evaluation and on-the-fly instance-level
+demonstration. It primarily features a benchmark Python library for assessing
+global model robustness, offering high flexibility and extensibility, thereby
+enabling customization with user-defined noise types and models.
+Additionally, a GUI-based interface has been developed to intuitively analyze
+local model behavior. In this paper, we delineate the design principles and
+utilization of the created library and GUI-based web platform. Currently,
+OpenVNA is publicly accessible at \url{https://github.com/thuiar/OpenVNA},
+with a demonstration video available at \url{https://youtu.be/0Z9cW7RGct4}.
+
+</p>
+
+ comment: 10 pages, 4 figures, to be published in ACL 2024 System Demonstration + Track +
+
+
+
+
+ + ☆ Multi-Task Decision-Making for Multi-User 360 Video Processing over + Wireless Networks + + +
+ We study a multi-task decision-making problem for 360 video processing in a
+wireless multi-user virtual reality (VR) system that includes an edge
+computing unit (ECU) to deliver 360 videos to VR users and offer computing
+assistance for decoding/rendering of video frames. However, this comes at the
+expense of increased data volume and required bandwidth. To balance this
+trade-off, we formulate a constrained quality of experience (QoE)
+maximization problem in which the rebuffering time and quality variation
+between video frames are bounded by user and video requirements. To solve the
+formulated multi-user QoE maximization, we leverage deep reinforcement
+learning (DRL) for multi-task rate adaptation and computation distribution
+(MTRC). The proposed MTRC approach does not rely on any predefined assumption
+about the environment and relies on video playback statistics (i.e., past
+throughput, decoding time, transmission time, etc.), video information, and
+the resulting performance to adjust the video bitrate and computation
+distribution. We train MTRC with real-world wireless network traces and 360
+video datasets to obtain evaluation results in terms of the average QoE, peak
+signal-to-noise ratio (PSNR), rebuffering time, and quality variation. Our
+results indicate that MTRC improves the users' QoE compared to a
+state-of-the-art rate adaptation algorithm. Specifically, we show a 5.97 dB
+to 6.44 dB improvement in PSNR, a 1.66X to 4.23X improvement in rebuffering
+time, and a 4.21 dB to 4.35 dB improvement in quality variation.
+
+</p>
+
+ comment: 2024 IEEE International Conference on Multimedia Information + Processing and Retrieval (MIPR) +
+
+
+
+
+ + ♻ ☆ Multimodal Pretraining, Adaptation, and Generation for Recommendation: A + Survey KDD 2024 + + +
+ Personalized recommendation serves as a ubiquitous channel for users to +discover information tailored to their interests. However, traditional +recommendation models primarily rely on unique IDs and categorical features for +user-item matching, potentially overlooking the nuanced essence of raw item +contents across multiple modalities such as text, image, audio, and video. This +underutilization of multimodal data poses a limitation to recommender systems, +especially in multimedia services like news, music, and short-video platforms. +The recent advancements in large multimodal models offer new opportunities and +challenges in developing content-aware recommender systems. This survey seeks +to provide a comprehensive exploration of the latest advancements and future +trajectories in multimodal pretraining, adaptation, and generation techniques, +as well as their applications in enhancing recommender systems. Furthermore, we +discuss current open challenges and opportunities for future research in this +dynamic domain. We believe that this survey, alongside the curated resources, +will provide valuable insights to inspire further advancements in this evolving +landscape. + +
+
+ comment: Accepted by KDD 2024. See our tutorial materials at + https://mmrec.github.io +
+
+
+
+
+ + ♻ ☆ Video Watermarking: Safeguarding Your Video from (Unauthorized) + Annotations by Video-based LLMs + + +
+ The advent of video-based Large Language Models (LLMs) has significantly +enhanced video understanding. However, it has also raised some safety concerns +regarding data protection, as videos can be more easily annotated, even without +authorization. This paper introduces Video Watermarking, a novel technique to +protect videos from unauthorized annotations by such video-based LLMs, +especially concerning the video content and description, in response to +specific queries. By imperceptibly embedding watermarks into key video frames +with multi-modal flow-based losses, our method preserves the viewing experience +while preventing misuse by video-based LLMs. Extensive experiments show that +Video Watermarking significantly reduces the comprehensibility of videos with +various video-based LLMs, demonstrating both stealth and robustness. In +essence, our method provides a solution for securing video content, ensuring +its integrity and confidentiality in the face of evolving video-based LLMs +technologies. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.13507 +
+
+
+
+
+ + ♻ ☆ Deep learning for 3D human pose estimation and mesh recovery: A survey + + +
+ 3D human pose estimation and mesh recovery have attracted widespread research +interest in many areas, such as computer vision, autonomous driving, and +robotics. Deep learning on 3D human pose estimation and mesh recovery has +recently thrived, with numerous methods proposed to address different problems +in this area. In this paper, to stimulate future research, we present a +comprehensive review of recent progress over the past five years in deep +learning methods for this area by delving into over 200 references. To the best +of our knowledge, this survey is arguably the first to comprehensively cover +deep learning methods for 3D human pose estimation, including both +single-person and multi-person approaches, as well as human mesh recovery, +encompassing methods based on explicit models and implicit representations. We +also present comparative results on several publicly available datasets, +together with insightful observations and inspiring future research directions. +A regularly updated project page can be found at +https://github.com/liuyangme/SOTA-3DHPE-HMR. + +
+
+
+
+
+ + ♻ ☆ LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT + + +
+ Generative Pre-trained Transformer (GPT) models have achieved remarkable
+performance on various natural language processing tasks, and have shown
+great potential as backbones for audio-and-text large language models (LLMs).
+Previous mainstream audio-and-text LLMs use discrete audio tokens to
+represent both input and output audio; however, they suffer from performance
+degradation on tasks such as automatic speech recognition, speech-to-text
+translation, and speech enhancement over models using continuous speech
+features. In this paper, we propose LauraGPT, a novel unified audio-and-text
+GPT-based LLM for audio recognition, understanding, and generation. LauraGPT
+is a versatile LLM that can process both audio and text inputs and generate
+outputs in either modality. We propose a novel data representation that
+combines continuous and discrete features for audio: LauraGPT encodes input
+audio into continuous representations using an audio encoder and generates
+output audio from discrete codec codes. We propose a one-step codec vocoder
+to overcome the prediction challenge caused by the multimodal distribution of
+codec tokens. We fine-tune LauraGPT using supervised multi-task learning.
+Extensive experiments show that LauraGPT consistently achieves performance
+comparable or superior to strong baselines on a wide range of audio tasks
+related to content, semantics, paralinguistics, and audio-signal analysis,
+such as automatic speech recognition, speech-to-text translation,
+text-to-speech synthesis, speech enhancement, automated audio captioning,
+speech emotion recognition, and spoken language understanding.
+
+</p>
+
+ comment: 10 pages, work in progress +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 143 + +
+
+
+ + ☆ MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via + Dynamic Sparse Attention + + +
+ The computational challenges of Large Language Model (LLM) inference remain
+a significant barrier to their widespread deployment, especially as prompt
+lengths continue to increase. Due to the quadratic complexity of the
+attention computation, it takes 30 minutes for an 8B LLM to process a prompt
+of 1M tokens (i.e., the pre-filling stage) on a single A100 GPU. Existing
+methods for speeding up pre-filling often fail to maintain acceptable
+accuracy or efficiency when applied to long-context LLMs. To address this
+gap, we introduce MInference (Million-tokens Inference), a sparse calculation
+method designed to accelerate the pre-filling stage of long-sequence
+processing. Specifically, we identify three unique patterns in long-context
+attention matrices, namely the A-shape, Vertical-Slash, and Block-Sparse
+patterns, that can be leveraged for efficient sparse computation on GPUs. We
+determine the optimal pattern for each attention head offline and dynamically
+build sparse indices based on the assigned pattern during inference. With the
+pattern and sparse indices, we perform efficient sparse attention
+calculations via our optimized GPU kernels to significantly reduce the
+latency in the pre-filling stage of long-context LLMs. Our proposed technique
+can be directly applied to existing LLMs without any modifications to the
+pre-training setup or additional fine-tuning. By evaluating on a wide range
+of downstream tasks, including InfiniteBench, RULER, PG-19, and Needle In A
+Haystack, and models including LLaMA-3-1M, GLM4-1M, Yi-200K, Phi-3-128K, and
+Qwen2-128K, we demonstrate that MInference effectively reduces inference
+latency by up to 10x for pre-filling on an A100, while maintaining accuracy.
+Our code is available at https://aka.ms/MInference.
+
+</p>
+
+
+
+
+ + ☆ Neurocache: Efficient Vector Retrieval for Long-range Language Modeling NAACL'24 + + +
+ This paper introduces Neurocache, an approach to extend the effective context +size of large language models (LLMs) using an external vector cache to store +its past states. Like recent vector retrieval approaches, Neurocache uses an +efficient k-nearest-neighbor (kNN) algorithm to retrieve relevant past states +and incorporate them into the attention process. Neurocache improves upon +previous methods by (1) storing compressed states, which reduces cache size; +(2) performing a single retrieval operation per token which increases inference +speed; and (3) extending the retrieval window to neighboring states, which +improves both language modeling and downstream task accuracy. Our experiments +show the effectiveness of Neurocache both for models trained from scratch and +for pre-trained models such as Llama2-7B and Mistral-7B when enhanced with the +cache mechanism. We also compare Neurocache with text retrieval methods and +show improvements in single-document question-answering and few-shot learning +tasks. We made the source code available under: +https://github.com/alisafaya/neurocache + +
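+
+<p>The toy sketch below mirrors the cache-and-retrieve mechanism described
+above: past hidden states are compressed by a projection, stored in an
+external cache, and for each new token the k nearest cached states (plus
+their immediate neighbors) are retrieved for attention. All sizes and the
+projection are arbitrary stand-ins for the model's learned components.</p>
+
+<pre>
+# Retrieve the k nearest compressed past states (and their neighbors) for the
+# current hidden state, as candidates for the attention step.
+import torch
+
+torch.manual_seed(0)
+hidden_dim, compressed_dim, k, window = 64, 16, 4, 1
+
+compress = torch.nn.Linear(hidden_dim, compressed_dim, bias=False)
+cache = compress(torch.randn(1000, hidden_dim)).detach()  # compressed past states
+
+def retrieve(query_state):
+    query = compress(query_state)            # compress the current state too
+    scores = cache @ query                   # similarity to every cached state
+    top = torch.topk(scores, k).indices
+    # Extend the retrieval window to neighboring cache positions as well.
+    idx = torch.unique(torch.clamp(
+        top.unsqueeze(1) + torch.arange(-window, window + 1),
+        0, cache.size(0) - 1))
+    return cache[idx]                        # states handed to attention
+
+retrieved = retrieve(torch.randn(hidden_dim))
+print("retrieved cache entries:", tuple(retrieved.shape))
+</pre>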
+
+ comment: Long paper, published at the main conference NAACL'24 +
+
+
+
+
+ + ☆ RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in + LLMs + + +
+ Large language models (LLMs) typically utilize the top-k contexts from a +retriever in retrieval-augmented generation (RAG). In this work, we propose a +novel instruction fine-tuning framework RankRAG, which instruction-tunes a +single LLM for the dual purpose of context ranking and answer generation in +RAG. In particular, the instruction-tuned LLMs work surprisingly well by adding +a small fraction of ranking data into the training blend, and outperform +existing expert ranking models, including the same LLM exclusively fine-tuned +on a large amount of ranking data. For generation, we compare our model with +many strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and +ChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG +benchmarks. Specifically, our Llama3-RankRAG significantly outperforms +Llama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In +addition, it also performs comparably to GPT-4 on five RAG benchmarks in the +biomedical domain without instruction fine-tuning on biomedical data, +demonstrating its superb capability for generalization to new domains. + +
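+
+ Although RankRAG trains a single instruction-tuned model for both steps, the
+rank-then-generate flow it unifies can be sketched with a generic
+text-in/text-out callable, as below. The prompts and the 0-10 scoring
+convention are placeholders, not RankRAG's instruction templates.
+
+def rank_then_answer(question, contexts, llm, keep=3):
+    # Score each retrieved context for relevance, then answer from the top ones.
+    def relevance(ctx):
+        reply = llm(
+            f"Question: {question}\nPassage: {ctx}\n"
+            "On a scale of 0-10, how useful is this passage for answering? "
+            "Reply with a single number."
+        )
+        digits = "".join(ch for ch in reply if ch.isdigit())
+        return int(digits) if digits else 0
+    ranked = sorted(contexts, key=relevance, reverse=True)[:keep]
+    context_block = "\n\n".join(ranked)
+    return llm("Answer the question using only the passages below.\n\n"
+               f"{context_block}\n\nQuestion: {question}\nAnswer:")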
+
+
+
+
+ + ☆ MMedAgent: Learning to Use Medical Tools with Multi-modal Agent + + +
+ Multi-Modal Large Language Models (MLLMs), despite being successful, exhibit +limited generality and often fall short when compared to specialized models. +Recently, LLM-based agents have been developed to address these challenges by +selecting appropriate specialized models as tools based on user inputs. +However, such advancements have not been extensively explored within the +medical domain. To bridge this gap, this paper introduces the first agent +explicitly designed for the medical field, named \textbf{M}ulti-modal +\textbf{Med}ical \textbf{Agent} (MMedAgent). We curate an instruction-tuning +dataset comprising six medical tools solving seven tasks, enabling the agent to +choose the most suitable tools for a given task. Comprehensive experiments +demonstrate that MMedAgent achieves superior performance across a variety of +medical tasks compared to state-of-the-art open-source methods and even the +closed-source model, GPT-4o. Furthermore, MMedAgent exhibits efficiency in +updating and integrating new medical tools. + +
+
+
+
+
+ + ☆ Understanding Alignment in Multimodal LLMs: A Comprehensive Study + + +
+ Preference alignment has become a crucial component in enhancing the +performance of Large Language Models (LLMs), yet its impact in Multimodal Large +Language Models (MLLMs) remains comparatively underexplored. Similar to +language models, MLLMs for image understanding tasks encounter challenges like +hallucination. In MLLMs, hallucination can occur not only by stating incorrect +facts but also by producing responses that are inconsistent with the image +content. A primary objective of alignment for MLLMs is to encourage these +models to align responses more closely with image information. Recently, +multiple works have introduced preference datasets for MLLMs and examined +different alignment methods, including Direct Preference Optimization (DPO) and +Proximal Policy Optimization (PPO). However, due to variations in datasets, +base model types, and alignment methods, it remains unclear which specific +elements contribute most significantly to the reported improvements in these +works. In this paper, we independently analyze each aspect of preference +alignment in MLLMs. We start by categorizing the alignment algorithms into two +groups, offline (such as DPO), and online (such as online-DPO), and show that +combining offline and online methods can improve the performance of the model +in certain scenarios. We review a variety of published multimodal preference +datasets and discuss how the details of their construction impact model +performance. Based on these insights, we introduce a novel way of creating +multimodal preference data called Bias-Driven Hallucination Sampling (BDHS) +that needs neither additional annotation nor external models, and show that it +can achieve competitive performance to previously published alignment work for +multimodal models across a range of benchmarks. + +
+
+
+
+
+ + ☆ ValueScope: Unveiling Implicit Norms and Values via Return Potential + Model of Social Interactions + + +
+ This study introduces ValueScope, a framework leveraging language models to +quantify social norms and values within online communities, grounded in social +science perspectives on normative structures. We employ ValueScope to dissect +and analyze linguistic and stylistic expressions across 13 Reddit communities +categorized under gender, politics, science, and finance. Our analysis provides +a quantitative foundation showing that even closely related communities exhibit +remarkably diverse norms. This diversity supports existing theories and adds a +new dimension--community preference--to understanding community interactions. +ValueScope not only delineates differing social norms among communities but +also effectively traces their evolution and the influence of significant +external events like the U.S. presidential elections and the emergence of new +sub-communities. The framework thus highlights the pivotal role of social norms +in shaping online interactions, presenting a substantial advance in both the +theory and application of social norm studies in digital spaces. + +
+
+ comment: First three authors contributed equally. 33 pages. In submission +
+
+
+
+
+ + ☆ Ensemble of pre-trained language models and data augmentation for hate + speech detection from Arabic tweets + + +
+ Today, hate speech classification from Arabic tweets has drawn the attention
+of several researchers. Many systems and techniques have been developed to
+resolve this classification task. Nevertheless, two of the major challenges
+faced in this context are the limited performance and the problem of imbalanced
+data. In this study, we propose a novel approach that leverages ensemble
+learning and semi-supervised learning based on previously manually labeled
+data. We conducted experiments on a benchmark dataset by classifying Arabic
+tweets into 5 distinct classes: non-hate, general hate, racial, religious, or
+sexism. Experimental results show that: (1) ensemble learning based on
+pre-trained language models outperforms existing related works; (2) our
+proposed data augmentation improves the accuracy of hate speech detection from
+Arabic tweets and outperforms existing related works. Our main contribution is
+the achievement of encouraging results in Arabic hate speech detection.
+
+
+
+
+
+ + ☆ Predicting vs. Acting: A Trade-off Between World Modeling & Agent + Modeling + + +
+ RLHF-aligned LMs have shown unprecedented ability on both benchmarks and +long-form text generation, yet they struggle with one foundational task: +next-token prediction. As RLHF models become agent models aimed at interacting +with humans, they seem to lose their world modeling -- the ability to predict +what comes next in arbitrary documents, which is the foundational training +objective of the Base LMs that RLHF adapts. + Besides empirically demonstrating this trade-off, we propose a potential +explanation: to perform coherent long-form generation, RLHF models restrict +randomness via implicit blueprints. In particular, RLHF models concentrate +probability on sets of anchor spans that co-occur across multiple generations +for the same prompt, serving as textual scaffolding but also limiting a model's +ability to generate documents that do not include these spans. We study this +trade-off on the most effective current agent models, those aligned with RLHF, +while exploring why this may remain a fundamental trade-off between models that +act and those that predict, even as alignment techniques improve. + +
+
+
+
+
+ + ☆ Evaluating the Robustness of Adverse Drug Event Classification Models + Using Templates ACL + + +
+ An adverse drug effect (ADE) is any harmful event resulting from medical drug +treatment. Despite their importance, ADEs are often under-reported in official +channels. Some research has therefore turned to detecting discussions of ADEs +in social media. Impressive results have been achieved in various attempts to +detect ADEs. In a high-stakes domain such as medicine, however, an in-depth +evaluation of a model's abilities is crucial. We address the issue of thorough +performance evaluation in English-language ADE detection with hand-crafted +templates for four capabilities: Temporal order, negation, sentiment, and +beneficial effect. We find that models with similar performance on held-out +test sets have varying results on these capabilities. + +
+
+ comment: Accepted at BioNLP 2024 and Shared Tasks (ACL Workshop) +
+
+
+
+
+ + ☆ CEB: Compositional Evaluation Benchmark for Fairness in Large Language + Models + + +
+ As Large Language Models (LLMs) are increasingly deployed to handle various +natural language processing (NLP) tasks, concerns regarding the potential +negative societal impacts of LLM-generated content have also arisen. To +evaluate the biases exhibited by LLMs, researchers have recently proposed a +variety of datasets. However, existing bias evaluation efforts often focus on +only a particular type of bias and employ inconsistent evaluation metrics, +leading to difficulties in comparison across different datasets and LLMs. To +address these limitations, we collect a variety of datasets designed for the +bias evaluation of LLMs, and further propose CEB, a Compositional Evaluation +Benchmark that covers different types of bias across different social groups +and tasks. The curation of CEB is based on our newly proposed compositional +taxonomy, which characterizes each dataset from three dimensions: bias types, +social groups, and tasks. By combining the three dimensions, we develop a +comprehensive evaluation strategy for the bias in LLMs. Our experiments +demonstrate that the levels of bias vary across these dimensions, thereby +providing guidance for the development of specific bias mitigation methods. + +
+
+ comment: 37 pages, 32 figures +
+
+
+
+
+ + ☆ Learning to Refine with Fine-Grained Natural Language Feedback + + +
+ Recent work has explored the capability of large language models (LLMs) to +identify and correct errors in LLM-generated responses. These refinement +approaches frequently evaluate what sizes of models are able to do refinement +for what problems, but less attention is paid to what effective feedback for +refinement looks like. In this work, we propose looking at refinement with +feedback as a composition of three distinct LLM competencies: (1) +identification of bad generations; (2) fine-grained natural language feedback +generation; (3) refining with fine-grained feedback. The first step can be +implemented with a high-performing discriminative model and steps 2 and 3 can +be implemented either via prompted or fine-tuned LLMs. A key property of this +approach is that the step 2 critique model can give fine-grained feedback about +errors, made possible by offloading the discrimination to a separate model in +step 1. We show that models of different capabilities benefit from refining +with this approach on the task of improving factual consistency of document +grounded summaries. Overall, our proposed method consistently outperforms +existing end-to-end refinement approaches and current trained models not +fine-tuned for factuality critiquing. + +
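+
+ The three competencies listed above compose naturally into a small pipeline;
+the sketch below is a schematic illustration with placeholder prompts and a
+generic detector/llm interface, not the authors' released code.
+
+def refine_summary(document, summary, detector, llm):
+    # Step 1: identification -- a discriminative detector flags bad generations.
+    if not detector(document, summary):
+        return summary                       # nothing to fix
+    # Step 2: fine-grained natural language feedback from a critique model.
+    feedback = llm(
+        f"Document:\n{document}\n\nSummary:\n{summary}\n\n"
+        "List the specific factual errors in the summary."
+    )
+    # Step 3: refine the summary using that feedback.
+    return llm(
+        f"Document:\n{document}\n\nSummary:\n{summary}\n\n"
+        f"Feedback:\n{feedback}\n\nRewrite the summary, fixing only these errors."
+    )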
+
+ comment: Code and models available at: https://github.com/ManyaWadhwa/DCR +
+
+
+
+
+ + ☆ Is Your AI-Generated Code Really Secure? Evaluating Large Language + Models on Secure Code Generation with CodeSecEval + + +
+ Large language models (LLMs) have brought significant advancements to code
+generation and code repair, benefiting both novice and experienced developers.
+However, their training using unsanitized data from open-source repositories,
+like GitHub, raises the risk of inadvertently propagating security
+vulnerabilities. Despite numerous studies investigating the safety of code
+LLMs, there remains a gap in comprehensively addressing their security
+features. In this work, we present a comprehensive study aimed at
+precisely evaluating and enhancing the security aspects of code LLMs. To
+support our research, we introduce CodeSecEval, a meticulously curated dataset
+designed to address 44 critical vulnerability types with 180 distinct samples.
+CodeSecEval serves as the foundation for the automatic evaluation of code
+models in two crucial tasks: code generation and code repair, with a strong
+emphasis on security. Our experimental results reveal that current models
+frequently overlook security issues during both code generation and repair
+processes, resulting in the creation of vulnerable code. In response, we
+propose different strategies that leverage vulnerability-aware information and
+insecure code explanations to mitigate these security vulnerabilities.
+Furthermore, our findings highlight that certain vulnerability types
+particularly challenge model performance, influencing their effectiveness in
+real-world applications. Based on these findings, we believe our study will
+have a positive impact on the software engineering community, inspiring the
+development of improved methods for training and utilizing LLMs, thereby
+leading to safer and more trustworthy model deployment.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2310.16263 +
+
+
+
+
+ + ☆ SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring + Expression Segmentation ECCV 2024 + + +
+ Referring Expression Segmentation (RES) aims to provide a segmentation mask +of the target object in an image referred to by the text (i.e., referring +expression). Existing methods require large-scale mask annotations. Moreover, +such approaches do not generalize well to unseen/zero-shot scenarios. To +address the aforementioned issues, we propose a weakly-supervised bootstrapping +architecture for RES with several new algorithmic innovations. To the best of +our knowledge, ours is the first approach that considers only a fraction of +both mask and box annotations (shown in Figure 1 and Table 1) for training. To +enable principled training of models in such low-annotation settings, improve +image-text region-level alignment, and further enhance spatial localization of +the target object in the image, we propose Cross-modal Fusion with Attention +Consistency module. For automatic pseudo-labeling of unlabeled samples, we +introduce a novel Mask Validity Filtering routine based on a spatially aware +zero-shot proposal scoring approach. Extensive experiments show that with just +30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to +58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR +respectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also +outperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a +fully-supervised setting and demonstrates strong generalization capabilities in +unseen/zero-shot tasks. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Talking to Machines: do you read me? + + +
+ In this dissertation I guide the reader through research on dialogue, and more
+precisely through the research I have conducted during my career since my PhD
+thesis, from modular architectures with machine learning/deep learning and
+reinforcement learning to end-to-end deep neural networks. Besides my work as a
+research associate, I also present the work I have supervised in recent years.
+ I briefly review the state of the art and highlight the open research
+problems on conversational agents. Afterwards, I present my contributions to
+Task-Oriented Dialogues (TOD), both as a research associate and as the
+industrial supervisor of CIFRE theses, and I discuss conversational QA. In
+particular, I present the work of two PhD candidates, Thibault Cordier and
+Sebastien Montella, as well as the work of the young researcher Quentin
+Brabant. Finally, I present the scientific project, where I discuss Large
+Language Models (LLMs) for Task-Oriented Dialogue and Multimodal Task-Oriented
+Dialogue.
+
+
+ comment: French Doctoral Habilitation HDR manuscript: + https://hal.science/tel-04620199 +
+
+
+
+
+ + ☆ Pelican: Correcting Hallucination in Vision-LLMs via Claim Decomposition + and Program of Thought Verification + + +
+ Large Visual Language Models (LVLMs) struggle with hallucinations in visual
+instruction following task(s), limiting their trustworthiness and real-world
+applicability. We propose Pelican, a novel framework designed to detect and
+mitigate hallucinations through claim verification. Pelican first decomposes
+the visual claim into a chain of sub-claims based on first-order predicates.
+These sub-claims consist of (predicate, question) pairs and can be
+conceptualized as nodes of a computational graph. We then use
+Program-of-Thought prompting to generate Python code for answering these
+questions through flexible composition of external tools. Pelican improves over
+prior work by introducing (1) intermediate variables for precise grounding of
+object instances, and (2) shared computation for answering the sub-questions to
+enable adaptive corrections and inconsistency identification. We finally use
+the reasoning abilities of an LLM to verify the correctness of the claim by
+considering the consistency and confidence of the (question, answer) pairs from
+each sub-claim. Our experiments reveal a drop in hallucination rate by
+$\sim$8%-32% across various baseline LVLMs and a 27% drop compared to
+approaches proposed for hallucination mitigation on MMHal-Bench. Results on two
+other benchmarks further corroborate our results.
+
+
+
+
+
+ + ☆ Generative Large Language Models in Automated Fact-Checking: A Survey + + +
+ The dissemination of false information across online platforms poses a +serious societal challenge, necessitating robust measures for information +verification. While manual fact-checking efforts are still instrumental, the +growing volume of false information requires automated methods. Large language +models (LLMs) offer promising opportunities to assist fact-checkers, leveraging +LLM's extensive knowledge and robust reasoning capabilities. In this survey +paper, we investigate the utilization of generative LLMs in the realm of +fact-checking, illustrating various approaches that have been employed and +techniques for prompting or fine-tuning LLMs. By providing an overview of +existing approaches, this survey aims to improve the understanding of utilizing +LLMs in fact-checking and to facilitate further progress in LLMs' involvement +in this process. + +
+
+
+
+
+ + ☆ MORPHEUS: Modeling Role from Personalized Dialogue History by Exploring + and Utilizing Latent Space + + +
+ Personalized Dialogue Generation (PDG) aims to create coherent responses
+according to roles or personas. Traditional PDG relies on external role data,
+which can be scarce and raise privacy concerns. Some approaches address these
+issues by extracting role information from dialogue history, but they often
+fail to model roles generically in continuous space. To overcome these
+limitations, we introduce a novel framework that \textbf{MO}dels
+\textbf{R}oles from \textbf{P}ersonalized Dialogue \textbf{H}istory by
+\textbf{E}xploring and \textbf{U}tilizing Latent \textbf{S}pace (MORPHEUS)
+through a three-stage training process. Specifically, we create a persona
+codebook to represent roles in latent space compactly, and this codebook is
+used to construct a posterior distribution of role information. This method
+enables the model to generalize across roles, allowing the generation of
+personalized dialogues even for unseen roles. Experiments on both Chinese and
+English datasets demonstrate that MORPHEUS enhances the extraction of role
+information, and improves response generation without external role data.
+Additionally, MORPHEUS can be considered an efficient fine-tuning method for
+large language models.
+
+
+
+
+
+ + ☆ RVISA: Reasoning and Verification for Implicit Sentiment Analysis + + +
+ With an increasing social demand for fine-grained sentiment analysis (SA), +implicit sentiment analysis (ISA) poses a significant challenge with the +absence of salient cue words in expressions. It necessitates reliable reasoning +to understand how the sentiment is aroused and thus determine implicit +sentiments. In the era of Large Language Models (LLMs), Encoder-Decoder (ED) +LLMs have gained popularity to serve as backbone models for SA applications, +considering impressive text comprehension and reasoning ability among diverse +tasks. On the other hand, Decoder-only (DO) LLMs exhibit superior natural +language generation and in-context learning capabilities. However, their +responses may contain misleading or inaccurate information. To identify +implicit sentiment with reliable reasoning, this study proposes RVISA, a +two-stage reasoning framework that harnesses the generation ability of DO LLMs +and the reasoning ability of ED LLMs to train an enhanced reasoner. +Specifically, we adopt three-hop reasoning prompting to explicitly furnish +sentiment elements as cues. The generated rationales are utilized to fine-tune +an ED LLM into a skilled reasoner. Additionally, we develop a straightforward +yet effective verification mechanism to ensure the reliability of the reasoning +learning. We evaluated the proposed method on two benchmark datasets and +achieved state-of-the-art results in ISA performance. + +
+
+ comment: 11 pages, 6 figures, and 4 tables +
+
+
+
+
+ + ☆ Open foundation models for Azerbaijani language + + +
+ The emergence of multilingual large language models has enabled the +development of language understanding and generation systems in Azerbaijani. +However, most of the production-grade systems rely on cloud solutions, such as +GPT-4. While there have been several attempts to develop open foundation models +for Azerbaijani, these works have not found their way into common use due to a +lack of systemic benchmarking. This paper encompasses several lines of work +that promote open-source foundation models for Azerbaijani. We introduce (1) a +large text corpus for Azerbaijani, (2) a family of encoder-only language models +trained on this dataset, (3) labeled datasets for evaluating these models, and +(4) extensive evaluation that covers all major open-source models with +Azerbaijani support. + +
+
+ comment: Accepted to the 1st SIGTURK Workshop +
+
+
+
+
+ + ☆ Why do LLaVA Vision-Language Models Reply to Images in English? + + +
+ We uncover a surprising multilingual bias occurring in a popular class of
+multimodal vision-language models (VLMs). Including an image in the query to a
+LLaVA-style VLM significantly increases the likelihood of the model returning
+an English response, regardless of the language of the query. This paper
+investigates the causes of this loss of multilingual ability with a two-pronged
+approach that combines extensive ablation of the design space with a
+mechanistic analysis of the models' internal representations of image and text
+inputs. Both approaches indicate that the issue stems from the language
+modelling component of the LLaVA model. Statistically, we find that switching
+the language backbone for a bilingual language model has the strongest effect
+on reducing this error. Mechanistically, we provide compelling evidence that
+visual inputs are not mapped to a similar space as text ones, and that
+intervening on intermediary attention layers can reduce this bias. Our findings
+provide important insights to researchers and engineers seeking to understand
+the crossover between multimodal and multilingual spaces, and contribute to the
+goal of developing capable and inclusive VLMs for non-English contexts.
+
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ Efficient Sparse Attention needs Adaptive Token Release ACL 2024 + + +
+ In recent years, Large Language Models (LLMs) have demonstrated remarkable
+capabilities across a wide array of text-centric tasks. However, their `large'
+scale introduces significant computational and storage challenges, particularly
+in managing the key-value states of the transformer, which limits their wider
+applicability. Therefore, we propose to adaptively release resources from
+caches and rebuild the necessary key-value states. In particular, we accomplish
+this with a lightweight controller module to approximate an ideal top-$K$ sparse
+attention. This module retains the tokens with the highest top-$K$ attention
+weights and simultaneously rebuilds the discarded but necessary tokens, which
+may become essential for future decoding. Comprehensive experiments in natural
+language generation and modeling reveal that our method is not only competitive
+with full attention in terms of performance but also achieves a significant
+throughput improvement of up to 221.8%. The code for replication is available
+at https://github.com/WHUIR/ADORE.
+
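+
+ A stripped-down version of the top-K retention idea looks like the sketch
+below, which simply keeps the K cached key/value pairs that received the most
+attention on the last step. The learned controller and the rebuilding of
+discarded-but-needed tokens, which are the paper's actual contributions, are
+omitted; all names here are illustrative.
+
+import numpy as np
+
+def topk_kv_retention(keys, values, attn_weights, k):
+    # Keep only the k entries with the highest attention weight.
+    keep = np.argsort(attn_weights)[-k:]
+    keep.sort()                              # preserve positional order
+    return keys[keep], values[keep], keep
+
+keys = np.random.randn(1000, 64)
+values = np.random.randn(1000, 64)
+attn = np.random.rand(1000)
+k_small, v_small, kept = topk_kv_retention(keys, values, attn, k=128)
+print(k_small.shape, kept[:5])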
+
+ comment: Accepted at ACL 2024(Findings) +
+
+
+
+
+ + ☆ Exploring the Role of Transliteration in In-Context Learning for + Low-resource Languages Written in Non-Latin Scripts + + +
+ Decoder-only large language models (LLMs) excel in high-resource languages +across various tasks through few-shot or even zero-shot in-context learning +(ICL). However, their performance often does not transfer well to low-resource +languages, especially those written in non-Latin scripts. Inspired by recent +work that leverages transliteration in encoder-only models, we investigate +whether transliteration is also effective in improving LLMs' performance for +low-resource languages written in non-Latin scripts. To this end, we propose +three prompt templates, where the target-language text is represented in (1) +its original script, (2) Latin script, or (3) both. We apply these methods to +several representative LLMs of different sizes on various tasks including text +classification and sequential labeling. Our findings show that the +effectiveness of transliteration varies by task type and model size. For +instance, all models benefit from transliterations for sequential labeling +(with increases of up to 25%). + +
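+
+ The three prompt templates could be assembled as below. The unidecode package
+is used here only as a rough stand-in for a proper transliterator, and the
+template wording is an assumption rather than the paper's exact prompts.
+
+from unidecode import unidecode   # crude romanizer, for illustration only
+
+def build_prompts(text, task_instruction):
+    # Return the three variants: original script, Latin script, or both.
+    latin = unidecode(text)
+    return {
+        "original": f"{task_instruction}\nText: {text}",
+        "latin": f"{task_instruction}\nText (romanized): {latin}",
+        "both": f"{task_instruction}\nText: {text}\nRomanized: {latin}",
+    }
+
+for name, p in build_prompts("Привет, мир", "Label the sentiment.").items():
+    print(name, "->", p.replace("\n", " | "))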
+
+
+
+
+ + ☆ Soft Language Prompts for Language Transfer + + +
+ Cross-lingual knowledge transfer, especially between high- and low-resource
+languages, remains a challenge in natural language processing (NLP). This study
+offers insights for improving cross-lingual NLP applications through the
+combination of parameter-efficient fine-tuning methods. We systematically
+explore strategies for enhancing this cross-lingual transfer through the
+incorporation of language-specific and task-specific adapters and soft prompts.
+We present a detailed investigation of various combinations of these methods,
+exploring their efficiency across six languages, with a focus on three
+low-resource languages, and including, to our knowledge, the first use of soft
+language prompts. Our findings demonstrate that, in contrast to claims of
+previous work, a combination of language and task adapters does not always work
+best; instead, combining a soft language prompt with a task adapter outperforms
+other configurations in many cases.
+
+
+
+
+
+ + ☆ Evaluating the Ability of LLMs to Solve Semantics-Aware Process Mining + Tasks + + +
+ The process mining community has recently recognized the potential of large +language models (LLMs) for tackling various process mining tasks. Initial +studies report the capability of LLMs to support process analysis and even, to +some extent, that they are able to reason about how processes work. This latter +property suggests that LLMs could also be used to tackle process mining tasks +that benefit from an understanding of process behavior. Examples of such tasks +include (semantic) anomaly detection and next activity prediction, which both +involve considerations of the meaning of activities and their inter-relations. +In this paper, we investigate the capabilities of LLMs to tackle such +semantics-aware process mining tasks. Furthermore, whereas most works on the +intersection of LLMs and process mining only focus on testing these models out +of the box, we provide a more principled investigation of the utility of LLMs +for process mining, including their ability to obtain process mining knowledge +post-hoc by means of in-context learning and supervised fine-tuning. +Concretely, we define three process mining tasks that benefit from an +understanding of process semantics and provide extensive benchmarking datasets +for each of them. Our evaluation experiments reveal that (1) LLMs fail to solve +challenging process mining tasks out of the box and when provided only a +handful of in-context examples, (2) but they yield strong performance when +fine-tuned for these tasks, consistently surpassing smaller, encoder-based +language models. + +
+
+ comment: Submitted to ICPM +
+
+
+
+
+ + ☆ Towards Human Understanding of Paraphrase Types in ChatGPT + + +
+ Paraphrases represent a human's intuitive ability to understand expressions
+presented in various ways. Current paraphrase evaluations of language
+models primarily use binary approaches, offering limited interpretability of
+specific text changes. Atomic paraphrase types (APT) decompose paraphrases into
+different linguistic changes and offer a granular view of the flexibility in
+linguistic expression (e.g., a shift in syntax or vocabulary used). In this
+study, we assess the human preferences towards ChatGPT in generating English
+paraphrases with ten APTs and five prompting techniques. We introduce APTY
+(Atomic Paraphrase TYpes), a dataset of 500 sentence-level and word-level
+annotations by 15 annotators. The dataset also provides a human preference
+ranking of paraphrases with different types that can be used to fine-tune
+models with RLHF and DPO methods. Our results reveal that ChatGPT can generate
+simple APTs, such as additions and deletions, but struggles with complex
+structures (e.g., subordination changes). This study contributes to
+understanding which aspects of paraphrasing language models have already
+succeeded at understanding, and which remain elusive. In addition, our curated
+datasets can be used to develop language models with specific linguistic
+capabilities.
+
+
+
+
+
+ + ☆ CFinBench: A Comprehensive Chinese Financial Benchmark for Large + Language Models + + +
+ Large language models (LLMs) have achieved remarkable performance on various
+NLP tasks, yet their potential in more challenging and domain-specific tasks,
+such as finance, has not been fully explored. In this paper, we present
+CFinBench, a meticulously crafted and, to date, the most comprehensive
+evaluation benchmark for assessing the financial knowledge of LLMs in a Chinese
+context. In practice, to better align with the career trajectory of Chinese
+financial practitioners, we build a systematic evaluation from 4 first-level
+categories: (1) Financial Subject: whether LLMs can memorize the necessary
+basic knowledge of financial subjects, such as economics, statistics and
+auditing. (2) Financial Qualification: whether LLMs can obtain the needed
+financial qualified certifications, such as certified public accountant,
+securities qualification and banking qualification. (3) Financial Practice:
+whether LLMs can fulfill the practical financial jobs, such as tax consultant,
+junior accountant and securities analyst. (4) Financial Law: whether LLMs can
+meet the requirement of financial laws and regulations, such as tax law,
+insurance law and economic law. CFinBench comprises 99,100 questions spanning
+43 second-level categories with 3 question types: single-choice,
+multiple-choice and judgment. We conduct extensive experiments on CFinBench
+with 50 representative LLMs of various model sizes. The results show that GPT4
+and some Chinese-oriented models lead the benchmark, with the highest average
+accuracy being 60.16%, highlighting the challenge presented by CFinBench. The
+dataset and evaluation code are available at https://cfinbench.github.io/.
+
+
+
+
+
+ + ☆ Renard: A Modular Pipeline for Extracting Character Networks from + Narrative Texts + + +
+ Renard (Relationships Extraction from NARrative Documents) is a Python +library that allows users to define custom natural language processing (NLP) +pipelines to extract character networks from narrative texts. Contrary to the +few existing tools, Renard can extract dynamic networks, as well as the more +common static networks. Renard pipelines are modular: users can choose the +implementation of each NLP subtask needed to extract a character network. This +allows users to specialize pipelines to particular types of texts and to study +the impact of each subtask on the extracted network. + +
+
+ comment: Accepted at JOSS +
+
+
+
+
+ + ☆ Multilingual Trolley Problems for Language Models + + +
+ As large language models (LLMs) are deployed in more and more real-world +situations, it is crucial to understand their decision-making when faced with +moral dilemmas. Inspired by a large-scale cross-cultural study of human moral +preferences, "The Moral Machine Experiment", we set up the same set of moral +choices for LLMs. We translate 1K vignettes of moral dilemmas, parametrically +varied across key axes, into 100+ languages, and reveal the preferences of LLMs +in each of these languages. We then compare the responses of LLMs to that of +human speakers of those languages, harnessing a dataset of 40 million human +moral judgments. We discover that LLMs are more aligned with human preferences +in languages such as English, Korean, Hungarian, and Chinese, but less aligned +in languages such as Hindi and Somali (in Africa). Moreover, we characterize +the explanations LLMs give for their moral choices and find that fairness is +the most dominant supporting reason behind GPT-4's decisions and utilitarianism +by GPT-3. We also discover "language inequality" (which we define as the +model's different development levels in different languages) in a series of +meta-properties of moral decision making. + +
+
+
+
+
+ + ☆ Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference + Optimization + + +
+ In this paper, we propose reverse inference optimization (RIO), a simple and +effective method designed to enhance the robustness of +autoregressive-model-based zero-shot text-to-speech (TTS) systems using +reinforcement learning from human feedback (RLHF). To assess the quality of +speech produced by the TTS system without human annotations, RIO introduces a +novel concept termed as reverse inference based on the Bayesian principle, +which suggests that a high-quality generated speech should be able to be used +as a prompt for subsequent generation using the same TTS model. By leveraging +reverse inference as the standard to select exemplars used in RLHF from the +speech samples generated by the TTS system itself, RIO steers the subsequent +optimization towards a direction of enhancing the TTS robustness. The RIO +framework, comprising sampling, automatic annotating, and learning, obviates +the need for a reward model or pairwise preference data, and significantly +improves the stability of zero-shot TTS performance by reducing the +discrepancies between training and inference conditions. Our experimental +results verify that RIO can effectively improve both subjective and objective +metrics, including mean opinion scores, word error rates, and speaker +similarity. Remarkably, RIO can also diminish the incidence of bad outputs to +nearly zero percent, rivalling the robustness when using ground-truth speech as +the prompt. + +
+
+ comment: 12 pages, Work in progress +
+
+
+
+
+ + ☆ Towards a Holistic Framework for Multimodal Large Language Models in + Three-dimensional Brain CT Report Generation + + +
+ Multi-modal large language models (MLLMs) have been given free rein to
+explore exciting medical applications with a primary focus on radiology report
+generation. Nevertheless, the preliminary success in 2D radiology captioning is
+insufficient to reflect the real-world diagnostic challenge in the volumetric 3D
+anatomy. To mitigate three crucial limitations of the existing literature,
+namely (1) data complexity, (2) model capacity, and (3) evaluation metric
+fidelity, we collected a 3D-BrainCT dataset of 18,885 text-scan pairs and
+applied clinical visual instruction tuning (CVIT) to train BrainGPT models to
+generate radiology-adherent 3D brain CT reports. Statistically, our BrainGPT
+scored BLEU-1 = 44.35, BLEU-4 = 20.38, METEOR = 30.13, ROUGE-L = 47.6, and
+CIDEr-R = 211.77 during internal testing and demonstrated an accuracy of 0.91
+in captioning midline shifts on the external validation CQ500 dataset. By
+further inspecting the captioned reports, we found that the traditional metrics
+appeared to measure only the surface text similarity and failed to gauge the
+information density relevant to the diagnostic purpose. To close this gap, we
+proposed a novel Feature-Oriented Radiology Task Evaluation (FORTE) to estimate
+the report's clinical relevance (lesion features and landmarks). Notably, the
+BrainGPT model scored an average FORTE F1-score of 0.71 (degree=0.661;
+landmark=0.706; feature=0.693; impression=0.779). To demonstrate that BrainGPT
+models possess objective readiness to generate human-like radiology reports, we
+conducted a Turing test that enrolled 11 physician evaluators, and around 74%
+of the BrainGPT-generated captions were indistinguishable from those written by
+humans. Our work embodies a holistic framework that showcases the first-hand
+experience of curating a 3D brain CT dataset, fine-tuning anatomy-sensible
+language models, and proposing robust radiology evaluation metrics.
+
+
+ comment: 6 figures, 5 supplementary figures, 8 supplementary tables +
+
+
+
+
+ + ☆ Synthetic Multimodal Question Generation + + +
+ Multimodal Retrieval Augmented Generation (MMRAG) is a powerful approach to +question-answering over multimodal documents. A key challenge with evaluating +MMRAG is the paucity of high-quality datasets matching the question styles and +modalities of interest. In light of this, we propose SMMQG, a synthetic data +generation framework. SMMQG leverages interplay between a retriever, large +language model (LLM) and large multimodal model (LMM) to generate question and +answer pairs directly from multimodal documents, with the questions conforming +to specified styles and modalities. We use SMMQG to generate an MMRAG dataset +of 1024 questions over Wikipedia documents and evaluate state-of-the-art models +using it, revealing insights into model performance that are attainable only +through style- and modality-specific evaluation data. Next, we measure the +quality of data produced by SMMQG via a human study. We find that the quality +of our synthetic data is on par with the quality of the crowdsourced benchmark +MMQA and that downstream evaluation results using both datasets strongly +concur. + +
+
+ comment: Submitted to ARR June 2024 +
+
+
+
+
+ + ☆ PromptIntern: Saving Inference Costs by Internalizing Recurrent Prompt + during Large Language Model Fine-tuning + + +
+ Large language models (LLMs) have played a fundamental role in various
+natural language processing tasks with powerful prompt techniques. However, in
+real-world applications, there are often similar prompt components for repeated
+queries, which causes significant computational burdens during inference.
+Existing prompt compression and direct fine-tuning methods aim to tackle these
+challenges, yet they frequently struggle to strike an optimal balance between
+cost-efficiency and performance effectiveness, especially in complex tasks such
+as NL2Code. In this paper, we propose a novel method, PromptIntern, which
+internalizes prompt knowledge into model parameters via progressive
+fine-tuning. Our method enables LLMs to emulate the human learning process for
+a new task, where detailed templates and examples in a prompt are gradually
+internalized and phased out progressively as the model grows accustomed to the
+task. Extensive experiments demonstrate that our method reduces inference
+tokens by over 90%, speeds up inference by 4.2 times, and saves 88.3% of
+monetary cost.
+
+
+
+
+
+ + ☆ Generative Monoculture in Large Language Models + + +
+ We introduce {\em generative monoculture}, a behavior observed in large +language models (LLMs) characterized by a significant narrowing of model output +diversity relative to available training data for a given task: for example, +generating only positive book reviews for books with a mixed reception. While +in some cases, generative monoculture enhances performance (e.g., LLMs more +often produce efficient code), the dangers are exacerbated in others (e.g., +LLMs refuse to share diverse opinions). As LLMs are increasingly used in +high-impact settings such as education and web search, careful maintenance of +LLM output diversity is essential to ensure a variety of facts and perspectives +are preserved over time. We experimentally demonstrate the prevalence of +generative monoculture through analysis of book review and code generation +tasks, and find that simple countermeasures such as altering sampling or +prompting strategies are insufficient to mitigate the behavior. Moreover, our +results suggest that the root causes of generative monoculture are likely +embedded within the LLM's alignment processes, suggesting a need for developing +fine-tuning paradigms that preserve or promote diversity. + +
+
+
+
+
+ + ☆ How to Learn in a Noisy World? Self-Correcting the Real-World Data Noise + on Machine Translation + + +
+ The massive amounts of web-mined parallel data contain large amounts of +noise. Semantic misalignment, as the primary source of the noise, poses a +challenge for training machine translation systems. In this paper, we first +study the impact of real-world hard-to-detect misalignment noise by proposing a +process to simulate the realistic misalignment controlled by semantic +similarity. After quantitatively analyzing the impact of simulated misalignment +on machine translation, we show the limited effectiveness of widely used +pre-filters to improve the translation performance, underscoring the necessity +of more fine-grained ways to handle data noise. By observing the increasing +reliability of the model's self-knowledge for distinguishing misaligned and +clean data at the token-level, we propose a self-correction approach which +leverages the model's prediction distribution to revise the training +supervision from the ground-truth data over training time. Through +comprehensive experiments, we show that our self-correction method not only +improves translation performance in the presence of simulated misalignment +noise but also proves effective for real-world noisy web-mined datasets across +eight translation tasks. + +
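+
+ One way to picture the self-correction idea is a loss whose target
+distribution is gradually mixed with the model's own predictions, so that
+tokens on which the reference and the model disagree are down-weighted later in
+training. The linear schedule and mixing weight below are assumptions for
+illustration, not the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def self_corrected_loss(logits, target_ids, step, total_steps, max_mix=0.5):
+    # Mix the one-hot reference with the model's detached prediction distribution.
+    mix = max_mix * min(1.0, step / total_steps)
+    with torch.no_grad():
+        model_dist = F.softmax(logits, dim=-1)
+    onehot = F.one_hot(target_ids, logits.size(-1)).float()
+    target_dist = (1.0 - mix) * onehot + mix * model_dist
+    log_probs = F.log_softmax(logits, dim=-1)
+    return -(target_dist * log_probs).sum(dim=-1).mean()
+
+logits = torch.randn(8, 32000)                # (tokens, vocab)
+targets = torch.randint(0, 32000, (8,))
+print(self_corrected_loss(logits, targets, step=500, total_steps=1000))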
+
+
+
+
+ + ☆ Automatic Adaptation Rule Optimization via Large Language Models + + +
+ Rule-based adaptation is a foundational approach to self-adaptation,
+characterized by its human readability and rapid response. However, building
+high-performance and robust adaptation rules is often a challenge because it
+essentially involves searching for the optimal design in a complex variable
+space. In response, this paper attempts to employ large language models (LLMs)
+as an optimizer to construct and optimize adaptation rules, leveraging the
+common sense and reasoning capabilities inherent in LLMs. Preliminary
+experiments conducted in SWIM have validated the effectiveness and limitations
+of our method.
+
+
+
+
+
+ + ☆ LlamAr & GemmAr: Enhancing LLMs Through Arabic Instruction-Tuning + + +
+ Large language models (LLMs) have greatly impacted the natural language +processing (NLP) field, particularly for the English language. These models +have demonstrated capabilities in understanding and generating human-like text. +The success of language models largely depends on the availability of +high-quality instruction datasets, which consist of detailed task descriptions +and corresponding responses that are essential for training the models to +accurately address a variety of prompts. However, the availability and quality +of these resources vary by language. While models perform well in English, they +often struggle with languages like Arabic, due to the lack of datasets for +fine-tuning Arabic-specific tasks. To address this issue, we introduce +InstAr-500k, a new Arabic instruction dataset created by generating and +collecting content that covers several domains and instruction types. We then +assess this dataset by fine-tuning two open-source models, Llama-3-8B-Instruct +and Gemma-7B-IT, on several downstream tasks to scale improvements in their +functionality. Based on multiple evaluations, our fine-tuned models achieve +state-of-the-art performance on several Arabic NLP benchmarks. These outcomes +emphasize the effectiveness of our dataset in elevating the capabilities of +language models for Arabic. Our instruction dataset bridges the performance gap +between English and Arabic language models by providing resources that amplify +Arabic NLP development. Building on this foundation, we developed two +state-of-the-art models, LlamAr-8B and GemmAr-7B, which are specifically tuned +to excel at a wide range of Arabic NLP tasks. + +
+
+
+
+
+ + ☆ Efficient Nearest Neighbor based Uncertainty Estimation for Natural + Language Processing Tasks + + +
+ Trustworthy prediction in Deep Neural Networks (DNNs), including Pre-trained
+Language Models (PLMs), is important for safety-critical applications in the
+real world. However, DNNs often suffer from poor uncertainty estimation, such as
+miscalibration. In particular, approaches that require multiple stochastic
+inferences can mitigate this problem, but the expensive cost of inference makes
+them impractical. In this study, we propose $k$-Nearest Neighbor Uncertainty
+Estimation ($k$NN-UE), an uncertainty estimation method that uses the
+distances to the neighbors and the label-existence ratio of the neighbors.
+Experiments on sentiment analysis, natural language inference, and named entity
+recognition show that our proposed method outperforms the baselines or recent
+density-based methods in confidence calibration, selective prediction, and
+out-of-distribution detection. Moreover, our analyses indicate that introducing
+dimension reduction or approximate nearest neighbor search inspired by recent
+$k$NN-LM studies reduces the inference overhead without significantly degrading
+estimation performance when they are combined appropriately.
+
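+
+ A simplified version of the neighbor-based confidence signal can be written
+as below, combining a distance term with the fraction of neighbors that share
+the predicted label. The exact combination rule and the alpha weighting are
+assumptions for illustration, not the paper's formula.
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def knn_confidence(train_feats, train_labels, query_feats, query_preds,
+                   k=10, alpha=1.0):
+    # Lower scores indicate predictions that should be trusted less.
+    nn = NearestNeighbors(n_neighbors=k).fit(train_feats)
+    dists, idx = nn.kneighbors(query_feats)
+    label_agree = (train_labels[idx] == query_preds[:, None]).mean(axis=1)
+    dist_term = np.exp(-alpha * dists.mean(axis=1))
+    return dist_term * label_agree
+
+rng = np.random.default_rng(0)
+train_x, train_y = rng.normal(size=(200, 8)), rng.integers(0, 3, 200)
+test_x, test_pred = rng.normal(size=(5, 8)), rng.integers(0, 3, 5)
+print(knn_confidence(train_x, train_y, test_x, test_pred))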
+
+
+
+
+ + ☆ Black Big Boxes: Do Language Models Hide a Theory of Adjective Order? + + +
+ In English and other languages, multiple adjectives in a complex noun phrase +show intricate ordering patterns that have been a target of much linguistic +theory. These patterns offer an opportunity to assess the ability of language +models (LMs) to learn subtle rules of language involving factors that cross the +traditional divisions of syntax, semantics, and pragmatics. We review existing +hypotheses designed to explain Adjective Order Preferences (AOPs) in humans and +develop a setup to study AOPs in LMs: we present a reusable corpus of adjective +pairs and define AOP measures for LMs. With these tools, we study a series of +LMs across intermediate checkpoints during training. We find that all models' +predictions are much closer to human AOPs than predictions generated by factors +identified in theoretical linguistics. At the same time, we demonstrate that +the observed AOPs in LMs are strongly correlated with the frequency of the +adjective pairs in the training data and report limited generalization to +unseen combinations. This highlights the difficulty in establishing the link +between LM performance and linguistic theory. We therefore conclude with a road +map for future studies our results set the stage for, and a discussion of key +questions about the nature of knowledge in LMs and their ability to generalize +beyond the training sets. + +
+
+
+
+
+ + ☆ Fake News Detection: It's All in the Data! + + +
+ This comprehensive survey serves as an indispensable resource for researchers
+embarking on the journey of fake news detection. By highlighting the pivotal
+role of dataset quality and diversity, it underscores the significance of these
+elements in the effectiveness and robustness of detection models. The survey
+meticulously outlines the key features of datasets, various labeling systems
+employed, and prevalent biases that can impact model performance. Additionally,
+it addresses critical ethical issues and best practices, offering a thorough
+overview of the current state of available datasets. Our contribution to this
+field is further enriched by the provision of a GitHub repository, which
+consolidates publicly accessible datasets into a single, user-friendly portal.
+This repository is designed to facilitate and stimulate further research and
+development efforts aimed at combating the pervasive issue of fake news.
+
+
+
+
+
+ + ☆ Cost-Effective Proxy Reward Model Construction with On-Policy and Active + Learning + + +
+ Reinforcement learning with human feedback (RLHF), as a widely adopted
+approach in current large language model pipelines, is \textit{bottlenecked by
+the size of human preference data}. While traditional methods rely on offline
+preference dataset constructions, recent approaches have shifted towards online
+settings, where a learner uses a small amount of labeled seed data and a large
+pool of unlabeled prompts to iteratively construct new preference data through
+self-generated responses and high-quality reward/preference feedback. However,
+most current online algorithms still focus on preference labeling during policy
+model updating with given feedback oracles, which incurs significant expert
+query costs. \textit{We are the first to explore cost-effective proxy reward
+oracle construction strategies for further labeling preferences or rewards
+with extremely limited labeled data and expert query budgets}. Our approach
+introduces two key innovations: (1) on-policy query to avoid OOD and imbalance
+issues in seed data, and (2) active learning to select the most informative
+data for preference queries. Using these methods, we train an evaluation model
+with minimal expert-labeled data, which then effectively labels nine times more
+preference pairs for further RLHF training. For instance, our model using
+Direct Preference Optimization (DPO) gains over 1% average improvement
+on AlpacaEval2, MMLU-5shot and MMLU-0shot, at a query cost of only 1.7K. Our
+methodology is orthogonal to other direct expert query-based strategies and
+therefore might be integrated with them to further reduce query costs.
+
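+
+ The active-learning step can be pictured with the margin-based selection
+below: send to the expert the response pairs on which the current proxy reward
+model is least decisive. This margin heuristic is a generic active-learning
+criterion used here for illustration and is not necessarily the paper's exact
+selection rule.
+
+import numpy as np
+
+def select_for_labeling(pair_scores, budget):
+    # pair_scores: (n, 2) proxy rewards for (response_a, response_b).
+    margins = np.abs(pair_scores[:, 0] - pair_scores[:, 1])
+    return np.argsort(margins)[:budget]       # most ambiguous pairs first
+
+scores = np.random.randn(1000, 2)
+print(select_for_labeling(scores, budget=50)[:10])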
+
+
+
+
+ + ☆ Breaking Language Barriers: Cross-Lingual Continual Pre-Training at + Scale + + +
+ In recent years, Large Language Models (LLMs) have made significant strides +towards Artificial General Intelligence. However, training these models from +scratch requires substantial computational resources and vast amounts of text +data. In this paper, we explore an alternative approach to constructing an LLM +for a new language by continually pretraining (CPT) from existing pretrained +LLMs, instead of using randomly initialized parameters. Based on parallel +experiments on 40 model sizes ranging from 40M to 5B parameters, we find that +1) CPT converges faster and saves significant resources in a scalable manner; +2) CPT adheres to an extended scaling law derived from Hoffmann et al. (2022) +with a joint data-parameter scaling term; 3) The compute-optimal data-parameter +allocation for CPT markedly differs based on our estimated scaling factors; 4) +The effectiveness of transfer at scale is influenced by training duration and +linguistic properties, while robust to data replaying, a method that +effectively mitigates catastrophic forgetting in CPT. We hope our findings +provide deeper insights into the transferability of LLMs at scale for the +research community. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Helpful assistant or fruitful facilitator? Investigating how personas + affect language model behavior + + +
+ One way to personalize and steer generations from large language models (LLM) +is to assign a persona: a role that describes how the user expects the LLM to +behave (e.g., a helpful assistant, a teacher, a woman). This paper investigates +how personas affect diverse aspects of model behavior. We assign to seven LLMs +162 personas from 12 categories spanning variables like gender, sexual +orientation, and occupation. We prompt them to answer questions from five +datasets covering objective (e.g., questions about math and history) and +subjective tasks (e.g., questions about beliefs and values). We also compare +persona's generations to two baseline settings: a control persona setting with +30 paraphrases of "a helpful assistant" to control for models' prompt +sensitivity, and an empty persona setting where no persona is assigned. We find +that for all models and datasets, personas show greater variability than the +control setting and that some measures of persona behavior generalize across +models. + +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ Crossroads of Continents: Automated Artifact Extraction for Cultural + Adaptation with Large Multimodal Models + + +
+ In this work, we present a comprehensive three-phase study to examine (1) the +effectiveness of large multimodal models (LMMs) in recognizing cultural +contexts; (2) the accuracy of their representations of diverse cultures; and +(3) their ability to adapt content across cultural boundaries. We first +introduce Dalle Street, a large-scale dataset generated by DALL-E 3 and +validated by humans, containing 9,935 images of 67 countries and 10 concept +classes. We reveal disparities in cultural understanding at the sub-region +level with both open-weight (LLaVA) and closed-source (GPT-4V) models on Dalle +Street and other existing benchmarks. Next, we assess models' deeper culture +understanding by an artifact extraction task and identify over 18,000 artifacts +associated with different countries. Finally, we propose a highly composable +pipeline, CultureAdapt, to adapt images from culture to culture. Our findings +reveal a nuanced picture of the cultural competence of LMMs, highlighting the +need to develop culture-aware systems. Dataset and code are available at +https://github.com/iamshnoo/crossroads + +
+
+ comment: under review +
+
+
+
+
+ + ☆ BiasDora: Exploring Hidden Biased Associations in Vision-Language Models + + +
+ Existing works examining Vision Language Models (VLMs) for social biases
+predominantly focus on a limited set of documented bias associations, such as
+gender:profession or race:crime. This narrow scope often overlooks a vast range
+of unexamined implicit associations, restricting the identification and, hence,
+mitigation of such biases. We address this gap by probing VLMs to (1) uncover
+hidden, implicit associations across 9 bias dimensions, systematically
+exploring diverse input and output modalities; (2) demonstrate how biased
+associations vary in their negativity, toxicity, and extremity; and (3)
+identify subtle and extreme biases that are typically not recognized by
+existing methodologies. We make the Dataset of retrieved associations (Dora)
+publicly available at https://github.com/chahatraj/BiasDora.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Are Data Augmentation Methods in Named Entity Recognition Applicable for + Uncertainty Estimation? + + +
+ This work investigates the impact of data augmentation on confidence
+calibration and uncertainty estimation in Named Entity Recognition (NER) tasks.
+For the future advancement of NER in safety-critical fields like healthcare and
+finance, it is essential to achieve accurate predictions with calibrated
+confidence when applying Deep Neural Networks (DNNs), including Pre-trained
+Language Models (PLMs), as a real-world application. However, DNNs are prone to
+miscalibration, which limits their applicability. Moreover, existing methods
+for calibration and uncertainty estimation are computationally expensive. Our
+investigation in NER found that data augmentation improves calibration and
+uncertainty estimation in cross-genre and cross-lingual settings, and
+especially in the in-domain setting. Furthermore, we showed that the
+calibration for NER tends to be more effective when the perplexity of the
+sentences generated by data augmentation is lower, and that increasing the size
+of the augmentation further improves calibration and uncertainty estimation.
+
+
+
+
+
+ + ☆ Integrate the Essence and Eliminate the Dross: Fine-Grained + Self-Consistency for Free-Form Language Generation ACL2024 + + +
+ Self-consistency (SC), leveraging multiple samples from LLMs, shows significant gains on various reasoning tasks but struggles with free-form generation due to the difficulty of aggregating answers. Its variants, UCS and USC, rely on sample selection or voting mechanisms to improve output quality. These methods, however, face limitations due to their inability to fully utilize the nuanced consensus knowledge present within multiple candidate samples, often resulting in suboptimal outputs. We propose Fine-Grained Self-Consistency (FSC) to address these limitations by extracting and integrating segment-level commonalities from candidate samples, enhancing the performance of LLMs in both open-ended and reasoning tasks. Based on this, we present two additional strategies: candidate filtering, which enhances overall quality by identifying highly similar candidate sets, and merging, which reduces input token requirements by combining similar samples. The effectiveness of FSC is demonstrated through extensive experiments on various tasks, including summarization, code generation, and mathematical reasoning, using GPT-3.5-turbo and GPT-4. The results indicate significant improvements over baseline methods, showcasing the potential of FSC to optimize output quality by effectively synthesizing fine-grained consensus knowledge from multiple samples.
+
+
+ comment: Accepted to ACL2024 Main Conference +
+
+
+
+
+ + ☆ Accompanied Singing Voice Synthesis with Fully Text-controlled Melody + + +
+ Text-to-song (TTSong) is a music generation task that synthesizes accompanied +singing voices. Current TTSong methods, inherited from singing voice synthesis +(SVS), require melody-related information that can sometimes be impractical, +such as music scores or MIDI sequences. We present MelodyLM, the first TTSong +model that generates high-quality song pieces with fully text-controlled +melodies, achieving minimal user requirements and maximum control flexibility. +MelodyLM explicitly models MIDI as the intermediate melody-related feature and +sequentially generates vocal tracks in a language model manner, conditioned on +textual and vocal prompts. The accompaniment music is subsequently synthesized +by a latent diffusion model with hybrid conditioning for temporal alignment. +With minimal requirements, users only need to input lyrics and a reference +voice to synthesize a song sample. For full control, just input textual prompts +or even directly input MIDI. Experimental results indicate that MelodyLM +achieves superior performance in terms of both objective and subjective +metrics. Audio samples are available at https://melodylm666.github.io. + +
+
+ comment: Work in progress
+
+
+
+
+ + ☆ Concise and Precise Context Compression for Tool-Using Language Models + + +
+ By reading the documentation provided in the context, tool-using language models can dynamically extend their capabilities using external tools. The cost is that we have to input lengthy documentation every time the model needs to use the tool, occupying the input window as well as slowing down the decoding process.
+ Given the progress in general-purpose compression, soft context compression is a suitable approach to alleviate the problem. However, when compressing tool documentation, existing methods suffer from the weaknesses of key information loss (specifically, tool/parameter name errors) and difficulty in adjusting the length of compressed sequences based on documentation lengths.
+ To address these problems, we propose two strategies for compressing tool documentation into concise and precise summary sequences for tool-using language models. 1) A selective compression strategy mitigates key information loss by deliberately retaining key information as raw text tokens. 2) A block compression strategy involves dividing tool documentation into short chunks and then employing a fixed-length compression model to achieve variable-length compression. This strategy facilitates the flexible adjustment of the compression ratio.
+ Results on API-Bank and APIBench show that our approach reaches performance comparable to the upper-bound baseline at compression ratios of up to 16x.
+
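+ The block compression strategy above achieves variable-length compression by running a fixed-length compressor over fixed-size chunks; the toy sketch below (an assumption-level illustration, with the actual compression model replaced by a stand-in callable) shows how the total number of compressed tokens then scales with documentation length.
+
+ ```python
+ def chunk_documentation(doc_tokens, block_size=128):
+     """Split tool documentation into fixed-size blocks (the last may be shorter)."""
+     return [doc_tokens[i:i + block_size] for i in range(0, len(doc_tokens), block_size)]
+
+ def compress_documentation(doc_tokens, compress_block, block_size=128, tokens_per_block=8):
+     """Each block is mapped to `tokens_per_block` summary tokens, so longer
+     documentation yields proportionally more compressed tokens."""
+     compressed = []
+     for block in chunk_documentation(doc_tokens, block_size):
+         compressed.extend(compress_block(block, tokens_per_block))
+     return compressed
+
+ # toy stand-in for the fixed-length compression model: keep a block's first k tokens
+ toy_compressor = lambda block, k: block[:k]
+ doc = list(range(300))  # pretend token ids of a tool's documentation
+ print(len(compress_documentation(doc, toy_compressor)))  # 3 blocks * 8 tokens = 24
+ ```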
+
+
+
+
+ + ☆ Fake News Detection and Manipulation Reasoning via Large Vision-Language + Models + + +
+ Fake news has become a growing threat to information security and public opinion with the rapid spread of media manipulation. Therefore, fake news detection has attracted widespread attention from the academic community. Traditional fake news detection models demonstrate remarkable performance on authenticity binary classification, but their ability to reason about detailed manipulation traces based on the news content remains under-explored. Furthermore, due to the lack of external knowledge, the performance of existing methods on fact-related news is questionable, leaving their practical implementation unclear. In this paper, we propose a new multi-media research topic, namely manipulation reasoning. Manipulation reasoning aims to reason about manipulations based on news content. To support the research, we introduce a benchmark for fake news detection and manipulation reasoning, referred to as Human-centric and Fact-related Fake News (HFFN). The benchmark highlights the centrality of humans and high factual relevance, with detailed manual annotations. HFFN encompasses four realistic domains with fake news samples generated through three manipulation approaches. Moreover, a Multi-modal news Detection and Reasoning langUage Model (M-DRUM) is presented not only to judge the authenticity of multi-modal news but also to provide analytical reasoning about potential manipulations. On the feature extraction level, a cross-attention mechanism is employed to extract fine-grained fusion features from multi-modal inputs. On the reasoning level, a large vision-language model (LVLM) serves as the backbone to facilitate fact-related reasoning. A two-stage training framework is deployed to better activate the capacity of identification and reasoning. Comprehensive experiments demonstrate that our model outperforms state-of-the-art (SOTA) fake news detection models and powerful LVLMs like GPT-4 and LLaVA.
+
+
+
+
+
+ + ☆ Prompt Stability Scoring for Text Annotation with Large Language Models + + +
+ Researchers are increasingly using language models (LMs) for text annotation. +These approaches rely only on a prompt telling the model to return a given +output according to a set of instructions. The reproducibility of LM outputs +may nonetheless be vulnerable to small changes in the prompt design. This calls +into question the replicability of classification routines. To tackle this +problem, researchers have typically tested a variety of semantically similar +prompts to determine what we call "prompt stability." These approaches remain +ad-hoc and task specific. In this article, we propose a general framework for +diagnosing prompt stability by adapting traditional approaches to intra- and +inter-coder reliability scoring. We call the resulting metric the Prompt +Stability Score (PSS) and provide a Python package PromptStability for its +estimation. Using six different datasets and twelve outcomes, we classify >150k +rows of data to: a) diagnose when prompt stability is low; and b) demonstrate +the functionality of the package. We conclude by providing best practice +recommendations for applied researchers. + +
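+ The Prompt Stability Score adapts intra- and inter-coder reliability scoring to prompt paraphrases; as a rough illustration (not the PromptStability package's actual estimator), the sketch below scores stability as the average pairwise agreement between labels produced by paraphrased prompts.
+
+ ```python
+ from itertools import combinations
+
+ def prompt_stability_score(labels_by_prompt):
+     """Average pairwise percent agreement across paraphrased prompts; a simple
+     stand-in for the reliability-based estimator described in the paper."""
+     pairs = list(combinations(labels_by_prompt, 2))
+     if not pairs:
+         return 1.0
+     agree = [sum(x == y for x, y in zip(a, b)) / len(a) for a, b in pairs]
+     return sum(agree) / len(agree)
+
+ # labels assigned to the same 5 texts by 3 paraphrased annotation prompts
+ runs = [
+     ["pos", "neg", "pos", "neg", "pos"],
+     ["pos", "neg", "pos", "pos", "pos"],
+     ["pos", "neg", "pos", "neg", "pos"],
+ ]
+ print(prompt_stability_score(runs))
+ ```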
+
+ comment: 33 pages, 4 figures +
+
+
+
+
+ + ☆ Breaking Bias, Building Bridges: Evaluation and Mitigation of Social + Biases in LLMs via Contact Hypothesis + + +
+ Large Language Models (LLMs) perpetuate social biases, reflecting prejudices in their training data and reinforcing societal stereotypes and inequalities. Our work explores the potential of the Contact Hypothesis, a concept from social psychology, for debiasing LLMs. We simulate various forms of social contact through LLM prompting to measure their influence on the model's biases, mirroring how intergroup interactions can reduce prejudices in social contexts. We create a dataset of 108,000 prompts following a principled approach replicating social contact to measure biases in three LLMs (LLaMA 2, Tulu, and NousHermes) across 13 social bias dimensions. We propose a unique debiasing technique, Social Contact Debiasing (SCD), that instruction-tunes these models with unbiased responses to prompts. Our research demonstrates that LLM responses exhibit social biases when subjected to contact probing, but more importantly, these biases can be significantly reduced, by up to 40%, with one epoch of instruction tuning of LLaMA 2 following our SCD strategy. Our code and data are available at https://github.com/chahatraj/breakingbias.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Why does in-context learning fail sometimes? Evaluating in-context + learning on open and closed questions + + +
+ We measure the performance of in-context learning as a function of task novelty and difficulty for open and closed questions. For that purpose, we created a novel benchmark consisting of hard scientific questions, each paired with contexts of varying relevance. We show that, counter-intuitively, a context that is more aligned with the topic does not always help more than a less relevant context. This effect is especially visible for open questions and questions of high difficulty or novelty. This result reveals a fundamental difference between the treatment of closed-form and open-form questions by large language models and shows a need for a more robust evaluation of in-context learning across different types of questions. It also poses a new question of how to optimally select a context for large language models, especially in the context of Retrieval Augmented Generation (RAG) systems. Our results suggest that the answer to this question can be highly application-dependent and might be contingent on factors including the format of the question, the perceived difficulty level of the questions, and the novelty or popularity of the information we seek.
+
+
+ comment: 8 pages plus references, 4 main figures, 6 pages of supplementary + material +
+
+
+
+
+ + ☆ An End-to-End Speech Summarization Using Large Language Model + + +
+ Abstractive Speech Summarization (SSum) aims to generate human-like text summaries from spoken content. It encounters difficulties in handling long speech input and capturing the intricate cross-modal mapping between long speech inputs and short text summaries. Research on large language models (LLMs) and multimodal information fusion has provided new insights for addressing these challenges. In this paper, we propose an end-to-end SSum model that utilizes Q-Former as a connector for the audio-text modality and employs LLMs to generate text summaries directly from speech features. We adopt a multi-stage training approach that includes LLM-based ASR and Text Summarization (TSum) tasks as auxiliary tasks. ASR tasks are used to align feature spaces and enhance the LLM's ability to handle longer speech. Then, we utilize a curriculum learning strategy to facilitate the model's transition from TSum to SSum. Finally, our model achieves competitive performance on the How-2 dataset.
+
+
+ comment: InterSpeech 2024 +
+
+
+
+
+ + ☆ Simple Augmentations of Logical Rules for Neuro-Symbolic Knowledge Graph + Completion ACL 2023 + + +
+ High-quality and high-coverage rule sets are imperative to the success of +Neuro-Symbolic Knowledge Graph Completion (NS-KGC) models, because they form +the basis of all symbolic inferences. Recent literature builds neural models +for generating rule sets, however, preliminary experiments show that they +struggle with maintaining high coverage. In this work, we suggest three simple +augmentations to existing rule sets: (1) transforming rules to their abductive +forms, (2) generating equivalent rules that use inverse forms of constituent +relations and (3) random walks that propose new rules. Finally, we prune +potentially low quality rules. Experiments over four datasets and five +ruleset-baseline settings suggest that these simple augmentations consistently +improve results, and obtain up to 7.1 pt MRR and 8.5 pt Hits@1 gains over using +rules without augmentations. + +
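+ Two of the augmentations above (abductive forms and inverse-relation rewrites) are simple syntactic transformations of Horn rules; the sketch below is an illustrative rendering under assumed rule and relation representations, not the paper's code.
+
+ ```python
+ from dataclasses import dataclass
+
+ @dataclass(frozen=True)
+ class Rule:
+     body: tuple  # e.g. (("born_in", "X", "Y"), ("city_of", "Y", "Z"))
+     head: tuple  # e.g. ("nationality", "X", "Z")
+
+ def abductive_forms(rule):
+     """Swap the head with each body atom: infer a body atom from the head plus
+     the remaining body atoms."""
+     forms = []
+     for i, atom in enumerate(rule.body):
+         rest = rule.body[:i] + rule.body[i + 1:]
+         forms.append(Rule(body=rest + (rule.head,), head=atom))
+     return forms
+
+ def with_inverse_relations(rule):
+     """Equivalent rule stated with inverse relations: r(X, Y) -> r_inv(Y, X)."""
+     inv = lambda a: (a[0] + "_inv", a[2], a[1])
+     return Rule(body=tuple(inv(a) for a in rule.body), head=inv(rule.head))
+
+ r = Rule(body=(("born_in", "X", "Y"), ("city_of", "Y", "Z")), head=("nationality", "X", "Z"))
+ print(abductive_forms(r))
+ print(with_inverse_relations(r))
+ ```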
+
+ comment: 12 pages, 15 tables Published in ACL 2023 +
+
+
+
+
+ + ☆ Is Your Large Language Model Knowledgeable or a Choices-Only Cheater? ACL 2024 + + +
+ Recent work shows that large language models (LLMs) can answer multiple-choice questions using only the choices, but does this mean that MCQA leaderboard rankings of LLMs are largely influenced by abilities in choices-only settings? To answer this, we use a contrast set that probes if LLMs over-rely on choices-only shortcuts in MCQA. While previous works build contrast sets via expensive human annotations or model-generated data which can be biased, we employ graph mining to extract contrast sets from existing MCQA datasets. We use our method on UnifiedQA, a group of six commonsense reasoning datasets with high choices-only accuracy, to build an 820-question contrast set. After validating our contrast set, we test 12 LLMs, finding that these models do not exhibit reliance on choice-only shortcuts when given both the question and choices. Thus, despite the susceptibility of MCQA to high choices-only accuracy, we argue that LLMs are not obtaining high ranks on MCQA leaderboards just due to their ability to exploit choices-only shortcuts.
+
+
+ comment: KnowledgeLM Workshop @ ACL 2024 +
+
+
+
+
+ + ☆ A Bounding Box is Worth One Token: Interleaving Layout and Text in a + Large Language Model for Document Understanding + + +
+ Recently, many studies have demonstrated that exclusively incorporating OCR-derived text and spatial layouts with large language models (LLMs) can be highly effective for document understanding tasks. However, existing methods that integrate spatial layouts with text have limitations, such as producing overly long text sequences or failing to fully leverage the autoregressive traits of LLMs. In this work, we introduce Interleaving Layout and Text in a Large Language Model (LayTextLLM) for document understanding. In particular, LayTextLLM projects each bounding box to a single embedding and interleaves it with text, efficiently avoiding long sequence issues while leveraging autoregressive traits of LLMs. LayTextLLM not only streamlines the interaction of layout and textual data but also shows enhanced performance in Key Information Extraction (KIE) and Visual Question Answering (VQA). Comprehensive benchmark evaluations reveal significant improvements, with a 27.0% increase on KIE tasks and 24.1% on VQA tasks compared to previous state-of-the-art document understanding MLLMs, as well as a 15.5% improvement over other SOTA OCR-based LLMs on KIE tasks.
+
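+ The core idea above -- one embedding per bounding box, interleaved with the OCR text tokens of that box -- can be sketched in a few lines of PyTorch; the module and tensor names below are assumptions for illustration, not the released LayTextLLM code.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class BoxTextInterleaver(nn.Module):
+     """Project each (x1, y1, x2, y2) box to a single 'layout token' embedding and
+     place it before the token embeddings of its OCR text span."""
+     def __init__(self, hidden_size):
+         super().__init__()
+         self.box_proj = nn.Linear(4, hidden_size)
+
+     def forward(self, boxes, span_token_embeds):
+         # boxes: (num_spans, 4); span_token_embeds: list of (len_i, hidden) tensors
+         box_embeds = self.box_proj(boxes)
+         pieces = []
+         for box_emb, tok_emb in zip(box_embeds, span_token_embeds):
+             pieces.append(box_emb.unsqueeze(0))  # one token per bounding box
+             pieces.append(tok_emb)               # then the span's text tokens
+         return torch.cat(pieces, dim=0)
+
+ layer = BoxTextInterleaver(hidden_size=64)
+ boxes = torch.rand(2, 4)
+ spans = [torch.rand(3, 64), torch.rand(5, 64)]
+ print(layer(boxes, spans).shape)  # torch.Size([10, 64])
+ ```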
+
+
+
+
+ + ☆ AdaCQR: Enhancing Query Reformulation for Conversational Search via + Sparse and Dense Retrieval Alignment + + +
+ Conversational Query Reformulation (CQR) has significantly advanced in addressing the challenges of conversational search, particularly those stemming from the latent user intent and the need for historical context. Recent works have aimed to boost the performance of CQR through alignment. However, they are designed for one specific retrieval system, which potentially results in poor generalization. To overcome this limitation, we present a novel framework, AdaCQR. By aligning reformulation models with both term-based and semantic-based retrieval systems, AdaCQR enhances the generalizability of information-seeking queries across diverse retrieval environments through a dual-phase training strategy. We also develop two effective approaches for acquiring superior labels and diverse input candidates, boosting the efficiency and robustness of the framework. Experimental evaluations on the TopiOCQA and QReCC datasets demonstrate that AdaCQR significantly outperforms existing methods, offering both quantitative and qualitative improvements in conversational query reformulation.
+
+
+
+
+
+ + ☆ Enabling Discriminative Reasoning in Large Language Models for Legal + Judgment Prediction + + +
+ Legal judgment prediction is essential for enhancing judicial efficiency. In +this work, we identify that existing large language models (LLMs) underperform +in this domain due to challenges in understanding case complexities and +distinguishing between similar charges. To adapt LLMs for effective legal +judgment prediction, we introduce the Ask-Discriminate-Predict (ADAPT) +reasoning framework inspired by human judicial reasoning. ADAPT involves +decomposing case facts, discriminating among potential charges, and predicting +the final judgment. We further enhance LLMs through fine-tuning with multi-task +synthetic trajectories to improve legal judgment prediction accuracy and +efficiency under our ADAPT framework. Extensive experiments conducted on two +widely-used datasets demonstrate the superior performance of our framework in +legal judgment prediction, particularly when dealing with complex and confusing +charges. + +
+
+
+
+
+ + ☆ S2D: Sorted Speculative Decoding For More Efficient Deployment of Nested + Large Language Models + + +
+ Deployment of autoregressive large language models (LLMs) is costly, and as +these models increase in size, the associated costs will become even more +considerable. Consequently, different methods have been proposed to accelerate +the token generation process and reduce costs. Speculative decoding (SD) is +among the most promising approaches to speed up the LLM decoding process by +verifying multiple tokens in parallel and using an auxiliary smaller draft +model to generate the possible tokens. In SD, usually, one draft model is used +to serve a specific target model; however, in practice, LLMs are diverse, and +we might need to deal with many target models or more than one target model +simultaneously. In this scenario, it is not clear which draft model should be +used for which target model, and searching among different draft models or +training customized draft models can further increase deployment costs. In this +paper, we first introduce a novel multi-target scenario for the deployment of +draft models for faster inference. Then, we present a novel, more efficient +sorted speculative decoding mechanism that outperforms regular baselines in +multi-target settings. We evaluated our method on Spec-Bench in different +settings, including base models such as Vicuna 7B, 13B, and LLama Chat 70B. Our +results suggest that our draft models perform better than baselines for +multiple target models at the same time. + +
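+ For readers unfamiliar with the mechanism the sorted, multi-target variant builds on, the sketch below shows one round of plain greedy speculative decoding with a single draft/target pair; the two models are replaced by stand-in callables, so this is an illustration of the baseline idea rather than the paper's S2D method.
+
+ ```python
+ def speculative_decode_step(prefix, draft_next, target_next, k=4):
+     """Greedy speculative decoding: the draft proposes k tokens, the target
+     re-scores the same positions, and we keep the longest verified prefix plus
+     one token from the target."""
+     proposal, ctx = [], list(prefix)
+     for _ in range(k):
+         tok = draft_next(ctx)
+         proposal.append(tok)
+         ctx.append(tok)
+     accepted, ctx = [], list(prefix)
+     for tok in proposal:
+         target_tok = target_next(ctx)
+         if target_tok != tok:
+             accepted.append(target_tok)   # correction from the target model
+             return prefix + accepted
+         accepted.append(tok)
+         ctx.append(tok)
+     accepted.append(target_next(ctx))     # bonus token when all k are accepted
+     return prefix + accepted
+
+ # toy "models" over integer tokens: draft repeats the last token, target counts up
+ draft = lambda ctx: ctx[-1]
+ target = lambda ctx: ctx[-1] + 1
+ print(speculative_decode_step([0], draft, target, k=4))
+ ```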
+
+
+
+
+ + ☆ Extracting and Encoding: Leveraging Large Language Models and Medical + Knowledge to Enhance Radiological Text Representation ACL 2024 + + +
+ Advancing representation learning in specialized fields like medicine remains +challenging due to the scarcity of expert annotations for text and images. To +tackle this issue, we present a novel two-stage framework designed to extract +high-quality factual statements from free-text radiology reports in order to +improve the representations of text encoders and, consequently, their +performance on various downstream tasks. In the first stage, we propose a +\textit{Fact Extractor} that leverages large language models (LLMs) to identify +factual statements from well-curated domain-specific datasets. In the second +stage, we introduce a \textit{Fact Encoder} (CXRFE) based on a BERT model +fine-tuned with objective functions designed to improve its representations +using the extracted factual data. Our framework also includes a new +embedding-based metric (CXRFEScore) for evaluating chest X-ray text generation +systems, leveraging both stages of our approach. Extensive evaluations show +that our fact extractor and encoder outperform current state-of-the-art methods +in tasks such as sentence ranking, natural language inference, and label +extraction from radiology reports. Additionally, our metric proves to be more +robust and effective than existing metrics commonly used in the radiology +report generation literature. The code of this project is available at +\url{https://github.com/PabloMessina/CXR-Fact-Encoder}. + +
+
+ comment: Accepted to ACL 2024 (Findings) +
+
+
+
+
+ + ☆ Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and + Aleatoric Awareness + + +
+ The ability to acknowledge the inevitable uncertainty in their knowledge and reasoning is a prerequisite for AI systems to be truly truthful and reliable. In this paper, we present a taxonomy of uncertainty specific to vision-language AI systems, distinguishing between epistemic uncertainty (arising from a lack of information) and aleatoric uncertainty (due to inherent unpredictability), and further explore finer categories within. Based on this taxonomy, we synthesize a benchmark dataset, CertainlyUncertain, featuring 178K visual question answering (VQA) samples as contrastive pairs. This is achieved by 1) inpainting images to make previously answerable questions into unanswerable ones; and 2) using image captions to prompt large language models for both answerable and unanswerable questions. Additionally, we introduce a new metric, confidence-weighted accuracy, which is well correlated with both accuracy and calibration error, to address the shortcomings of existing metrics.
+
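+ The abstract names the metric but not its formula, so the sketch below is one plausible instantiation of a confidence-weighted accuracy (an explicit assumption): correct answers contribute their confidence and wrong answers contribute one minus their confidence, rewarding models that are confident only when right.
+
+ ```python
+ def confidence_weighted_accuracy(confidences, correct):
+     """Assumed formulation (the paper's exact definition is not given in the
+     abstract): average of confidence on correct answers and (1 - confidence)
+     on wrong answers."""
+     assert len(confidences) == len(correct) and confidences
+     total = sum(c if ok else 1.0 - c for c, ok in zip(confidences, correct))
+     return total / len(confidences)
+
+ # a calibrated low-confidence answer on an unanswerable question scores better
+ # than a confident wrong guess
+ print(confidence_weighted_accuracy([0.9, 0.2, 0.8], [True, False, True]))
+ ```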
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Efficient-Empathy: Towards Efficient and Effective Selection of Empathy + Data + + +
+ In recent years, with the rapid advancements in large language models (LLMs), achieving excellent empathetic response capability has become a crucial prerequisite. Consequently, managing and understanding large-scale dialogue datasets has gained increasing importance. However, models are typically trained on empathetic data without any quality selection, leading to inefficient data usage and wasted computational resources. Additionally, using raw data can result in low performance in empathetic dialogues. In this work, we present Efficient-Empathy, a sensibility and rationality score-based data selection algorithm that automatically selects sensibility and rationality data while discarding low-quality data. With only the sensibility data (59% of the full dataset), our trained sensibility model efficiently achieves state-of-the-art (SoTA) performance. Furthermore, with multiple data selection hyperparameters, the sensibility model demonstrates SoTA performance, showcasing the robustness of our method. By integrating sensibility and rationality data with an MoE structure, we achieve even higher performance, demonstrating the effectiveness of our Efficient-Empathy algorithm.
+
+
+
+
+
+ + ☆ What We Talk About When We Talk About LMs: Implicit Paradigm Shifts and + the Ship of Language Models + + +
+ The term Language Models (LMs), as a time-specific collection of models of +interest, is constantly reinvented, with its referents updated much like the +$\textit{Ship of Theseus}$ replaces its parts but remains the same ship in +essence. In this paper, we investigate this $\textit{Ship of Language Models}$ +problem, wherein scientific evolution takes the form of continuous, implicit +retrofits of key existing terms. We seek to initiate a novel perspective of +scientific progress, in addition to the more well-studied emergence of new +terms. To this end, we construct the data infrastructure based on recent NLP +publications. Then, we perform a series of text-based analyses toward a +detailed, quantitative understanding of the use of Language Models as a term of +art. Our work highlights how systems and theories influence each other in +scientific discourse, and we call for attention to the transformation of this +Ship that we all are contributing to. + +
+
+
+
+
+ + ☆ To Forget or Not? Towards Practical Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) trained on extensive corpora inevitably retain +sensitive data, such as personal privacy information and copyrighted material. +Recent advancements in knowledge unlearning involve updating LLM parameters to +erase specific knowledge. However, current unlearning paradigms are mired in +vague forgetting boundaries, often erasing knowledge indiscriminately. In this +work, we introduce KnowUnDo, a benchmark containing copyrighted content and +user privacy domains to evaluate if the unlearning process inadvertently erases +essential knowledge. Our findings indicate that existing unlearning methods +often suffer from excessive unlearning. To address this, we propose a simple +yet effective method, MemFlex, which utilizes gradient information to precisely +target and unlearn sensitive parameters. Experimental results show that MemFlex +is superior to existing methods in both precise knowledge unlearning and +general knowledge retaining of LLMs. Code and dataset will be released at +https://github.com/zjunlp/KnowUnDo. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Investigating the Effects of Large-Scale Pseudo-Stereo Data and + Different Speech Foundation Model on Dialogue Generative Spoken Language + Model + + +
+ Recent efforts in Spoken Dialogue Modeling aim to synthesize spoken dialogue +without the need for direct transcription, thereby preserving the wealth of +non-textual information inherent in speech. However, this approach faces a +challenge when speakers talk simultaneously, requiring stereo dialogue data +with speakers recorded on separate channels, a notably scarce resource. To +address this, we have developed an innovative pipeline capable of transforming +single-channel dialogue data into pseudo-stereo data. This expanded our +training dataset from a mere 2,000 to an impressive 17,600 hours, significantly +enriching the diversity and quality of the training examples available. The +inclusion of this pseudo-stereo data has proven to be effective in improving +the performance of spoken dialogue language models. Additionally, we explored +the use of discrete units of different speech foundation models for spoken +dialogue generation. + +
+
+ comment: submitted to interspeech 2024 +
+
+
+
+
+ + ☆ Pinyin Regularization in Error Correction for Chinese Speech Recognition + with Large Language Models + + +
+ Recent studies have demonstrated the efficacy of large language models (LLMs) in error correction for automatic speech recognition (ASR). However, much of the research focuses on the English language. This paper redirects attention to Chinese. Firstly, we construct a specialized benchmark dataset aimed at error correction for Chinese ASR with 724K hypothesis-transcription pairs, named the Chinese Hypotheses Paradise dataset (ChineseHP), which contains a wide range of scenarios and presents significant challenges. Subsequently, we conduct a preliminary evaluation using the dataset for both direct prompting and fine-tuning of pre-trained LLMs. Furthermore, we propose a straightforward method of Pinyin regularization for prompts, which derives Pinyin transcriptions directly from the text hypotheses. The experimental results reveal that Pinyin regularization consistently enhances the error-correcting ability of LLMs compared with prompts without regularization. The dataset is available on the website.
+
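+ Pinyin regularization amounts to pairing each text hypothesis with its Pinyin transcription inside the error-correction prompt; the sketch below illustrates the idea with the pypinyin package and an assumed prompt template (not the paper's exact wording).
+
+ ```python
+ from pypinyin import lazy_pinyin  # pip install pypinyin
+
+ def build_correction_prompt(hypotheses):
+     """Pair each ASR hypothesis with its Pinyin so the LLM can reason about
+     near-homophone substitution errors."""
+     lines = ["Correct the Chinese ASR transcription. Hypotheses with Pinyin:"]
+     for i, hyp in enumerate(hypotheses, 1):
+         lines.append(f"{i}. {hyp}  (pinyin: {' '.join(lazy_pinyin(hyp))})")
+     lines.append("Best corrected transcription:")
+     return "\n".join(lines)
+
+ print(build_correction_prompt(["今天天汽真好", "今天天气真好"]))
+ ```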
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ☆ Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for + Sparse Architectural Large Language Models + + +
+ Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large Language Models (LLMs) with constrained resources. Although there have been various PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture LLMs is still underexplored. In this work, we study PEFT methods for LLMs with the Mixture-of-Experts (MoE) architecture, and the contributions of this work are mainly threefold: (1) We investigate the dispersion degree of the activated experts in customized tasks, and find that the routing distribution for a specific task tends to be highly concentrated, while the distribution of activated experts varies significantly across different tasks. (2) We propose Expert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant to downstream tasks while freezing the other experts and modules; experimental results demonstrate that our method not only improves the tuning efficiency, but also matches or even surpasses the performance of full-parameter fine-tuning. (3) We further analyze the impact of the MoE architecture on expert-specialized fine-tuning. We find that MoE models with finer-grained experts are more advantageous in selecting the combination of experts that are most relevant to downstream tasks, thereby enhancing both the training efficiency and effectiveness.
+
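+ Expert-specialized fine-tuning keeps gradients only for the experts the router concentrates on for a given task; the following PyTorch-style sketch (assumed module layout, not the authors' implementation) ranks experts by routing frequency on task data and freezes the rest.
+
+ ```python
+ import torch
+
+ def select_and_freeze_experts(expert_modules, routing_counts, keep_ratio=0.25):
+     """Keep gradients for the most-activated experts on the downstream task and
+     freeze all other experts."""
+     num_keep = max(1, int(len(expert_modules) * keep_ratio))
+     ranked = sorted(range(len(expert_modules)), key=lambda i: -routing_counts[i])
+     keep = set(ranked[:num_keep])
+     for i, expert in enumerate(expert_modules):
+         for p in expert.parameters():
+             p.requires_grad = i in keep
+     return keep
+
+ # toy example: 8 experts with activation counts gathered on task data
+ experts = [torch.nn.Linear(16, 16) for _ in range(8)]
+ counts = [520, 12, 3, 480, 9, 15, 2, 7]
+ print(select_and_freeze_experts(experts, counts))  # {0, 3}: the task-relevant experts
+ ```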
+
+
+
+
+ + ☆ SoP: Unlock the Power of Social Facilitation for Automatic Jailbreak + Attack + + +
+ The widespread applications of large language models (LLMs) have brought +about concerns regarding their potential misuse. Although aligned with human +preference data before release, LLMs remain vulnerable to various malicious +attacks. In this paper, we adopt a red-teaming strategy to enhance LLM safety +and introduce SoP, a simple yet effective framework to design jailbreak prompts +automatically. Inspired by the social facilitation concept, SoP generates and +optimizes multiple jailbreak characters to bypass the guardrails of the target +LLM. Different from previous work which relies on proprietary LLMs or seed +jailbreak templates crafted by human expertise, SoP can generate and optimize +the jailbreak prompt in a cold-start scenario using open-sourced LLMs without +any seed jailbreak templates. Experimental results show that SoP achieves +attack success rates of 88% and 60% in bypassing the safety alignment of +GPT-3.5-1106 and GPT-4, respectively. Furthermore, we extensively evaluate the +transferability of the generated templates across different LLMs and held-out +malicious requests, while also exploring defense strategies against the +jailbreak attack designed by SoP. Code is available at +https://github.com/Yang-Yan-Yang-Yan/SoP. + +
+
+
+
+
+ + ☆ Scope-enhanced Compositional Semantic Parsing for DRT + + +
+ Discourse Representation Theory (DRT) distinguishes itself from other +semantic representation frameworks by its ability to model complex semantic and +discourse phenomena through structural nesting and variable binding. While +seq2seq models hold the state of the art on DRT parsing, their accuracy +degrades with the complexity of the sentence, and they sometimes struggle to +produce well-formed DRT representations. We introduce the AMS parser, a +compositional, neurosymbolic semantic parser for DRT. It rests on a novel +mechanism for predicting quantifier scope. We show that the AMS parser reliably +produces well-formed outputs and performs well on DRT parsing, especially on +complex sentences. + +
+
+
+
+
+ + ☆ Proposal Report for the 2nd SciCAP Competition 2024 + + +
+ In this paper, we propose a method for document summarization using auxiliary information. This approach effectively summarizes descriptions related to specific images, tables, and appendices within lengthy texts. Our experiments demonstrate that leveraging high-quality OCR data and information initially extracted from the original text enables efficient summarization of the content related to the described objects. Based on these findings, we enhanced popular text generation models by incorporating additional auxiliary branches to improve summarization performance. Our method achieved top scores of 4.33 and 4.66 in the long caption and short caption tracks, respectively, of the 2024 SciCAP competition, ranking highest in both categories.
+
+
+
+
+
+ + ☆ LogEval: A Comprehensive Benchmark Suite for Large Language Models In + Log Analysis + + +
+ Log analysis is crucial for ensuring the orderly and stable operation of information systems, particularly in the field of Artificial Intelligence for IT Operations (AIOps). Large Language Models (LLMs) have demonstrated significant potential in natural language processing tasks. In the AIOps domain, they excel in tasks such as anomaly detection, root cause analysis of faults, operations and maintenance script generation, and alert information summarization. However, the performance of current LLMs in log analysis tasks remains inadequately validated. To address this gap, we introduce LogEval, a comprehensive benchmark suite designed to evaluate the capabilities of LLMs in various log analysis tasks for the first time. This benchmark covers tasks such as log parsing, log anomaly detection, log fault diagnosis, and log summarization. LogEval evaluates each task using 4,000 publicly available log data entries and employs 15 different prompts for each task to ensure a thorough and fair assessment. By rigorously evaluating leading LLMs, we demonstrate the impact of various LLM technologies on log analysis performance, focusing on aspects such as self-consistency and few-shot contextual learning. We also discuss findings related to model quantification, Chinese-English question-answering evaluation, and prompt engineering. These findings provide insights into the strengths and weaknesses of LLMs in multilingual environments and the effectiveness of different prompt strategies. Various evaluation methods are employed for different tasks to accurately measure the performance of LLMs in log analysis, ensuring a comprehensive assessment. The insights gained from LogEval's evaluation reveal the strengths and limitations of LLMs in log analysis tasks, providing valuable guidance for researchers and practitioners.
+
+
+
+
+
+ + ☆ GRASP: A Grid-Based Benchmark for Evaluating Commonsense Spatial + Reasoning + + +
+ Spatial reasoning, an important faculty of human cognition with many +practical applications, is one of the core commonsense skills that is not +purely language-based and, for satisfying (as opposed to optimal) solutions, +requires some minimum degree of planning. Existing benchmarks of Commonsense +Spatial Reasoning (CSR) tend to evaluate how Large Language Models (LLMs) +interpret text-based spatial descriptions rather than directly evaluate a plan +produced by the LLM in response to a spatial reasoning scenario. In this paper, +we construct a large-scale benchmark called $\textbf{GRASP}$, which consists of +16,000 grid-based environments where the agent is tasked with an energy +collection problem. These environments include 100 grid instances instantiated +using each of the 160 different grid settings, involving five different energy +distributions, two modes of agent starting position, and two distinct obstacle +configurations, as well as three kinds of agent constraints. Using GRASP, we +compare classic baseline approaches, such as random walk and greedy search +methods, with advanced LLMs like GPT-3.5-Turbo and GPT-4o. The experimental +results indicate that even these advanced LLMs struggle to consistently achieve +satisfactory solutions. + +
+
+
+
+
+ + ☆ Beyond Numeric Awards: In-Context Dueling Bandits with LLM Agents + + +
+ In-context decision-making is an important capability of artificial general +intelligence, which Large Language Models (LLMs) have effectively demonstrated +in various scenarios. However, LLMs often face challenges when dealing with +numerical contexts, and limited attention has been paid to evaluating their +performance through preference feedback generated by the environment. This +paper investigates the performance of LLMs as decision-makers in the context of +Dueling Bandits (DB). We first evaluate the performance of LLMs by comparing +GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo against established DB algorithms. Our +results reveal that LLMs, particularly GPT-4 Turbo, quickly identify the +Condorcet winner, thus outperforming existing state-of-the-art algorithms in +terms of weak regret. Nevertheless, LLMs struggle to converge even when +explicitly prompted to do so, and are sensitive to prompt variations. To +overcome these issues, we introduce an LLM-augmented algorithm, IF-Enhanced +LLM, which takes advantage of both in-context decision-making capabilities of +LLMs and theoretical guarantees inherited from classic DB algorithms. The +design of such an algorithm sheds light on how to enhance trustworthiness for +LLMs used in decision-making tasks where performance robustness matters. We +show that IF-Enhanced LLM has theoretical guarantees on both weak and strong +regret. Our experimental results validate that IF-Enhanced LLM is robust even +with noisy and adversarial prompts. + +
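+ Weak regret, the quantity on which the LLM agents above outperform classic algorithms, is conventionally measured against the Condorcet winner; the sketch below computes the standard cumulative weak regret from a preference matrix (experimental details in the paper may differ).
+
+ ```python
+ def weak_regret(duels, pref, condorcet_winner):
+     """Cumulative weak regret: at each step the learner is charged the gap
+     between the Condorcet winner and the BETTER of its two chosen arms, where
+     pref[i][j] = P(arm i beats arm j)."""
+     total = 0.0
+     for a, b in duels:
+         total += min(pref[condorcet_winner][a] - 0.5,
+                      pref[condorcet_winner][b] - 0.5)
+     return total
+
+ # 3 arms, arm 0 is the Condorcet winner
+ pref = [[0.5, 0.7, 0.8],
+         [0.3, 0.5, 0.6],
+         [0.2, 0.4, 0.5]]
+ print(weak_regret([(0, 2), (1, 2), (1, 1)], pref, condorcet_winner=0))  # 0.4
+ ```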
+
+
+
+
+ + ☆ Survey on Knowledge Distillation for Large Language Models: Methods, + Evaluation, and Application + + +
+ Large Language Models (LLMs) have showcased exceptional capabilities in various domains, attracting significant interest from both academia and industry. Despite their impressive performance, the substantial size and computational demands of LLMs pose considerable challenges for practical deployment, particularly in environments with limited resources. The endeavor to compress language models while maintaining their accuracy has become a focal point of research. Among the various methods, knowledge distillation has emerged as an effective technique to enhance inference speed without greatly compromising performance. This paper presents a thorough survey from three aspects: method, evaluation, and application, exploring knowledge distillation techniques tailored specifically for LLMs. Specifically, we divide the methods into white-box KD and black-box KD to better illustrate their differences. Furthermore, we also explore the evaluation tasks and distillation effects of different distillation methods, and propose directions for future research. Through an in-depth understanding of the latest advancements and practical applications, this survey provides valuable resources for researchers, paving the way for sustained progress in this field.
+
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Compare without Despair: Reliable Preference Evaluation with Generation + Separability + + +
+ Human evaluation of generated language through pairwise preference judgments +is pervasive. However, under common scenarios, such as when generations from a +model pair are very similar, or when stochastic decoding results in large +variations in generations, it results in inconsistent preference ratings. We +address these challenges by introducing a meta-evaluation measure, +separability, which estimates how suitable a test instance is for pairwise +preference evaluation. For a candidate test instance, separability samples +multiple generations from a pair of models, and measures how distinguishable +the two sets of generations are. Our experiments show that instances with high +separability values yield more consistent preference ratings from both human- +and auto-raters. Further, the distribution of separability allows insights into +which test benchmarks are more valuable for comparing models. Finally, we +incorporate separability into ELO ratings, accounting for how suitable each +test instance might be for reliably ranking LLMs. Overall, separability has +implications for consistent, efficient and robust preference evaluation of LLMs +with both human- and auto-raters. + +
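+ Separability asks whether generations from two models on the same test instance can be told apart; as a rough stand-in for the paper's estimator, the sketch below compares within-model and cross-model pairwise similarity over sampled generations (the similarity function and the exact aggregation are assumptions).
+
+ ```python
+ from itertools import combinations, product
+
+ def separability(gens_a, gens_b, sim):
+     """Higher when cross-model pairs are less similar than within-model pairs,
+     i.e. when the instance cleanly distinguishes the two models."""
+     cross = [sim(x, y) for x, y in product(gens_a, gens_b)]
+     within = [sim(x, y) for g in (gens_a, gens_b) for x, y in combinations(g, 2)]
+     avg = lambda xs: sum(xs) / len(xs) if xs else 0.0
+     return avg(within) - avg(cross)
+
+ def jaccard(a, b):  # toy word-overlap similarity
+     sa, sb = set(a.split()), set(b.split())
+     return len(sa & sb) / len(sa | sb) if sa | sb else 1.0
+
+ model_a = ["the cat sat on the mat", "a cat sat on a mat"]
+ model_b = ["dogs love playing fetch outside", "the dog plays fetch outside"]
+ print(separability(model_a, model_b, jaccard))
+ ```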
+
+
+
+
+ + ☆ Automated Text Scoring in the Age of Generative AI for the GPU-poor + + +
+ Current research on generative language models (GLMs) for automated text +scoring (ATS) has focused almost exclusively on querying proprietary models via +Application Programming Interfaces (APIs). Yet such practices raise issues +around transparency and security, and these methods offer little in the way of +efficiency or customizability. With the recent proliferation of smaller, +open-source models, there is the option to explore GLMs with computers equipped +with modest, consumer-grade hardware, that is, for the "GPU poor." In this +study, we analyze the performance and efficiency of open-source, small-scale +GLMs for ATS. Results show that GLMs can be fine-tuned to achieve adequate, +though not state-of-the-art, performance. In addition to ATS, we take small +steps towards analyzing models' capacity for generating feedback by prompting +GLMs to explain their scores. Model-generated feedback shows promise, but +requires more rigorous evaluation focused on targeted use cases. + +
+
+ comment: 21 pages, 1 figure +
+
+
+
+
+ + ☆ VSP: Assessing the dual challenges of perception and reasoning in + spatial planning tasks for VLMs + + +
+ Vision language models (VLMs) are an exciting emerging class of language models (LMs) that have merged classic LM capabilities with those of image processing systems. However, the ways that these capabilities combine are not always intuitive and warrant direct investigation. One understudied capability in VLMs is visual spatial planning -- the ability to comprehend the spatial arrangements of objects and devise action plans to achieve desired outcomes in visual scenes. In our study, we introduce VSP, a benchmark that 1) evaluates the spatial planning capability in these models in general, and 2) breaks down the visual planning task into finer-grained sub-tasks, including perception and reasoning, and measures the models' capabilities in these sub-tasks. Our evaluation shows that both open-source and private VLMs fail to generate effective plans for even simple spatial planning tasks. Evaluations on the fine-grained analytical tasks further reveal fundamental deficiencies in the models' visual perception and bottlenecks in reasoning abilities, explaining their poor performance on the general spatial planning tasks. Our work illuminates future directions for improving VLMs' abilities in spatial planning. Our benchmark is publicly available at https://github.com/UCSB-NLP-Chang/Visual-Spatial-Planning.
+
+
+
+
+
+ + ☆ LLM-Select: Feature Selection with Large Language Models + + +
+ In this paper, we demonstrate a surprising capability of large language +models (LLMs): given only input feature names and a description of a prediction +task, they are capable of selecting the most predictive features, with +performance rivaling the standard tools of data science. Remarkably, these +models exhibit this capacity across various query mechanisms. For example, we +zero-shot prompt an LLM to output a numerical importance score for a feature +(e.g., "blood pressure") in predicting an outcome of interest (e.g., "heart +failure"), with no additional context. In particular, we find that the latest +models, such as GPT-4, can consistently identify the most predictive features +regardless of the query mechanism and across various prompting strategies. We +illustrate these findings through extensive experiments on real-world data, +where we show that LLM-based feature selection consistently achieves strong +performance competitive with data-driven methods such as the LASSO, despite +never having looked at the downstream training data. Our findings suggest that +LLMs may be useful not only for selecting the best features for training but +also for deciding which features to collect in the first place. This could +potentially benefit practitioners in domains like healthcare, where collecting +high-quality data comes at a high cost. + +
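+ The zero-shot query mechanism described above is easy to picture in code: ask the model for a numerical importance score per feature and keep the top-k. The sketch below stubs out the LLM call and uses an illustrative prompt, so it is a sketch of the querying pattern rather than the paper's exact protocol.
+
+ ```python
+ def rank_features_with_llm(feature_names, task_description, ask_llm, top_k=3):
+     """Zero-shot prompt an LLM for a 0-1 importance score per feature, then
+     keep the top-k features. `ask_llm` stands in for a chat-completion call."""
+     scores = {}
+     for name in feature_names:
+         prompt = (f"Task: {task_description}\n"
+                   f"On a scale from 0 to 1, how important is the feature '{name}' "
+                   f"for this prediction task? Answer with a single number.")
+         scores[name] = float(ask_llm(prompt).strip())
+     return sorted(scores, key=scores.get, reverse=True)[:top_k]
+
+ # toy stand-in for the LLM: pretend blood pressure and age matter most
+ fake_llm = lambda p: {"blood pressure": "0.9", "age": "0.8"}.get(p.split("'")[1], "0.2")
+ features = ["blood pressure", "age", "favorite color", "zip code"]
+ print(rank_features_with_llm(features, "predict heart failure", fake_llm, top_k=2))
+ ```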
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Reasoning in Large Language Models: A Geometric Perspective + + +
+ The advancement of large language models (LLMs) for real-world applications +hinges critically on enhancing their reasoning capabilities. In this work, we +explore the reasoning abilities of large language models (LLMs) through their +geometrical understanding. We establish a connection between the expressive +power of LLMs and the density of their self-attention graphs. Our analysis +demonstrates that the density of these graphs defines the intrinsic dimension +of the inputs to the MLP blocks. We demonstrate through theoretical analysis +and toy examples that a higher intrinsic dimension implies a greater expressive +capacity of the LLM. We further provide empirical evidence linking this +geometric framework to recent advancements in methods aimed at enhancing the +reasoning capabilities of LLMs. + +
+
+
+
+
+ + ☆ Supporters and Skeptics: LLM-based Analysis of Engagement with Mental + Health (Mis)Information Content on Video-sharing Platforms + + +
+ Over one in five adults in the US lives with a mental illness. In the face of a shortage of mental health professionals and offline resources, online short-form video content has grown to serve as a crucial conduit for disseminating mental health help and resources. However, the ease of content creation and access also contributes to the spread of misinformation, posing risks to accurate diagnosis and treatment. Detecting and understanding engagement with such content is crucial to mitigating their harmful effects on public health. We perform the first quantitative study of the phenomenon using YouTube Shorts and BitChute as the sites of study. We contribute MentalMisinfo, a novel labeled mental health misinformation (MHMisinfo) dataset of 739 videos (639 from YouTube and 100 from BitChute) and 135,372 comments in total, using an expert-driven annotation schema. We first found that few-shot in-context learning with large language models (LLMs) is effective in detecting MHMisinfo videos. Next, we discover distinct and potentially alarming linguistic patterns in how audiences engage with MHMisinfo videos through commentary on both video-sharing platforms. Across the two platforms, comments could exacerbate prevailing stigma, with some groups showing heightened susceptibility to and alignment with MHMisinfo. We discuss technical and public health-driven adaptive solutions to tackling the "epidemic" of mental health misinformation online.
+
+
+ comment: 12 pages, in submission to ICWSM +
+
+
+
+
+ + ☆ Ensuring Responsible Sourcing of Large Language Model Training Data + Through Knowledge Graph Comparison + + +
+ In light of recent plagiarism allegations brought by publishers, newspapers, and other creators of copyrighted corpora against large language model (LLM) developers, we propose a novel system, a variant of a plagiarism detection system, that assesses whether a knowledge source has been used in the training or fine-tuning of a large language model. Unlike current methods, we utilize an approach that uses Resource Description Framework (RDF) triples to create knowledge graphs from both a source document and an LLM continuation of that document. These graphs are then analyzed with respect to content using cosine similarity and with respect to structure using a normalized version of graph edit distance that shows the degree of isomorphism. Unlike traditional systems that focus on content matching and keyword identification between a source and target corpus, our approach enables a broader evaluation of similarity, and thus a more accurate comparison between a source document and an LLM continuation, by focusing on relationships between ideas and their organization with regard to one another. Additionally, our approach does not require access to LLM metrics like perplexity that may be unavailable in closed, "black-box" large language modeling systems, nor does it require access to the training corpus. A prototype of our system can be found in a hyperlinked GitHub repository.
+
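+ The comparison pipeline above (RDF triples -> knowledge graphs -> cosine similarity for content, normalized graph edit distance for structure) can be prototyped with networkx; the sketch below is an illustration under an assumed normalization and a bag-of-words content representation, not the proposed system itself.
+
+ ```python
+ import math
+ from collections import Counter
+ import networkx as nx
+
+ def triples_to_graph(triples):
+     """Directed graph from (subject, predicate, object) triples."""
+     g = nx.DiGraph()
+     for s, p, o in triples:
+         g.add_edge(s, o, predicate=p)
+     return g
+
+ def content_similarity(triples_a, triples_b):
+     """Cosine similarity over a bag-of-words of the flattened triples."""
+     bow = lambda ts: Counter(w for t in ts for w in t)
+     a, b = bow(triples_a), bow(triples_b)
+     dot = sum(a[w] * b[w] for w in a)
+     norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
+     return dot / norm if norm else 0.0
+
+ def structural_similarity(ga, gb):
+     """Graph edit distance normalized by total graph size (one possible choice)."""
+     ged = nx.graph_edit_distance(ga, gb, timeout=5)
+     denom = ga.number_of_nodes() + ga.number_of_edges() + gb.number_of_nodes() + gb.number_of_edges()
+     return 1.0 - (ged / denom if denom else 0.0)
+
+ src = [("author", "wrote", "novel"), ("novel", "describes", "war")]
+ cont = [("author", "wrote", "novel"), ("novel", "describes", "peace")]
+ print(content_similarity(src, cont),
+       structural_similarity(triples_to_graph(src), triples_to_graph(cont)))
+ ```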
+
+
+
+
+ + ☆ A Practical Review of Mechanistic Interpretability for Transformer-Based + Language Models + + +
+ Mechanistic interpretability (MI) is an emerging sub-field of +interpretability that seeks to understand a neural network model by +reverse-engineering its internal computations. Recently, MI has garnered +significant attention for interpreting transformer-based language models (LMs), +resulting in many novel insights yet introducing new challenges. However, there +has not been work that comprehensively reviews these insights and challenges, +particularly as a guide for newcomers to this field. To fill this gap, we +present a comprehensive survey outlining fundamental objects of study in MI, +techniques that have been used for its investigation, approaches for evaluating +MI results, and significant findings and applications stemming from the use of +MI to understand LMs. In particular, we present a roadmap for beginners to +navigate the field and leverage MI for their benefit. Finally, we also identify +current gaps in the field and discuss potential future directions. + +
+
+ comment: 11 pages, 11 figures, Preprint +
+
+
+
+
+ + ☆ Change My Frame: Reframing in the Wild in r/ChangeMyView NAACL 2024 + + +
+ Recent work in reframing, within the scope of text style transfer, has so far made use of out-of-context, task-prompted utterances in order to produce neutralizing or optimistic reframes. Our work aims to generalize reframing based on the subreddit r/ChangeMyView (CMV). We build a dataset that leverages the CMV community's interactions and conventions to identify high-value, community-recognized utterances that produce changes of perspective. With this data, we widen the scope of reframing directions, since changes in perspective do not occur only in neutral or positive directions. We fine-tune transformer-based models, make use of a modern LLM to refine our dataset, and explore challenges in dataset creation and evaluation around this type of reframing.
+
+
+ comment: 3 pages, NAACL 2024 workshop +
+
+
+
+
+ + ☆ Nollywood: Let's Go to the Movies! + + +
+ Nollywood, based on the idea of Bollywood from India, is a series of outstanding movies that originate from Nigeria. Unfortunately, while the movies are in English, they are hard to understand for many native speakers due to the dialect of English that is spoken. In this article, we accomplish two goals: (1) create a phonetic sub-title model that is able to translate Nigerian English speech to American English and (2) use the most advanced toxicity detectors to discover how toxic the speech is. Our aim is to highlight the text in these videos, which is often ignored for lack of dialectal understanding because many people in Nigeria speak a native language like Hausa at home.
+
+
+ comment: 8 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Uplifting Lower-Income Data: Strategies for Socioeconomic Perspective + Shifts in Vision-Language Models + + +
+ To address this issue, we formulate translated non-English, geographic, and +socioeconomic integrated prompts and evaluate their impact on VL model +performance for data from different countries and income groups. Our findings +show that geographic and socioeconomic integrated prompts improve VL +performance on lower-income data and favor the retrieval of topic appearances +commonly found in data from low-income households. From our analyses, we +identify and highlight contexts where these strategies yield the most +improvements. Our model analysis code is publicly available at +https://github.com/Anniejoan/Uplifting-Lower-income-data . + +
+
+
+
+
+ + ☆ D-Rax: Domain-specific Radiologic assistant leveraging multi-modal data + and eXpert model predictions + + +
+ Large vision language models (VLMs) have progressed incredibly from research to applicability for general-purpose use cases. LLaVA-Med, a pioneering large language and vision assistant for biomedicine, can perform multi-modal biomedical image and data analysis to provide a natural language interface for radiologists. While it is highly generalizable and works with multi-modal data, it is currently limited by well-known challenges that exist in the large language model space. Hallucinations and imprecision in responses can lead to misdiagnosis, which currently hinders the clinical adoption of VLMs. To create precise, user-friendly models in healthcare, we propose D-Rax -- a domain-specific, conversational, radiologic assistance tool that can be used to gain insights about a particular radiologic image. In this study, we enhance the conversational analysis of chest X-ray (CXR) images to support radiological reporting, offering comprehensive insights from medical imaging and aiding in the formulation of accurate diagnoses. D-Rax is achieved by fine-tuning the LLaVA-Med architecture on our curated, enhanced instruction-following data, comprising images, instructions, and disease diagnosis and demographic predictions derived from MIMIC-CXR imaging data, CXR-related visual question answer (VQA) pairs, and predictive outcomes from multiple expert AI models. We observe statistically significant improvement in responses when evaluated for both open- and closed-ended conversations. Leveraging the power of state-of-the-art diagnostic models combined with VLMs, D-Rax empowers clinicians to interact with medical images using natural language, which could potentially streamline their decision-making process, enhance diagnostic accuracy, and conserve their time.
+
+
+
+
+
+ + ☆ Towards More Realistic Extraction Attacks: An Adversarial Perspective ACL2024 + + +
+ Language models are prone to memorizing large parts of their training data, +making them vulnerable to extraction attacks. Existing research on these +attacks remains limited in scope, often studying isolated trends rather than +the real-world interactions with these models. In this paper, we revisit +extraction attacks from an adversarial perspective, exploiting the brittleness +of language models. We find significant churn in extraction attack trends, +i.e., even minor, unintuitive changes to the prompt, or targeting smaller +models and older checkpoints, can exacerbate the risks of extraction by up to +$2-4 \times$. Moreover, relying solely on the widely accepted verbatim match +underestimates the extent of extracted information, and we provide various +alternatives to more accurately capture the true risks of extraction. We +conclude our discussion with data deduplication, a commonly suggested +mitigation strategy, and find that while it addresses some memorization +concerns, it remains vulnerable to the same escalation of extraction risks +against a real-world adversary. Our findings highlight the necessity of +acknowledging an adversary's true capabilities to avoid underestimating +extraction risks. + +
+
+ comment: To be presented at PrivateNLP@ACL2024 +
+
+
+
+
+ + ☆ RLHF Can Speak Many Languages: Unlocking Multilingual Preference + Optimization for LLMs + + +
+ Preference optimization techniques have become a standard final stage for training state-of-the-art large language models (LLMs). However, despite widespread adoption, the vast majority of work to date has focused on first-class citizen languages like English and Chinese. This not only captures a small fraction of the languages in the world, but also makes it unclear which aspects of current state-of-the-art research transfer to a multilingual setting. In this work, we perform an exhaustive study to achieve a new state-of-the-art in aligning multilingual LLMs. We introduce a novel, scalable method for generating high-quality multilingual feedback data to balance data coverage. We establish the benefits of cross-lingual transfer and increased dataset size in preference training. Our preference-trained model achieves a 54.4% win-rate against Aya 23 8B, the current state-of-the-art multilingual LLM in its parameter class, and a 69.5% win-rate or higher against widely used models like Gemma-1.1-7B-it, Llama-3-8B-Instruct, and Mistral-7B-Instruct-v0.3. As a result of our study, we expand the frontier of alignment techniques to 23 languages covering half of the world's population.
+
+
+
+
+
+ + ♻ ☆ Shall We Team Up: Exploring Spontaneous Cooperation of Competing LLM + Agents + + +
+ Large Language Models (LLMs) have increasingly been utilized in social +simulations, where they are often guided by carefully crafted instructions to +stably exhibit human-like behaviors during simulations. Nevertheless, we doubt +the necessity of shaping agents' behaviors for accurate social simulations. +Instead, this paper emphasizes the importance of spontaneous phenomena, wherein +agents deeply engage in contexts and make adaptive decisions without explicit +directions. We explored spontaneous cooperation across three competitive +scenarios and successfully simulated the gradual emergence of cooperation, +findings that align closely with human behavioral data. This approach not only +aids the computational social science community in bridging the gap between +simulations and real-world dynamics but also offers the AI community a novel +method to assess LLMs' capability of deliberate reasoning. + +
+
+ comment: Source codes available at + https://github.com/wuzengqing001225/SABM_ShallWeTeamUp +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Learning Complex Legal Concepts + through Storytelling ACL 2024 + + +
+ Making legal knowledge accessible to non-experts is crucial for enhancing +general legal literacy and encouraging civic participation in democracy. +However, legal documents are often challenging to understand for people without +legal backgrounds. In this paper, we present a novel application of large +language models (LLMs) in legal education to help non-experts learn intricate +legal concepts through storytelling, an effective pedagogical tool in conveying +complex and abstract concepts. We also introduce a new dataset LegalStories, +which consists of 294 complex legal doctrines, each accompanied by a story and +a set of multiple-choice questions generated by LLMs. To construct the dataset, +we experiment with various LLMs to generate legal stories explaining these +concepts. Furthermore, we use an expert-in-the-loop approach to iteratively +design multiple-choice questions. Then, we evaluate the effectiveness of +storytelling with LLMs through randomized controlled trials (RCTs) with legal +novices on 10 samples from the dataset. We find that LLM-generated stories +enhance comprehension of legal concepts and interest in law among non-native +speakers compared to only definitions. Moreover, stories consistently help +participants relate legal concepts to their lives. Finally, we find that +learning with stories shows a higher retention rate for non-native speakers in +the follow-up assessment. Our work has strong implications for using LLMs in +promoting teaching and learning in the legal field and beyond. + +
+
+ comment: Accepted to ACL 2024 +
+
+
+
+
+ + ♻ ☆ Towards Robust Speech Representation Learning for Thousands of Languages + + +
+ Self-supervised learning (SSL) has helped extend speech technologies to more +languages by reducing the need for labeled data. However, models are still far +from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual +Encoder for Universal Speech, trained on over 1 million hours of data across +4057 languages, extending the language coverage of SSL models 4-fold. We +combine 1 million hours of speech from existing publicly accessible corpora +with a newly created corpus of 7400+ hours from 4057 languages, which will be +publicly released. To handle the diverse conditions of multilingual speech +data, we augment the typical SSL masked prediction approach with a novel +dereverberation objective, increasing robustness. We evaluate XEUS on several +benchmarks, and show that it consistently outperforms or achieves comparable +results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS +sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT +2.0 v2 by 0.8% and 4.4% respectively, despite having fewer parameters or less +pre-training data. Checkpoints, code, and data are available at +https://www.wavlab.org/activities/2024/xeus/. + +
+
+ comment: Updated affiliations; 20 pages +
+
+
+
+
+ + ♻ ☆ Matching domain experts by training from scratch on domain knowledge ICML 2024 + + +
+ Recently, large language models (LLMs) have outperformed human experts in +predicting the results of neuroscience experiments (Luo et al., 2024). What is +the basis for this performance? One possibility is that statistical patterns in +that specific scientific literature, as opposed to emergent reasoning abilities +arising from broader training, underlie LLMs' performance. To evaluate this +possibility, we trained (next word prediction) a relatively small +124M-parameter GPT-2 model on 1.3 billion tokens of domain-specific knowledge. +Despite being orders of magnitude smaller than larger LLMs trained on trillions +of tokens, small models achieved expert-level performance in predicting +neuroscience results. Small models trained on the neuroscience literature +succeeded when they were trained from scratch using a tokenizer specifically +trained on neuroscience text or when the neuroscience literature was used to +finetune a pretrained GPT-2. Our results indicate that expert-level performance +may be attained by even small LLMs through domain-specific, auto-regressive +training approaches. + +
+
+ comment: ICML 2024 (Large Language Models and Cognition) +
+
+
+
+
+ + ♻ ☆ Let Guidelines Guide You: A Prescriptive Guideline-Centered Data + Annotation Methodology + + +
+ We introduce the Guideline-Centered annotation process, a novel data +annotation methodology focused on reporting the annotation guidelines +associated with each data sample. We identify three main limitations of the +standard prescriptive annotation process and describe how the +Guideline-Centered methodology overcomes them by reducing the loss of +information in the annotation process and ensuring adherence to guidelines. +Additionally, we discuss how the Guideline-Centered methodology enables the reuse of +annotated data across multiple tasks at the cost of a single human-annotation +process. + +
+
+
+
+
+ + ♻ ☆ Uncovering Safety Risks of Large Language Models through Concept + Activation Vector + + +
+ Despite careful safety alignment, current large language models (LLMs) remain +vulnerable to various attacks. To further unveil the safety risks of LLMs, we +introduce a Safety Concept Activation Vector (SCAV) framework, which +effectively guides the attacks by accurately interpreting LLMs' safety +mechanisms. We then develop an SCAV-guided attack method that can generate both +attack prompts and embedding-level attacks with automatically selected +perturbation hyperparameters. Both automatic and human evaluations demonstrate +that our attack method significantly improves the attack success rate and +response quality while requiring less training data. Additionally, we find that +our generated attack prompts may be transferable to GPT-4, and the +embedding-level attacks may also be transferred to other white-box LLMs whose +parameters are known. Our experiments further uncover the safety risks present +in current LLMs. For example, we find that six out of seven open-source LLMs +that we attack consistently provide relevant answers to more than 85\% of +malicious instructions. Finally, we provide insights into the safety mechanism +of LLMs. + +
+
+
+
+
+ + ♻ ☆ Observational Scaling Laws and the Predictability of Language Model + Performance + + +
+ Understanding how language model performance varies with scale is critical to +benchmark and algorithm development. Scaling laws are one approach to building +this understanding, but the requirement of training models across many +different scales has limited their use. We propose an alternative, +observational approach that bypasses model training and instead builds scaling +laws from ~80 publicly available models. Building a single scaling law from +multiple model families is challenging due to large variations in their +training compute efficiencies and capabilities. However, we show that these +variations are consistent with a simple, generalized scaling law where language +model performance is a function of a low-dimensional capability space, and +model families only vary in their efficiency in converting training compute to +capabilities. Using this approach, we show the surprising predictability of +complex scaling phenomena: we show that several emergent phenomena follow a +smooth, sigmoidal behavior and are predictable from small models; we show that +the agent performance of models such as GPT-4 can be precisely predicted from +simpler non-agentic benchmarks; and we show how to predict the impact of +post-training interventions like Chain-of-Thought and Self-Consistency as +language model capabilities continue to improve. + +
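+ As a toy illustration of the recipe sketched above, the snippet below derives a one-dimensional capability score for each model from a matrix of benchmark accuracies via PCA and then fits a sigmoid from that score to a downstream metric. The numbers, the single principal component, and the fitting choices are assumptions for illustration only, not the paper's data or exact procedure.
+import numpy as np
+from scipy.optimize import curve_fit
+
+# Rows: models; columns: accuracies on standard benchmarks (invented numbers).
+bench = np.array([
+    [0.31, 0.28, 0.40],
+    [0.45, 0.41, 0.55],
+    [0.58, 0.54, 0.66],
+    [0.72, 0.70, 0.81],
+    [0.83, 0.80, 0.90],
+])
+downstream = np.array([0.05, 0.12, 0.35, 0.70, 0.88])  # "emergent"-looking metric
+
+# Low-dimensional capability measure: first principal component of the benchmarks.
+centered = bench - bench.mean(axis=0)
+_, _, vt = np.linalg.svd(centered, full_matrices=False)
+capability = centered @ vt[0]
+
+def sigmoid(x, a, b):
+    return 1.0 / (1.0 + np.exp(-(a * x + b)))
+
+(a, b), _ = curve_fit(sigmoid, capability, downstream, p0=[1.0, 0.0], maxfev=10000)
+print("fitted downstream curve:", sigmoid(capability, a, b).round(2))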
+
+
+
+
+ + ♻ ☆ Are there identifiable structural parts in the sentence embedding whole? + + +
+ Sentence embeddings from transformer models encode in a fixed length vector +much linguistic information. We explore the hypothesis that these embeddings +consist of overlapping layers of information that can be separated, and on +which specific types of information -- such as information about chunks and +their structural and semantic properties -- can be detected. We show that this +is the case using a dataset consisting of sentences with known chunk structure, +and two linguistic intelligence datasets, solving which relies on detecting +chunks and their grammatical number, and respectively, their semantic roles, +and through analyses of the performance on the tasks and of the internal +representations built during learning. + +
+
+ comment: 17 pages, 14 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ The Missing Piece in Model Editing: A Deep Dive into the Hidden Damage + Brought By Model Editing + + +
+ Large Language Models have revolutionized numerous tasks with their +remarkable efficacy. However, editing these models, crucial for rectifying +outdated or erroneous information, often leads to a complex issue known as the +ripple effect in the hidden space. While difficult to detect, this effect can +significantly impede the efficacy of model editing tasks and deteriorate model +performance. This paper addresses this scientific challenge by proposing a +novel evaluation methodology, Graphical Impact Evaluation(GIE), which +quantitatively evaluates the adaptations of the model and the subsequent impact +of editing. Furthermore, we introduce the Selective Impact Revision(SIR), a +model editing method designed to mitigate this ripple effect. Our comprehensive +evaluations reveal that the ripple effect in the hidden space is a significant +issue in all current model editing methods. However, our proposed methods, GIE +and SIR, effectively identify and alleviate this issue, contributing to the +advancement of LLM editing techniques. + +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Exhibit Cognitive Dissonance? Studying the + Difference Between Revealed Beliefs and Stated Answers + + +
+ Prompting and Multiple Choice Questions (MCQ) have become the preferred +approaches to assess the capabilities of Large Language Models (LLMs), due to +their ease of manipulation and evaluation. Such experimental appraisals have +pointed toward the LLMs' apparent ability to perform causal reasoning or to +grasp uncertainty. In this paper, we investigate whether these abilities are +measurable outside of tailored prompting and MCQ by reformulating these issues +as direct text completion - the foundation of LLMs. To achieve this goal, we +define scenarios with multiple possible outcomes and we compare the prediction +made by the LLM through prompting (their Stated Answer) to the probability +distributions they compute over these outcomes during next token prediction +(their Revealed Belief). Our findings suggest that the Revealed Belief of LLMs +significantly differs from their Stated Answer and hint at multiple biases and +misrepresentations that their beliefs may yield in many scenarios and outcomes. +As text completion is at the core of LLMs, these results suggest that common +evaluation methods may only provide a partial picture and that more research is +needed to assess the extent and nature of their capabilities. + +
+
+
+
+
+ + ♻ ☆ Evaluating Character Understanding of Large Language Models via + Character Profiling from Fictional Works + + +
+ Large language models (LLMs) have demonstrated impressive performance and +spurred numerous AI applications, in which role-playing agents (RPAs) are +particularly popular, especially for fictional characters. The prerequisite for +these RPAs lies in the capability of LLMs to understand characters from +fictional works. Previous efforts have evaluated this capability via basic +classification tasks or characteristic imitation, failing to capture the +nuanced character understanding with LLMs. In this paper, we propose evaluating +LLMs' character understanding capability via the character profiling task, +i.e., summarizing character profiles from corresponding materials, a widely +adopted yet understudied practice for RPA development. Specifically, we +construct the CroSS dataset from literature experts and assess the generated +profiles by comparing ground truth references and their applicability in +downstream tasks. Our experiments, which cover various summarization methods +and LLMs, have yielded promising results. These results strongly validate the +character understanding capability of LLMs. Resources are available at +https://github.com/Joanna0123/character_profiling. + +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning for Edit-Based Non-Autoregressive Neural Machine + Translation NAACL + + +
+ Non-autoregressive (NAR) language models are known for their low latency in +neural machine translation (NMT). However, a performance gap exists between NAR +and autoregressive models due to the large decoding space and difficulty in +capturing dependency between target words accurately. Compounding this, +preparing appropriate training data for NAR models is a non-trivial task, often +exacerbating exposure bias. To address these challenges, we apply reinforcement +learning (RL) to Levenshtein Transformer, a representative edit-based NAR +model, demonstrating that RL with self-generated data can enhance the +performance of edit-based NAR models. We explore two RL approaches: stepwise +reward maximization and episodic reward maximization. We discuss the respective +pros and cons of these two approaches and empirically verify them. Moreover, we +experimentally investigate the impact of temperature setting on performance, +confirming the importance of proper temperature setting for NAR models' +training. + +
+
+ comment: NAACL SRW 2024 +
+
+
+
+
+ + ♻ ☆ Kanbun-LM: Reading and Translating Classical Chinese in Japanese Methods + by Language Models ACL 2023 + + +
+ Recent studies in natural language processing (NLP) have focused on modern +languages and achieved state-of-the-art results in many tasks. Meanwhile, +little attention has been paid to ancient texts and related tasks. Classical +Chinese first came to Japan approximately 2,000 years ago. It was gradually +adapted to a Japanese form called Kanbun-Kundoku (Kanbun) in Japanese reading +and translating methods, which has significantly impacted Japanese literature. +However, compared to the rich resources for ancient texts in mainland China, +Kanbun resources remain scarce in Japan. To solve this problem, we construct +the first Classical-Chinese-to-Kanbun dataset in the world. Furthermore, we +introduce two tasks, character reordering and machine translation, both of +which play a significant role in Kanbun comprehension. We also test the current +language models on these tasks and discuss the best evaluation method by +comparing the results with human scores. We release our code and dataset on +GitHub. + +
+
+ comment: Findings of ACL 2023 +
+
+
+
+
+ + ♻ ☆ LLM-Oracle Machines + + +
+ Contemporary AI applications leverage large language models (LLMs) to harness +their knowledge and reasoning abilities for natural language processing tasks. +This approach shares similarities with the concept of oracle Turing machines +(OTMs). To capture the broader potential of these computations, including those +not yet realized, we propose an extension to OTMs: the LLM-oracle machine +(LLM-OM), by employing a cluster of LLMs as the oracle. Each LLM acts as a +black box, capable of answering queries within its expertise, albeit with a +delay. We introduce four variants of the LLM-OM: basic, augmented, +fault-avoidance, and $\epsilon$-fault. The first two are commonly observed in +existing AI applications. The latter two are specifically designed to address +the challenges of LLM hallucinations, biases, and inconsistencies, aiming to +ensure reliable outcomes. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ ColPali: Efficient Document Retrieval with Vision Language Models + + +
+ Documents are visually rich structures that convey information through text, +as well as tables, figures, page layouts, or fonts. While modern document +retrieval systems exhibit strong performance on query-to-text matching, they +struggle to exploit visual cues efficiently, hindering their performance on +practical document retrieval applications such as Retrieval Augmented +Generation. To benchmark current systems on visually rich document retrieval, +we introduce the Visual Document Retrieval Benchmark ViDoRe, composed of +various page-level retrieving tasks spanning multiple domains, languages, and +settings. The inherent shortcomings of modern systems motivate the introduction +of a new retrieval model architecture, ColPali, which leverages the document +understanding capabilities of recent Vision Language Models to produce +high-quality contextualized embeddings solely from images of document pages. +Combined with a late interaction matching mechanism, ColPali largely +outperforms modern document retrieval pipelines while being drastically faster +and end-to-end trainable. + +
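+ A minimal sketch of the late-interaction matching step mentioned above (ColBERT-style MaxSim): each query-token embedding is matched to its best page-patch embedding and the maxima are summed. The random embeddings and dimensions below are placeholders for what the Vision Language Model would actually produce.
+import numpy as np
+
+def late_interaction_score(query_emb, page_emb):
+    # query_emb: (n_query_tokens, d); page_emb: (n_patches, d); both L2-normalized.
+    sim = query_emb @ page_emb.T           # cosine similarities, (n_tokens, n_patches)
+    return float(sim.max(axis=1).sum())    # MaxSim: best patch per query token, summed
+
+def l2_normalize(x):
+    return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+rng = np.random.default_rng(0)
+query = l2_normalize(rng.normal(size=(8, 128)))        # 8 query-token embeddings
+pages = [l2_normalize(rng.normal(size=(1024, 128)))    # 1024 patch embeddings per page
+         for _ in range(3)]
+
+scores = [late_interaction_score(query, page) for page in pages]
+print("best page:", int(np.argmax(scores)))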
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Open-Source Conversational AI with SpeechBrain 1.0 + + +
+ SpeechBrain is an open-source Conversational AI toolkit based on PyTorch, +focused particularly on speech processing tasks such as speech recognition, +speech enhancement, speaker recognition, text-to-speech, and much more. It +promotes transparency and replicability by releasing both the pre-trained +models and the complete "recipes" of code and algorithms required for training +them. This paper presents SpeechBrain 1.0, a significant milestone in the +evolution of the toolkit, which now has over 200 recipes for speech, audio, and +language processing tasks, and more than 100 models available on Hugging Face. +SpeechBrain 1.0 introduces new technologies to support diverse learning +modalities, Large Language Model (LLM) integration, and advanced decoding +strategies, along with novel models, tasks, and modalities. It also includes a +new benchmark repository, offering researchers a unified platform for +evaluating models across diverse tasks. + +
+
+ comment: Submitted to JMLR (Machine Learning Open Source Software) +
+
+
+
+
+ + ♻ ☆ MM-MATH: Advancing Multimodal Math Evaluation with Process Evaluation + and Fine-grained Classification + + +
+ To advance the evaluation of multimodal math reasoning in large multimodal +models (LMMs), this paper introduces a novel benchmark, MM-MATH. MM-MATH +consists of 5,929 open-ended middle school math problems with visual contexts, +with fine-grained classification across difficulty, grade level, and knowledge +points. Unlike existing benchmarks relying on binary answer comparison, MM-MATH +incorporates both outcome and process evaluations. Process evaluation employs +LMM-as-a-judge to automatically analyze solution steps, identifying and +categorizing errors into specific error types. Extensive evaluation of ten +models on MM-MATH reveals significant challenges for existing LMMs, +highlighting their limited utilization of visual information and struggles with +higher-difficulty problems. The best-performing model achieves only 31% +accuracy on MM-MATH, compared to 82% for humans. This highlights the +challenging nature of our benchmark for existing models and the significant gap +between the multimodal reasoning capabilities of current models and humans. Our +process evaluation reveals that diagram misinterpretation is the most common +error, accounting for more than half of the total error cases, underscoring the +need for improved image comprehension in multimodal reasoning. + +
+
+
+
+
+ + ♻ ☆ HGOT: Hierarchical Graph of Thoughts for Retrieval-Augmented In-Context + Learning in Factuality Evaluation + + +
+ With the widespread adoption of large language models (LLMs) in numerous +applications, the challenge of factuality and the propensity for hallucinations +has emerged as a significant concern. To address this issue, particularly in +retrieval-augmented in-context learning, we introduce the hierarchical graph of +thoughts (HGOT), a structured, multi-layered graph approach designed to enhance +the retrieval of pertinent passages during in-context learning. The framework +utilizes the emergent planning capabilities of LLMs, employing the +divide-and-conquer strategy to break down complex queries into manageable +sub-queries. It refines self-consistency majority voting for answer selection, +which incorporates the recently proposed citation recall and precision metrics +to assess the quality of thoughts, linking an answer's credibility +intrinsically to the thought's quality. This methodology introduces a weighted +system in majority voting, prioritizing answers based on the citation quality +of their thoughts. Additionally, we propose a scoring mechanism for evaluating +retrieved passages, considering factors such as citation frequency and quality, +self-consistency confidence, and the retrieval module's ranking. Experiments +indicate that HGOT excels as a versatile approach, outperforming competing +models in FEVER by up to $7\%$ and matching leading models such as +Retrieve-then-Read in Open-SQuAD, and DSP in HotPotQA, demonstrating its +efficacy in enhancing LLMs' factuality. + +
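+ The weighted-voting step described above can be pictured with a short sketch in which each candidate answer's vote is weighted by a quality score built from citation recall, citation precision, and self-consistency confidence. The 0.4/0.4/0.2 mix and the numbers are assumptions for illustration, not the weighting actually used in HGOT.
+from collections import defaultdict
+
+def weighted_vote(candidates):
+    # candidates: list of (answer, citation_recall, citation_precision, confidence)
+    totals = defaultdict(float)
+    for answer, recall, precision, confidence in candidates:
+        weight = 0.4 * recall + 0.4 * precision + 0.2 * confidence
+        totals[answer] += weight
+    return max(totals.items(), key=lambda kv: kv[1])
+
+candidates = [
+    ("Paris", 0.9, 0.8, 0.7),
+    ("Paris", 0.6, 0.7, 0.6),
+    ("Lyon",  0.3, 0.4, 0.9),
+    ("Lyon",  0.2, 0.3, 0.8),
+    ("Lyon",  0.1, 0.2, 0.7),
+]
+# A plain majority vote would pick "Lyon" (3 votes); quality weighting picks "Paris".
+print(weighted_vote(candidates))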
+
+
+
+
+ + ♻ ☆ Cocktail: A Comprehensive Information Retrieval Benchmark with + LLM-Generated Documents Integration ACL 2024 + + +
+ The proliferation of Large Language Models (LLMs) has led to an influx of +AI-generated content (AIGC) on the internet, transforming the corpus of +Information Retrieval (IR) systems from solely human-written to a coexistence +with LLM-generated content. The impact of this surge in AIGC on IR systems +remains an open question, with the primary challenge being the lack of a +dedicated benchmark for researchers. In this paper, we introduce Cocktail, a +comprehensive benchmark tailored for evaluating IR models in this mixed-sourced +data landscape of the LLM era. Cocktail consists of 16 diverse datasets with +mixed human-written and LLM-generated corpora across various text retrieval +tasks and domains. Additionally, to avoid the potential bias from previously +included dataset information in LLMs, we also introduce an up-to-date dataset, +named NQ-UTD, with queries derived from recent events. Through conducting over +1,000 experiments to assess state-of-the-art retrieval models against the +benchmarked datasets in Cocktail, we uncover a clear trade-off between ranking +performance and source bias in neural retrieval models, highlighting the +necessity for a balanced approach in designing future IR systems. We hope +Cocktail can serve as a foundational resource for IR research in the LLM era, +with all data and code publicly available at +\url{https://github.com/KID-22/Cocktail}. + +
+
+ comment: Accepted by Findings of ACL 2024; Datasets Link: + https://huggingface.co/IR-Cocktail +
+
+
+
+
+ + ♻ ☆ TEII: Think, Explain, Interact and Iterate with Large Language Models to + Solve Cross-lingual Emotion Detection ACL 2024 + + +
+ Cross-lingual emotion detection allows us to analyze global trends, public +opinion, and social phenomena at scale. We participated in the Explainability +of Cross-lingual Emotion Detection (EXALT) shared task, achieving an F1-score +of 0.6046 on the evaluation set for the emotion detection sub-task. Our system +outperformed the baseline by more than 0.16 F1-score absolute, and ranked +second amongst competing systems. We conducted experiments using fine-tuning, +zero-shot learning, and few-shot learning for Large Language Model (LLM)-based +models as well as embedding-based BiLSTM and KNN for non-LLM-based techniques. +Additionally, we introduced two novel methods: the Multi-Iteration Agentic +Workflow and the Multi-Binary-Classifier Agentic Workflow. We found that +LLM-based approaches provided good performance on multilingual emotion +detection. Furthermore, ensembles combining all our experimented models yielded +higher F1-scores than any single approach alone. + +
+
+ comment: Proceedings of the 13th Workshop on Computational Approaches to + Subjectivity, Sentiment, & Social Media Analysis (ACL 2024) +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Instructions for Aligning Large Language Models ICML 2024 + + +
+ Service providers of large language model (LLM) applications collect user +instructions in the wild and use them in further aligning LLMs with users' +intentions. These instructions, which potentially contain sensitive +information, are annotated by human workers in the process. This poses a new +privacy risk not addressed by the typical private optimization. To this end, we +propose using synthetic instructions to replace real instructions in data +annotation and model fine-tuning. Formal differential privacy is guaranteed by +generating those synthetic instructions using privately fine-tuned generators. +Crucial in achieving the desired utility is our novel filtering algorithm that +matches the distribution of the synthetic instructions to that of the real +ones. In both supervised fine-tuning and reinforcement learning from human +feedback, our extensive experiments demonstrate the high utility of the final +set of synthetic instructions by showing comparable results to real +instructions. In supervised fine-tuning, models trained with private synthetic +instructions outperform leading open-source models such as Vicuna. + +
+
+ comment: ICML 2024. Code available at + https://github.com/google-research/google-research/tree/master/dp_instructions +
+
+
+
+
+ + ♻ ☆ How Reliable Are Automatic Evaluation Methods for Instruction-Tuned + LLMs? + + +
+ Work on instruction-tuned Large Language Models (LLMs) has used automatic +methods based on text overlap and LLM judgments as cost-effective alternatives +to human evaluation. In this paper, we perform a meta-evaluation of such +methods and assess their reliability across a broad range of tasks. We observe +that while automatic evaluation methods can approximate human ratings under +specific conditions, their validity is highly context-dependent. Specifically, +the simple ROUGE-L metric correlates well with human ratings for short-answer +English tasks but is unreliable in free-form generation tasks and cross-lingual +transfer. The effectiveness of the more advanced method of using GPT-4 as a +judge diminishes significantly if reference answers are not included in the +prompt, which is the scenario where this method has the potential to provide +the most value compared to other metrics. Our findings enhance the +understanding of how automatic methods should be applied and interpreted when +developing and evaluating instruction-tuned LLMs. + +
+
+
+
+
+ + ♻ ☆ GSQA: An End-to-End Model for Generative Spoken Question Answering + + +
+ In recent advancements in spoken question answering (QA), end-to-end models +have made significant strides. However, previous research has primarily focused +on extractive span selection. While this extractive-based approach is effective +when answers are present directly within the input, it falls short in +addressing abstractive questions, where answers are not directly extracted but +inferred from the given information. To bridge this gap, we introduce the first +end-to-end Generative Spoken Question Answering (GSQA) model that empowers the +system to engage in abstractive reasoning. The challenge in training our GSQA +model lies in the absence of a spoken abstractive QA dataset. We propose using +text models for initialization and leveraging the extractive QA dataset to +transfer knowledge from the text generative model to the spoken generative +model. Experimental results indicate that our model surpasses the previous +extractive model by 3% on extractive QA datasets. Furthermore, the GSQA model +has only been fine-tuned on the spoken extractive QA dataset. Despite not +having seen any spoken abstractive QA data, it can still closely match the +performance of the cascade model. In conclusion, our GSQA model shows the +potential to generalize to a broad spectrum of questions, thus further +expanding the spoken question answering capabilities of abstractive QA. Our +code is available at https://voidful.github.io/GSQA + +
+
+ comment: 5 pages, 2 figures, Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ A Modular Approach for Multimodal Summarization of TV Shows + + +
+ In this paper we address the task of summarizing television shows, which +touches key areas in AI research: complex reasoning, multiple modalities, and +long narratives. We present a modular approach where separate components +perform specialized sub-tasks which we argue affords greater flexibility +compared to end-to-end methods. Our modules involve detecting scene boundaries, +reordering scenes so as to minimize the number of cuts between different +events, converting visual information to text, summarizing the dialogue in each +scene, and fusing the scene summaries into a final summary for the entire +episode. We also present a new metric, PRISMA (Precision and Recall EvaluatIon +of Summary FActs), to measure both precision and recall of generated summaries, +which we decompose into atomic facts. Tested on the recently released +SummScreen3D dataset, our method produces higher quality summaries than +comparison models, as measured with ROUGE and our new fact-based metric, and as +assessed by human evaluators. + +
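+ To make the precision-and-recall-of-facts idea concrete, here is a deliberately simplified sketch that treats atomic facts as normalized strings and checks set overlap. The real PRISMA metric judges whether each fact is supported rather than string-matching it, so this is only an approximation of the bookkeeping involved.
+def fact_precision_recall(summary_facts, reference_facts):
+    # Both arguments are iterables of short atomic-fact strings.
+    summary = {f.strip().lower() for f in summary_facts}
+    reference = {f.strip().lower() for f in reference_facts}
+    overlap = summary & reference
+    precision = len(overlap) / len(summary) if summary else 0.0
+    recall = len(overlap) / len(reference) if reference else 0.0
+    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+    return precision, recall, f1
+
+generated = ["Alice hides the letter", "Bob leaves the city", "Bob is arrested"]
+gold = ["Alice hides the letter", "Bob leaves the city", "Carol finds the letter"]
+print(fact_precision_recall(generated, gold))  # roughly (0.67, 0.67, 0.67)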
+
+
+
+
+ + ♻ ☆ Using eye tracking to investigate what native Chinese speakers notice + about linguistic landscape images + + +
+ The linguistic landscape is an important field in sociolinguistic research, and eye tracking is a well-established technology in psychological research, yet few studies have used eye movements to examine the linguistic landscape. This paper uses eye tracking to study how viewers actually fixate on the linguistic landscape and finds that, in terms of both fixation duration and fixation count, native Chinese speakers fixate more on the linguistic landscape than on the general landscape. This paper argues that this phenomenon is due to the higher information density of linguistic landscapes. At the same time, the article also discusses other possible reasons for this phenomenon. + +
+
+
+
+
+ + ♻ ☆ QOG:Question and Options Generation based on Language Model + + +
+ Question-Options Generation (QOG) is a task that involves generating a set of +question-options pairs given context. This task has various applications, +including fine-tuning large models, information retrieval, and automated +multiple-choice question generation for education. In this paper, we develop +QOG models using three different methods based on fine-tuning +sequence-to-sequence language models (LMs). Experiments demonstrate that the +end-to-end QOG model is computationally efficient and stable during both +training and inference, outperforming other methods. Furthermore, our analysis +indicates that our QOG models are competitive on the QOG task compared to the +large language model Llama 3-8B. + +
+
+
+
+
+ + ♻ ☆ A Closer Look at Classification Evaluation Metrics and a Critical + Reflection of Common Evaluation Practice ACL + + +
+ Classification systems are evaluated in a countless number of papers. +However, we find that evaluation practice is often nebulous. Frequently, +metrics are selected without arguments, and blurry terminology invites +misconceptions. For instance, many works use so-called 'macro' metrics to rank +systems (e.g., 'macro F1') but do not clearly specify what they would expect +from such a `macro' metric. This is problematic, since picking a metric can +affect research findings, and thus any clarity in the process should be +maximized. + Starting from the intuitive concepts of bias and prevalence, we perform an +analysis of common evaluation metrics. The analysis helps us understand the +metrics' underlying properties, and how they align with expectations as found +expressed in papers. Then we reflect on the practical situation in the field, +and survey evaluation practice in recent shared tasks. We find that metric +selection is often not supported with convincing arguments, an issue that can +make a system ranking seem arbitrary. Our work aims at providing overview and +guidance for more informed and transparent metric selection, fostering +meaningful evaluation. + +
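+ A small worked example of why the choice of averaging matters under class imbalance: macro-F1 gives the rare class equal weight, while micro-F1 is dominated by the frequent class, so two systems can trade places depending on the metric. The confusion counts below are invented for illustration.
+def f1(tp, fp, fn):
+    p = tp / (tp + fp) if tp + fp else 0.0
+    r = tp / (tp + fn) if tp + fn else 0.0
+    return 2 * p * r / (p + r) if p + r else 0.0
+
+# Per-class counts for an imbalanced task: the frequent class is predicted well,
+# the rare class poorly.
+counts = {
+    "frequent": {"tp": 95, "fp": 9, "fn": 5},
+    "rare":     {"tp": 1,  "fp": 5, "fn": 9},
+}
+
+macro_f1 = sum(f1(**c) for c in counts.values()) / len(counts)
+micro_f1 = f1(sum(c["tp"] for c in counts.values()),
+              sum(c["fp"] for c in counts.values()),
+              sum(c["fn"] for c in counts.values()))
+print(f"macro F1 = {macro_f1:.3f}, micro F1 = {micro_f1:.3f}")  # ~0.53 vs ~0.87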
+
+ comment: appeared in TACL journal. MIT press publication available at + https://doi.org/10.1162/tacl_a_00675 +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Actionable Course Evaluation + Student Feedback to Lecturers + + +
+ End of semester student evaluations of teaching are the dominant mechanism +for providing feedback to academics on their teaching practice. For large +classes, however, the volume of feedback makes these tools impractical for this +purpose. This paper explores the use of open-source generative AI to synthesise +factual, actionable and appropriate summaries of student feedback from these +survey responses. In our setup, we have 742 student responses ranging over 75 +courses in a Computer Science department. For each course, we synthesise a +summary of the course evaluations and actionable items for the instructor. Our +results reveal a promising avenue for enhancing teaching practices in the +classroom setting. Our contribution lies in demonstrating the feasibility of +using generative AI to produce insightful feedback for teachers, thus providing +a cost-effective means to support educators' development. Overall, our work +highlights the possibility of using generative AI to produce factual, +actionable, and appropriate feedback for teachers in the classroom setting. + +
+
+ comment: Accepted to SEFI 2024 +
+
+
+
+
+ + ♻ ☆ LexMatcher: Dictionary-centric Data Collection for LLM-based Machine + Translation + + +
+ The fine-tuning of open-source large language models (LLMs) for machine +translation has recently received considerable attention, marking a shift +towards data-centric research from traditional neural machine translation. +However, the area of data collection for instruction fine-tuning in machine +translation remains relatively underexplored. In this paper, we present +LexMatcher, a simple yet effective method for data curation, the design of +which is driven by the coverage of senses found in bilingual dictionaries. The +construction process comprises data retrieval from an existing corpus and data +augmentation that supplements the infrequent senses of polysemous words. +Utilizing LLaMA2 as our base model, our approach outperforms the established +baselines on the WMT2022 test sets and also exhibits remarkable performance in +tasks related to word sense disambiguation and specialized terminology +translation. These results underscore the effectiveness of LexMatcher in +enhancing LLM-based machine translation. The code, data, and models are +available at https://github.com/ARIES-LM/Lexmatcher-MT.git. + +
+
+
+
+
+ + ♻ ☆ Data Augmentation using Large Language Models: Data Perspectives, + Learning Paradigms and Challenges + + +
+ In the rapidly evolving field of large language models (LLMs), data +augmentation (DA) has emerged as a pivotal technique for enhancing model +performance by diversifying training examples without the need for additional +data collection. This survey explores the transformative impact of LLMs on DA, +particularly addressing the unique challenges and opportunities they present in +the context of natural language processing (NLP) and beyond. From both data and +learning perspectives, we examine various strategies that utilize LLMs for data +augmentation, including a novel exploration of learning paradigms where +LLM-generated data is used for diverse forms of further training. Additionally, +this paper highlights the primary open challenges faced in this domain, ranging +from controllable data augmentation to multi-modal data augmentation. This +survey highlights a paradigm shift introduced by LLMs in DA, and aims to serve +as a comprehensive guide for researchers and practitioners. + +
+
+
+
+
+ + ♻ ☆ Show Less, Instruct More: Enriching Prompts with Definitions and + Guidelines for Zero-Shot NER + + +
+ Recently, several specialized instruction-tuned Large Language Models (LLMs) +for Named Entity Recognition (NER) have emerged. Compared to traditional NER +approaches, these models have strong generalization capabilities. Existing LLMs +mainly focus on zero-shot NER in out-of-domain distributions, being fine-tuned +on an extensive number of entity classes that often highly or completely +overlap with test sets. In this work instead, we propose SLIMER, an approach +designed to tackle never-seen-before named entity tags by instructing the model +on fewer examples, and by leveraging a prompt enriched with definition and +guidelines. Experiments demonstrate that definition and guidelines yield better +performance, faster and more robust learning, particularly when labelling +unseen Named Entities. Furthermore, SLIMER performs comparably to +state-of-the-art approaches in out-of-domain zero-shot NER, while being trained +on a reduced tag set. + +
+
+
+
+
+ + ♻ ☆ Empowering 3D Visual Grounding with Reasoning Capabilities ECCV 2024 + + +
+ Although great progress has been made in 3D visual grounding, current models +still rely on explicit textual descriptions for grounding and lack the ability +to reason about human intentions from implicit instructions. We propose a new task +called 3D reasoning grounding and introduce a new benchmark ScanReason which +provides over 10K question-answer-location pairs from five reasoning types that +require the synergy of reasoning and grounding. We further design our +approach, ReGround3D, composed of the visual-centric reasoning module empowered +by a Multi-modal Large Language Model (MLLM) and the 3D grounding module to +obtain accurate object locations by looking back to the enhanced geometry and +fine-grained details from the 3D scenes. A chain-of-grounding mechanism is +proposed to further boost the performance with interleaved reasoning and +grounding steps during inference. Extensive experiments on the proposed +benchmark validate the effectiveness of our proposed approach. + +
+
+ comment: Accepted by ECCV 2024. A comprehensive and hierarchical 3D reasoning + grounding benchmark in the era of foundation models. Project page: + https://zcmax.github.io/projects/ScanReason +
+
+
+
+
+ + ♻ ☆ Natural Language Can Help Bridge the Sim2Real Gap + + +
+ The main challenge in learning image-conditioned robotic policies is +acquiring a visual representation conducive to low-level control. Due to the +high dimensionality of the image space, learning a good visual representation +requires a considerable amount of visual data. However, when learning in the +real world, data is expensive. Sim2Real is a promising paradigm for overcoming +data scarcity in the real-world target domain by using a simulator to collect +large amounts of cheap data closely related to the target task. However, it is +difficult to transfer an image-conditioned policy from sim to real when the +domains are very visually dissimilar. To bridge the sim2real visual gap, we +propose using natural language descriptions of images as a unifying signal +across domains that captures the underlying task-relevant semantics. Our key +insight is that if two image observations from different domains are labeled +with similar language, the policy should predict similar action distributions +for both images. We demonstrate that training the image encoder to predict the +language description or the distance between descriptions of a sim or real +image serves as a useful, data-efficient pretraining step that helps learn a +domain-invariant image representation. We can then use this image encoder as +the backbone of an IL policy trained simultaneously on a large amount of +simulated and a handful of real demonstrations. Our approach outperforms widely +used prior sim2real methods and strong vision-language pretraining baselines +like CLIP and R3M by 25 to 40%. See additional videos and materials at +https://robin-lab.cs.utexas.edu/lang4sim2real/. + +
+
+ comment: To appear in RSS 2024. Project website at + https://robin-lab.cs.utexas.edu/lang4sim2real/ +
+
+
+
+
+ + ♻ ☆ DynaSemble: Dynamic Ensembling of Textual and Structure-Based Models for + Knowledge Graph Completion ACL 2024 + + +
+ We consider two popular approaches to Knowledge Graph Completion (KGC): +textual models that rely on textual entity descriptions, and structure-based +models that exploit the connectivity structure of the Knowledge Graph (KG). +Preliminary experiments show that these approaches have complementary +strengths: structure-based models perform exceptionally well when the gold +answer is easily reachable from the query head in the KG, while textual models +exploit descriptions to give good performance even when the gold answer is not +easily reachable. In response, we propose DynaSemble, a novel method for +learning query-dependent ensemble weights to combine these approaches by using +the distributions of scores assigned by the models in the ensemble to all +candidate entities. DynaSemble achieves state-of-the-art results on three +standard KGC datasets, with up to 6.8 pt MRR and 8.3 pt Hits@1 gains over the +best baseline model for the WN18RR dataset. + +
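+ The gist of query-dependent ensembling can be sketched as follows: summarize each model's score distribution over candidate entities with simple features (top-2 margin and negative entropy), map those features to per-model mixture weights, and combine the normalized scores. The two features, the fixed weight vector, and the numbers are illustrative assumptions; DynaSemble learns the weighting from data.
+import numpy as np
+
+def softmax(x):
+    z = np.exp(x - x.max())
+    return z / z.sum()
+
+def distribution_features(scores):
+    p = softmax(scores)
+    top2 = np.sort(p)[-2:]
+    margin = top2[1] - top2[0]                    # peakedness of the distribution
+    entropy = -(p * np.log(p + 1e-12)).sum()
+    return np.array([margin, -entropy])
+
+def dynamic_ensemble(text_scores, struct_scores, w=np.array([2.0, 0.5])):
+    feats = np.stack([distribution_features(text_scores),
+                      distribution_features(struct_scores)])
+    model_weights = softmax(feats @ w)            # query-dependent mixture weights
+    mixed = (model_weights[0] * softmax(text_scores)
+             + model_weights[1] * softmax(struct_scores))
+    return mixed, model_weights
+
+text_scores = np.array([1.2, 1.1, 1.0, 0.2])      # textual model: unsure on this query
+struct_scores = np.array([4.0, 0.5, 0.3, 0.1])    # structure model: confident
+mixed, weights = dynamic_ensemble(text_scores, struct_scores)
+print("per-model weights:", weights.round(2), "ranking:", np.argsort(-mixed))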
+
+ comment: 12 pages, 2 figures, 15 tables Accepted to ACL 2024 +
+
+
+
+
+ + ♻ ☆ GraphWiz: An Instruction-Following Language Model for Graph Problems + + +
+ Large language models (LLMs) have achieved impressive success across several +fields, but their proficiency in understanding and resolving complex graph +problems is less explored. To bridge this gap, we introduce GraphInstruct, a +novel and comprehensive instruction-tuning dataset designed to equip language +models with the ability to tackle a broad spectrum of graph problems using +explicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an +open-source language model capable of resolving various graph problem types +while generating clear reasoning processes. To enhance the model's capability +and reliability, we incorporate the Direct Preference Optimization (DPO) +framework into the graph problem-solving context. The enhanced model, +GraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with +different complexity levels, surpassing GPT-4, which has an average accuracy of +43.8%. Moreover, our research delves into the delicate balance between training +data volume and model performance, highlighting the potential for overfitting +with increased data. We also explore the transferability of the model's +reasoning ability across different graph tasks, indicating the model's +adaptability and practical application potential. Our investigation offers a +new blueprint and valuable insights for developing LLMs specialized in graph +reasoning and problem-solving. + +
+
+ comment: 27 pages, 15 tables +
+
+
+
+
+ + ♻ ☆ HyperLoader: Integrating Hypernetwork-Based LoRA and Adapter Layers into + Multi-Task Transformers for Sequence Labelling + + +
+ We present HyperLoader, a simple approach that combines different +parameter-efficient fine-tuning methods in a multi-task setting. To achieve +this goal, our model uses a hypernetwork to generate the weights of these +modules based on the task, the transformer layer, and its position within this +layer. Our method combines the benefits of multi-task learning, capturing the structure of all tasks while reducing task interference by encapsulating task-specific knowledge in the generated weights, with the benefits of combining different parameter-efficient methods to outperform full fine-tuning. We provide empirical evidence that HyperLoader outperforms +previous approaches on most datasets and obtains the best average performance +across tasks in high-resource and low-resource scenarios. + +
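+ A compact numpy sketch of the central mechanism described above: a (here untrained, randomly initialized) hypernetwork maps a task/layer/position embedding to the low-rank factors of a LoRA update, which is added to a frozen projection. Dimensions, the linear hypernetwork, and all values are assumptions for illustration only.
+import numpy as np
+
+rng = np.random.default_rng(0)
+d_model, rank, d_task = 64, 8, 16
+
+# Hypernetwork parameters: map a task/layer embedding to flattened LoRA factors.
+W_a = rng.normal(scale=0.02, size=(d_task, d_model * rank))
+W_b = rng.normal(scale=0.02, size=(d_task, rank * d_model))
+
+def generate_lora(task_embedding):
+    # Produce LoRA factors A (d_model x rank) and B (rank x d_model) for one module.
+    A = (task_embedding @ W_a).reshape(d_model, rank)
+    B = (task_embedding @ W_b).reshape(rank, d_model)
+    return A, B
+
+def apply_lora(x, W_frozen, A, B, scale=1.0):
+    # Frozen base projection plus the generated low-rank update.
+    return x @ W_frozen + scale * (x @ A @ B)
+
+task_emb = rng.normal(size=d_task)           # would encode task id, layer, position
+W_frozen = rng.normal(scale=0.02, size=(d_model, d_model))
+x = rng.normal(size=(4, d_model))            # a batch of 4 token vectors
+A, B = generate_lora(task_emb)
+print(apply_lora(x, W_frozen, A, B).shape)   # (4, 64)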
+
+
+
+
+ + ♻ ☆ Enhancing Large Language Models in Coding Through Multi-Perspective + Self-Consistency ACL 2024 + + +
+ Large language models (LLMs) have exhibited remarkable ability in code +generation. However, generating the correct solution in a single attempt still +remains a challenge. Prior works utilize verification properties in software +engineering to verify and re-rank solutions in a majority voting manner. But +the assumption behind them that generated verification properties have better +qualities than solutions may not always hold. In this paper, we treat them +equally as different perspectives of LLMs' reasoning processes. We propose the +Multi-Perspective Self-Consistency (MPSC) framework incorporating both inter- +and intra-consistency across outputs from multiple perspectives. Specifically, +we prompt LLMs to generate diverse outputs from three perspectives, Solution, +Specification and Test case, constructing a 3-partite graph. With two measure +functions of consistency, we embed both inter- and intra-consistency +information into the graph. The optimal choice of solutions is then determined +based on analysis in the graph. MPSC significantly boosts performance of +foundation models (ChatGPT in this paper) on various benchmarks, including +HumanEval (+15.91%), MBPP (+6.43%) and CodeContests (+9.37%), even surpassing +GPT-4. + +
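+ One concrete slice of the inter-consistency idea is to check generated solutions against generated test cases and prefer the solution consistent with the most tests. The sketch below does exactly that for toy candidates; the full MPSC framework additionally uses specifications and selects answers via analysis over a 3-partite graph, which is not reproduced here.
+def consistency_select(solutions, test_cases):
+    # solutions: list of callables; test_cases: list of (args, expected) pairs.
+    best, best_score = None, -1
+    for solve in solutions:
+        score = 0
+        for args, expected in test_cases:
+            try:
+                if solve(*args) == expected:
+                    score += 1
+            except Exception:
+                pass  # a crashing candidate simply earns no consistency credit
+        if score > best_score:
+            best, best_score = solve, score
+    return best, best_score
+
+# Toy candidates for "absolute difference of two numbers".
+candidates = [
+    lambda a, b: a - b,            # wrong when a < b
+    lambda a, b: abs(a - b),       # correct
+    lambda a, b: abs(a) - abs(b),  # wrong in general
+]
+tests = [((3, 5), 2), ((5, 3), 2), ((-1, 4), 5)]
+chosen, agreed = consistency_select(candidates, tests)
+print(agreed, chosen(10, 4))  # 3 tests agreed; the chosen solution returns 6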
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ PROC2PDDL: Open-Domain Planning Representations from Texts + + +
+ Planning in a text-based environment continues to be a major challenge for AI +systems. Recent approaches have used language models to predict a planning +domain definition (e.g., PDDL) but have only been evaluated in closed-domain +simulated environments. To address this, we present Proc2PDDL, the first +dataset containing open-domain procedural texts paired with expert-annotated +PDDL representations. Using this dataset, we evaluate state-of-the-art models +on defining the preconditions and effects of actions. We show that Proc2PDDL is +highly challenging, with GPT-3.5's success rate close to 0% and GPT-4's around +35%. Our analysis shows both syntactic and semantic errors, indicating LMs' +deficiency in both generating domain-specific programs and reasoning about +events. We hope this analysis and dataset help future progress towards +integrating the best of LMs and formal planning. + +
+
+ comment: In NLRSE 2024, the 2nd Natural Language Reasoning and Structured + Explanations Workshop +
+
+
+
+
+ + ♻ ☆ Step-Controlled DPO: Leveraging Stepwise Error for Enhanced Mathematical + Reasoning + + +
+ Direct Preference Optimization (DPO) has proven effective at improving the +performance of large language models (LLMs) on downstream tasks such as +reasoning and alignment. In this work, we propose Step-Controlled DPO (SCDPO), +a method for automatically providing stepwise error supervision by creating +negative samples of mathematical reasoning rationales that start making errors +at a specified step. By applying these samples in DPO training, SCDPO can +better align the model to understand reasoning errors and output accurate +reasoning steps. We apply SCDPO to both code-integrated and chain-of-thought +solutions, empirically showing that it consistently improves the performance +compared to naive DPO on three different SFT models, including one existing SFT +model and two models we finetuned. Qualitative analysis of the credit +assignment of SCDPO and DPO demonstrates the effectiveness of SCDPO at +identifying errors in mathematical solutions. We then apply SCDPO to an +InternLM2-20B model, resulting in a 20B model that achieves high scores of +88.5% on GSM8K and 58.1% on MATH, rivaling all other open-source LLMs, showing +the great potential of our method. + +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models with Human Preferences through + Representation Engineering + + +
+ Aligning large language models (LLMs) with human preferences is crucial for +enhancing their utility in terms of helpfulness, truthfulness, safety, +harmlessness, and interestingness. Existing methods for achieving this +alignment often involve employing reinforcement learning from human feedback +(RLHF) to fine-tune LLMs based on human labels assessing the relative quality +of model responses. Nevertheless, RLHF is susceptible to instability during +fine-tuning and presents challenges in implementation. Drawing inspiration from +the emerging field of representation engineering (RepE), this study aims to +identify relevant representations for high-level human preferences embedded in +patterns of activity within an LLM, and achieve precise control of model +behavior by transforming its representations. This novel approach, denoted as +Representation Alignment from Human Feedback (RAHF), proves to be effective, +computationally efficient, and easy to implement. Extensive experiments +demonstrate the efficacy of RAHF in not only capturing but also manipulating +representations to align with a broad spectrum of human preferences or values, +rather than being confined to a singular concept or function (e.g. honesty or +bias). RAHF's versatility in accommodating diverse human preferences shows its +potential for advancing LLM performance. + +
+
+
+
+
+ + ♻ ☆ GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of + LLMs as Mathematical Problem Solvers ACL 2024 + + +
+ Large language models (LLMs) have achieved impressive performance across +various mathematical reasoning benchmarks. However, there are increasing +debates regarding whether these models truly understand and apply mathematical +knowledge or merely rely on shortcuts for mathematical reasoning. One essential and frequently observed piece of evidence is that LLMs can behave incorrectly when the math questions are slightly changed. This motivates us to evaluate the +robustness of LLMs' math reasoning capability by testing a wide range of +question variations. We introduce the adversarial grade school math (GSM-Plus) +dataset, an extension of GSM8K augmented with various mathematical +perturbations. Our experiments on 25 LLMs and 4 prompting techniques show that +while LLMs exhibit different levels of math reasoning abilities, their +performances are far from robust. In particular, even for problems that have +been solved in GSM8K, LLMs can make mistakes when new statements are added or +the question targets are altered. We also explore whether more robust +performance can be achieved by composing existing prompting methods, in which +we try an iterative method that generates and verifies each intermediate +thought based on its reasoning goal and calculation result. + +
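+ One of the simplest perturbation families alluded to above is numerical substitution: change the numbers, keep the reasoning chain, and recompute the gold answer. The regex-based sketch below illustrates the idea on an invented GSM8K-style question; it is not the dataset's actual construction pipeline.
+import re
+
+def perturb_numbers(question, new_values):
+    # Replace the numbers in `question`, in order, with `new_values`.
+    values = iter(new_values)
+    return re.sub(r"\d+", lambda m: str(next(values)), question)
+
+original = "A baker sells 12 muffins per tray and bakes 5 trays. How many muffins are sold?"
+original_answer = 12 * 5            # 60
+
+perturbed = perturb_numbers(original, [17, 4])
+perturbed_answer = 17 * 4           # 68: same reasoning, different surface numbers
+
+print(perturbed)
+print("original:", original_answer, "perturbed:", perturbed_answer)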
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ StyleDubber: Towards Multi-Scale Style Learning for Movie Dubbing + + +
+ Given a script, the challenge in Movie Dubbing (Visual Voice Cloning, V2C) is +to generate speech that aligns well with the video in both time and emotion, +based on the tone of a reference audio track. Existing state-of-the-art V2C +models break the phonemes in the script according to the divisions between +video frames, which solves the temporal alignment problem but leads to +incomplete phoneme pronunciation and poor identity stability. To address this +problem, we propose StyleDubber, which switches dubbing learning from the frame +level to the phoneme level. It contains three main components: (1) A multimodal +style adaptor operating at the phoneme level to learn pronunciation style from +the reference audio, and generate intermediate representations informed by the +facial emotion presented in the video; (2) An utterance-level style learning +module, which guides both the mel-spectrogram decoding and the refining +processes from the intermediate embeddings to improve the overall style +expression; and (3) a phoneme-guided lip aligner to maintain lip sync. +Extensive experiments on two of the primary benchmarks, V2C and Grid, +demonstrate the favorable performance of the proposed method as compared to the +current state-of-the-art. The code will be made available at +https://github.com/GalaxyCong/StyleDubber. + +
+
+
+
+
+ + ♻ ☆ Towards Unsupervised Question Answering System with Multi-level + Summarization for Legal Text + + +
+ This paper summarizes Team SCaLAR's work on SemEval-2024 Task 5: Legal +Argument Reasoning in Civil Procedure. To address this Binary Classification +task, which was daunting due to the complexity of the Legal Texts involved, we +propose a simple yet novel similarity and distance-based unsupervised approach +to generate labels. Further, we explore the Multi-level fusion of Legal-Bert +embeddings using ensemble features, including CNN, GRU, and LSTM. To address +the lengthy nature of Legal explanation in the dataset, we introduce T5-based +segment-wise summarization, which successfully retained crucial information, +enhancing the model's performance. Our unsupervised system witnessed a 20-point +increase in macro F1-score on the development set and a 10-point increase on +the test set, which is promising given its uncomplicated architecture. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ A Curious Case of Searching for the Correlation between Training Data + and Adversarial Robustness of Transformer Textual Models ACL + + +
+ Existing works have shown that fine-tuned textual transformer models achieve +state-of-the-art prediction performances but are also vulnerable to adversarial +text perturbations. Traditional adversarial evaluation is often done +\textit{only after} fine-tuning the models and ignoring the training data. In +this paper, we want to prove that there is also a strong correlation between +training data and model robustness. To this end, we extract 13 different +features representing a wide range of input fine-tuning corpora properties and +use them to predict the adversarial robustness of the fine-tuned models. +Focusing mostly on encoder-only transformer models BERT and RoBERTa with +additional results for BART, ELECTRA, and GPT2, we provide diverse evidence to +support our argument. First, empirical analyses show that (a) extracted +features can be used with a lightweight classifier such as Random Forest to +predict the attack success rate effectively, and (b) features with the most +influence on the model robustness have a clear correlation with the robustness. +Second, our framework can be used as a fast and effective additional tool for +robustness evaluation since it (a) saves 30x-193x runtime compared to the +traditional technique, (b) is transferable across models, (c) can be used under +adversarial training, and (d) robust to statistical randomness. Our code is +publicly available at \url{https://github.com/CaptainCuong/RobustText_ACL2024}. + +
+
+ comment: Accepted to ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Linear Alignment: A Closed-form Solution for Aligning Human Preferences + without Tuning and Feedback ICML2024 + + +
+ The success of AI assistants based on Large Language Models (LLMs) hinges on +Reinforcement Learning from Human Feedback (RLHF) to comprehend and align with +user intentions. However, traditional alignment algorithms, such as PPO, are +hampered by complex annotation and training requirements. This reliance limits +the applicability of RLHF and hinders the development of professional +assistants tailored to diverse human preferences. In this work, we introduce +\textit{Linear Alignment}, a novel algorithm that aligns language models with +human preferences in one single inference step, eliminating the reliance on +data annotation and model training. Linear alignment incorporates a new +parameterization for policy optimization under divergence constraints, which +enables the extraction of the optimal policy in a closed-form manner and +facilitates the direct estimation of the aligned response. Extensive +experiments on both general and personalized preference datasets demonstrate +that linear alignment significantly enhances the performance and efficiency of +LLM alignment across diverse scenarios. Our code and dataset are published at +\url{https://github.com/Wizardcoast/Linear_Alignment.git}. + +
+
+ comment: Accepted by ICML2024, I'm still preparing a better version +
+
+
+
+
+ + ♻ ☆ Judging the Judges: Evaluating Alignment and Vulnerabilities in + LLMs-as-Judges + + +
+ Offering a promising solution to the scalability challenges associated with +human evaluation, the LLM-as-a-judge paradigm is rapidly gaining traction as an +approach to evaluating large language models (LLMs). However, there are still +many open questions about the strengths and weaknesses of this paradigm, and +what potential biases it may hold. In this paper, we present a comprehensive +study of the performance of various LLMs acting as judges. We leverage TriviaQA +as a benchmark for assessing objective knowledge reasoning of LLMs and evaluate +them alongside human annotations which we found to have a high inter-annotator +agreement. Our study includes 9 judge models and 9 exam taker models -- both +base and instruction-tuned. We assess the judge model's alignment across +different model sizes, families, and judge prompts. Among other results, our +research rediscovers the importance of using Cohen's kappa as a metric of +alignment as opposed to simple percent agreement, showing that judges with high +percent agreement can still assign vastly different scores. We find that both +Llama-3 70B and GPT-4 Turbo have an excellent alignment with humans, but in +terms of ranking exam taker models, they are outperformed by both JudgeLM-7B +and the lexical judge Contains, which have up to 34 points lower human +alignment. Through error analysis and various other studies, including the +effects of instruction length and leniency bias, we hope to provide valuable +lessons for using LLMs as judges in the future. + +
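+ The contrast between Cohen's kappa and raw percent agreement highlighted above can be reproduced in a few lines: with a lenient judge that almost always says "correct" on an imbalanced label set, percent agreement looks high while chance-corrected agreement collapses. The toy labels are invented for illustration.
+from collections import Counter
+
+def percent_agreement(a, b):
+    return sum(x == y for x, y in zip(a, b)) / len(a)
+
+def cohens_kappa(a, b):
+    po = percent_agreement(a, b)
+    ca, cb, n = Counter(a), Counter(b), len(a)
+    pe = sum((ca[label] / n) * (cb[label] / n) for label in set(a) | set(b))
+    return (po - pe) / (1 - pe)
+
+human = ["correct"] * 90 + ["incorrect"] * 10
+judge = ["correct"] * 100            # a maximally lenient judge
+
+print(f"percent agreement: {percent_agreement(human, judge):.2f}")  # 0.90
+print(f"Cohen's kappa:     {cohens_kappa(human, judge):.2f}")       # 0.00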
+
+
+
+
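+ The kappa-versus-agreement point above can be reproduced with a tiny synthetic example: a lenient judge that marks nearly everything correct can reach high percent agreement on an easy exam while its chance-corrected Cohen's kappa stays low. The numbers are made up for illustration.
+ import numpy as np
+ from sklearn.metrics import cohen_kappa_score
+
+ human = np.array([1] * 90 + [0] * 10)           # humans: 90 answers judged correct, 10 wrong
+ judge = np.array([1] * 90 + [1] * 8 + [0] * 2)  # lenient judge: says "correct" almost always
+
+ print("percent agreement:", (human == judge).mean())          # 0.92, looks excellent
+ print("Cohen's kappa:    ", cohen_kappa_score(human, judge))  # ~0.31, far less flattering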
+ + ♻ ☆ Prompt-based Pseudo-labeling Strategy for Sample-Efficient + Semi-Supervised Extractive Summarization + + +
+ Semi-supervised learning (SSL) is a widely used technique in scenarios where +labeled data is scarce and unlabeled data is abundant. While SSL is popular for +image and text classification, it is relatively underexplored for the task of +extractive text summarization. Standard SSL methods follow a teacher-student +paradigm to first train a classification model and then use the classifier's +confidence values to select pseudo-labels for the subsequent training cycle; +however, such classifiers are not suitable for measuring the accuracy of +pseudo-labels as they lack specific tuning for evaluation, which leads to +confidence values that fail to capture the semantics and correctness of the +generated summary. To address this problem, we propose a prompt-based +pseudo-labeling strategy with LLMs that picks unlabeled examples with more +accurate pseudo-labels than using just the classifier's probability outputs. +Our approach also includes a relabeling mechanism that improves the quality of +pseudo-labels. We evaluate our method on three text summarization datasets: +TweetSumm, WikiHow, and ArXiv/PubMed. We empirically show that a +prompting-based LLM that scores and generates pseudo-labels outperforms +existing SSL methods on ROUGE-1, ROUGE-2, and ROUGE-L scores on all the +datasets. Furthermore, our method achieves L-Eval scores +(evaluation with LLaMa-3) competitive with a fully supervised method in a data-scarce setting +and outperforms the fully supervised method in a data-abundant setting. + +&#x0D;
+
+ comment: 8 pages, 6 figures, 3 tables +
+
+
+
+
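+ A minimal sketch of the prompt-based pseudo-label selection idea described above: an LLM scores each candidate extractive summary and only highly scored examples enter the next semi-supervised training round. The prompt wording, the threshold, and the ask_llm helper are hypothetical placeholders, not the paper's implementation.
+ def ask_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in any chat-completion client here")
+
+ def score_pseudo_label(document: str, selected_sentences: list[str]) -> float:
+     prompt = (
+         "Rate from 1 to 10 how well the selected sentences summarize the document.\n"
+         "Document:\n" + document + "\n\nSelected sentences:\n"
+         + "\n".join(selected_sentences) + "\nAnswer with a single number."
+     )
+     try:
+         return float(ask_llm(prompt).strip())
+     except ValueError:
+         return 0.0  # treat unparsable answers as low confidence
+
+ def select_for_next_round(candidates, threshold=7.0):
+     # candidates: (document, sentences_picked_by_current_student_model) pairs
+     return [(doc, sents) for doc, sents in candidates
+             if score_pseudo_label(doc, sents) >= threshold]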
+ + ♻ ☆ Rethinking Machine Unlearning for Large Language Models + + +
+ We explore machine unlearning (MU) in the domain of large language models +(LLMs), referred to as LLM unlearning. This initiative aims to eliminate +undesirable data influence (e.g., sensitive or illegal information) and the +associated model capabilities, while maintaining the integrity of essential +knowledge generation and not affecting causally unrelated information. We +envision LLM unlearning becoming a pivotal element in the life-cycle management +of LLMs, potentially standing as an essential foundation for developing +generative AI that is not only safe, secure, and trustworthy, but also +resource-efficient without the need of full retraining. We navigate the +unlearning landscape in LLMs from conceptual formulation, methodologies, +metrics, and applications. In particular, we highlight the often-overlooked +aspects of existing LLM unlearning research, e.g., unlearning scope, data-model +interaction, and multifaceted efficacy assessment. We also draw connections +between LLM unlearning and related areas such as model editing, influence +functions, model explanation, adversarial training, and reinforcement learning. +Furthermore, we outline an effective assessment framework for LLM unlearning +and explore its applications in copyright and privacy safeguards and +sociotechnical harm reduction. + +
+
+
+
+
+ + ♻ ☆ MuirBench: A Comprehensive Benchmark for Robust Multi-image + Understanding + + +
+ We introduce MuirBench, a comprehensive benchmark that focuses on robust +multi-image understanding capabilities of multimodal LLMs. MuirBench consists +of 12 diverse multi-image tasks (e.g., scene understanding, ordering) that +involve 10 categories of multi-image relations (e.g., multiview, temporal +relations). Comprising 11,264 images and 2,600 multiple-choice questions, +MuirBench is created in a pairwise manner, where each standard instance is +paired with an unanswerable variant that has minimal semantic differences, in +order for a reliable assessment. Evaluated upon 20 recent multi-modal LLMs, our +results reveal that even the best-performing models like GPT-4o and Gemini Pro +find it challenging to solve MuirBench, achieving 68.0% and 49.3% in accuracy. +Open-source multimodal LLMs trained on single images can hardly generalize to +multi-image questions, hovering below 33.3% in accuracy. These results +highlight the importance of MuirBench in encouraging the community to develop +multimodal LLMs that can look beyond a single image, suggesting potential +pathways for future improvements. + +
+
+ comment: typos corrected, references added, Project Page: + https://muirbench.github.io/ +
+
+
+
+
+ + ♻ ☆ AutoRT: Embodied Foundation Models for Large Scale Orchestration of + Robotic Agents ICRA 2024 + + +
+ Foundation models that incorporate language, vision, and more recently +actions have revolutionized the ability to harness internet scale data to +reason about useful tasks. However, one of the key challenges of training +embodied foundation models is the lack of data grounded in the physical world. +In this paper, we propose AutoRT, a system that leverages existing foundation +models to scale up the deployment of operational robots in completely unseen +scenarios with minimal human supervision. AutoRT leverages vision-language +models (VLMs) for scene understanding and grounding, and further uses large +language models (LLMs) for proposing diverse and novel instructions to be +performed by a fleet of robots. Guiding data collection by tapping into the +knowledge of foundation models enables AutoRT to effectively reason about +autonomy tradeoffs and safety while significantly scaling up data collection +for robot learning. We demonstrate AutoRT proposing instructions to over 20 +robots across multiple buildings and collecting 77k real robot episodes via +both teleoperation and autonomous robot policies. We experimentally show that +such "in-the-wild" data collected by AutoRT is significantly more diverse, and +that AutoRT's use of LLMs allows for instruction following data collection +robots that can align to human preferences. + +
+
+ comment: 26 pages, 9 figures, ICRA 2024 VLMNM Workshop +
+
+
+
+
+ + ♻ ☆ Knowledge of Knowledge: Exploring Known-Unknowns Uncertainty with Large + Language Models + + +
+ This paper investigates the capabilities of Large Language Models (LLMs) in +the context of understanding their knowledge and uncertainty over questions. +Specifically, we focus on addressing known-unknown questions, characterized by +high uncertainty due to the absence of definitive answers. To facilitate our +study, we collect a new dataset with Known-Unknown Questions (KUQ) and +establish a categorization framework to clarify the origins of uncertainty in +such queries. Subsequently, we examine the performance of open-source LLMs, +fine-tuned using this dataset, in distinguishing between known and unknown +queries within open-ended question-answering scenarios. The fine-tuned models +demonstrated a significant improvement, achieving a considerable increase in +F1-score relative to their pre-fine-tuning state. Through a comprehensive +analysis, we reveal insights into the models' improved uncertainty articulation +and their consequent efficacy in multi-agent debates. These findings help us +understand how LLMs can be trained to identify and express uncertainty, +improving our knowledge of how they understand and express complex or unclear +information. + +
+
+
+
+
+ + ♻ ☆ GlotLID: Language Identification for Low-Resource Languages EMNLP 2023 + + +
+ Several recent papers have published good solutions for language +identification (LID) for about 300 high-resource and medium-resource languages. +However, there is no LID available that (i) covers a wide range of low-resource +languages, (ii) is rigorously evaluated and reliable, and (iii) is efficient and +easy to use. Here, we publish GlotLID-M, an LID model that satisfies the +desiderata of wide coverage, reliability and efficiency. It identifies 1665 +languages, a large increase in coverage compared to prior work. In our +experiments, GlotLID-M outperforms four baselines (CLD3, FT176, OpenLID and +NLLB) when balancing F1 and false positive rate (FPR). We analyze the unique +challenges that low-resource LID poses: incorrect corpus metadata, leakage from +high-resource languages, difficulty separating closely related languages, +handling of macrolanguages vs. varieties, and noisy data in general. We hope that +integrating GlotLID-M into dataset creation pipelines will improve quality and +enhance accessibility of NLP technology for low-resource languages and +cultures. The GlotLID-M model (including future versions), code, and list of data +sources are available at: https://github.com/cisnlp/GlotLID. + +&#x0D;
+
+ comment: EMNLP 2023 +
+
+
+
+
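+ A minimal usage sketch, assuming the released model is loaded through the standard fastText Python bindings (the repository above documents the exact model file to download); the filename and example sentence are placeholders.
+ import fasttext
+
+ # Placeholder path; fetch the released model from the GlotLID repository first.
+ model = fasttext.load_model("glotlid_model.bin")
+
+ labels, scores = model.predict("Ki jan ou ye jodi a?", k=3)  # top-3 language guesses
+ for label, score in zip(labels, scores):
+     print(label.replace("__label__", ""), round(float(score), 3))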
+ + ♻ ☆ Guylingo: The Republic of Guyana Creole Corpora NAACL 2024 + + +
+ While major languages often enjoy substantial attention and resources, the +linguistic diversity across the globe encompasses a multitude of smaller, +indigenous, and regional languages that lack the same level of computational +support. One such region is the Caribbean. While commonly labeled as "English +speaking", the ex-British Caribbean region consists of a myriad of Creole +languages thriving alongside English. In this paper, we present Guylingo: a +comprehensive corpus designed for advancing NLP research in the domain of +Creolese (Guyanese English-lexicon Creole), the most widely spoken language in +the culturally rich nation of Guyana. We first outline our framework for +gathering and digitizing this diverse corpus, inclusive of colloquial +expressions, idioms, and regional variations in a low-resource language. We +then demonstrate the challenges of training and evaluating NLP models for +machine translation in Creole. Lastly, we discuss the unique opportunities +presented by recent NLP advancements for accelerating the formal adoption of +Creole languages as official languages in the Caribbean. + +
+
+ comment: Accepted to NAACL 2024 Main Conference Special Theme Track: Languages + of Latin America and The Caribbean +
+
+
+
+
+ + ♻ ☆ Hypernetworks for Personalizing ASR to Atypical Speech + + +
+ Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech +recognition (ASR) has recently shown promise for adapting general population +models to atypical speech. However, these approaches assume a priori knowledge +of the atypical speech disorder being adapted for -- the diagnosis of which +requires expert knowledge that is not always available. Even given this +knowledge, data scarcity and high inter/intra-speaker variability further limit +the effectiveness of traditional fine-tuning. To circumvent these challenges, +we first identify the minimal set of model parameters required for ASR +adaptation. Our analysis of each individual parameter's effect on adaptation +performance allows us to reduce Word Error Rate (WER) by half while adapting +0.03% of all weights. Alleviating the need for cohort-specific models, we next +propose the novel use of a meta-learned hypernetwork to generate highly +individualized, utterance-level adaptations on-the-fly for a diverse set of +atypical speech characteristics. Evaluating adaptation at the global, cohort +and individual-level, we show that hypernetworks generalize better to +out-of-distribution speakers, while maintaining an overall relative WER +reduction of 75.2% using 0.1% of the full parameter budget. + +
+
+
+
+
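+ The hypernetwork idea above can be pictured with a small sketch: an utterance-level embedding is mapped to the weights of a tiny low-rank residual adapter that modulates a frozen ASR encoder layer on the fly. Dimensions, the adapter placement, and the embedding source are illustrative assumptions rather than the paper's configuration.
+ import torch
+ import torch.nn as nn
+
+ class AdapterHyperNetwork(nn.Module):
+     """Maps an utterance-level embedding to low-rank residual adapter weights."""
+     def __init__(self, utt_dim=256, hidden_dim=512, rank=8):
+         super().__init__()
+         self.hidden_dim, self.rank = hidden_dim, rank
+         self.gen = nn.Sequential(
+             nn.Linear(utt_dim, 256), nn.ReLU(),
+             nn.Linear(256, 2 * hidden_dim * rank),  # down- and up-projection weights
+         )
+
+     def forward(self, utt_embedding, hidden_states):
+         # utt_embedding: (B, utt_dim); hidden_states: (B, T, hidden_dim) from a frozen ASR layer
+         params = self.gen(utt_embedding)
+         down, up = params.split(self.hidden_dim * self.rank, dim=-1)
+         down = down.view(-1, self.hidden_dim, self.rank)
+         up = up.view(-1, self.rank, self.hidden_dim)
+         # per-utterance residual adaptation of the frozen layer's activations
+         return hidden_states + hidden_states @ down @ up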
+ + ♻ ☆ An Examination on the Effectiveness of Divide-and-Conquer Prompting in + Large Language Models + + +
+ Foundation models, such as Large Language Models (LLMs), have attracted a +significant amount of interest due to their wide range of applications. +However, when handling tasks involving repetitive sub-tasks and/or deceptive +content, such as arithmetic calculation and article-level fake news detection, +simple instructional prompts suffer from inaccurate responses. Existing works +show that more complicated prompting strategies, such as Chain-of-Thought and +Least-to-Most, can unlock LLMs' powerful capacity in diverse areas. Recent +research reveals that a simple divide-and-conquer prompting strategy, i.e., +dividing the input sequence into multiple sub-inputs, can also +substantially improve LLMs' performance on specific tasks such as +misinformation detection. In this paper, we examine the utility of the +divide-and-conquer prompting strategy and identify the kinds of tasks on which this +strategy offers advantages. Specifically, we provide a theoretical analysis of the +divide-and-conquer prompting strategy that helps identify the specific tasks +where DaC prompting can bring a performance boost with a theoretical guarantee. We +then present two cases (large integer arithmetic and fact verification) where +experimental results align with our theoretical analysis. + +&#x0D;
+
+ comment: Preprint +
+
+
+
+
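+ A schematic sketch of the divide-and-conquer prompting strategy analyzed above, using article-level fact verification as the running example: the input is split into sub-inputs, each sub-input is prompted separately, and the sub-answers are merged by a simple rule. The prompt text, the merge rule, and the ask_llm helper are illustrative assumptions, not the paper's exact protocol.
+ def ask_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in any chat-completion client here")
+
+ def divide(article: str, max_chars: int = 2000) -> list[str]:
+     return [article[i:i + max_chars] for i in range(0, len(article), max_chars)]
+
+ def conquer(chunk: str, claim: str) -> str:
+     return ask_llm(
+         "Passage:\n" + chunk + "\n\nClaim: " + claim +
+         "\nDoes the passage refute the claim? Answer REFUTED, SUPPORTED, or UNCLEAR."
+     )
+
+ def verify(article: str, claim: str) -> str:
+     verdicts = [conquer(chunk, claim) for chunk in divide(article)]
+     if any("REFUTED" in v for v in verdicts):     # any refuting passage wins
+         return "REFUTED"
+     if any("SUPPORTED" in v for v in verdicts):
+         return "SUPPORTED"
+     return "UNCLEAR"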
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Magic Insert: Style-Aware Drag-and-Drop + + +
+ We present Magic Insert, a method for dragging-and-dropping subjects from a +user-provided image into a target image of a different style in a physically +plausible manner while matching the style of the target image. This work +formalizes the problem of style-aware drag-and-drop and presents a method for +tackling it by addressing two sub-problems: style-aware personalization and +realistic object insertion in stylized images. For style-aware personalization, +our method first fine-tunes a pretrained text-to-image diffusion model using +LoRA and learned text tokens on the subject image, and then infuses it with a +CLIP representation of the target style. For object insertion, we use +Bootstrapped Domain Adaption to adapt a domain-specific photorealistic object +insertion model to the domain of diverse artistic styles. Overall, the method +significantly outperforms traditional approaches such as inpainting. Finally, +we present a dataset, SubjectPlop, to facilitate evaluation and future progress +in this area. Project page: https://magicinsert.github.io/ + +
+
+ comment: Project page: https://magicinsert.github.io/ +
+
+
+
+
+ + ☆ Characterizing the Interpretability of Attention Maps in Digital + Pathology + + +
+ Interpreting machine learning model decisions is crucial for high-risk +applications like healthcare. In digital pathology, large whole slide images +(WSIs) are decomposed into smaller tiles and tile-derived features are +processed by attention-based multiple instance learning (ABMIL) models to +predict WSI-level labels. These networks generate tile-specific attention +weights, which can be visualized as attention maps for interpretability. +However, a standardized evaluation framework for these maps is lacking, +questioning their reliability and ability to detect spurious correlations that +can mislead models. We herein propose a framework to assess the ability of +attention networks to attend to relevant features in digital pathology by +creating artificial model confounders and using dedicated interpretability +metrics. Models are trained and evaluated on data with tile modifications +correlated with WSI labels, enabling the analysis of model sensitivity to +artificial confounders and the accuracy of attention maps in highlighting them. +Confounders are introduced either through synthetic tile modifications or +through tile ablations based on their specific image-based features, with the +latter being used to assess more clinically relevant scenarios. We also analyze +the impact of varying confounder quantities at both the tile and WSI levels. +Our results show that ABMIL models perform as desired within our framework. +While attention maps generally highlight relevant regions, their robustness is +affected by the type and number of confounders. Our versatile framework has the +potential to be used in the evaluation of various methods and the exploration +of image-based features driving model predictions, which could aid in biomarker +discovery. + +
+
+
+
+
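+ As background for the setup above, attention-based MIL pooling can be written as a short module: tile-level features are combined with learned attention weights, and those per-tile weights are exactly what the framework visualizes and evaluates as attention maps. The sketch below follows the commonly used gated-attention formulation of Ilse et al. (2018); dimensions are illustrative.
+ import torch
+ import torch.nn as nn
+
+ class ABMILPooling(nn.Module):
+     def __init__(self, feat_dim=512, attn_dim=128, n_classes=2):
+         super().__init__()
+         self.V = nn.Linear(feat_dim, attn_dim)   # tanh branch
+         self.U = nn.Linear(feat_dim, attn_dim)   # sigmoid gate
+         self.w = nn.Linear(attn_dim, 1)
+         self.classifier = nn.Linear(feat_dim, n_classes)
+
+     def forward(self, tile_feats):                 # (n_tiles, feat_dim), one WSI
+         gate = torch.tanh(self.V(tile_feats)) * torch.sigmoid(self.U(tile_feats))
+         attn = torch.softmax(self.w(gate), dim=0)  # (n_tiles, 1) tile attention weights
+         slide_feat = (attn * tile_feats).sum(dim=0)
+         return self.classifier(slide_feat), attn   # WSI-level logits + attention map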
+ + ☆ Boosting Consistency in Story Visualization with Rich-Contextual + Conditional Diffusion Models + + +
+ Recent research showcases the considerable potential of conditional diffusion +models for generating consistent stories. However, current methods, which +predominantly generate stories in an autoregressive and excessively +caption-dependent manner, often underrate the contextual consistency and +relevance of frames during sequential generation. To address this, we propose +novel Rich-contextual Conditional Diffusion Models (RCDMs), a two-stage +approach designed to enhance the semantic and temporal consistency of story +generation. Specifically, in the first stage, the frame-prior +transformer diffusion model is presented to predict the frame semantic +embedding of the unknown clip by aligning the semantic correlations between the +captions and frames of the known clip. The second stage establishes a robust +model with rich contextual conditions, including reference images of the known +clip, the predicted frame semantic embedding of the unknown clip, and text +embeddings of all captions. By jointly injecting these rich contextual +conditions at the image and feature levels, RCDMs can generate semantically and +temporally consistent stories. Moreover, RCDMs can generate consistent stories +with a single forward inference, unlike autoregressive models. Our +qualitative and quantitative results demonstrate that our proposed RCDMs +outperform prior methods in challenging scenarios. The code and model will be available at +https://github.com/muzishen/RCDMs. + +&#x0D;
+
+
+
+
+ + ☆ Understanding Alignment in Multimodal LLMs: A Comprehensive Study + + +
+ Preference alignment has become a crucial component in enhancing the +performance of Large Language Models (LLMs), yet its impact in Multimodal Large +Language Models (MLLMs) remains comparatively underexplored. Similar to +language models, MLLMs for image understanding tasks encounter challenges like +hallucination. In MLLMs, hallucination can occur not only by stating incorrect +facts but also by producing responses that are inconsistent with the image +content. A primary objective of alignment for MLLMs is to encourage these +models to align responses more closely with image information. Recently, +multiple works have introduced preference datasets for MLLMs and examined +different alignment methods, including Direct Preference Optimization (DPO) and +Proximal Policy Optimization (PPO). However, due to variations in datasets, +base model types, and alignment methods, it remains unclear which specific +elements contribute most significantly to the reported improvements in these +works. In this paper, we independently analyze each aspect of preference +alignment in MLLMs. We start by categorizing the alignment algorithms into two +groups, offline (such as DPO), and online (such as online-DPO), and show that +combining offline and online methods can improve the performance of the model +in certain scenarios. We review a variety of published multimodal preference +datasets and discuss how the details of their construction impact model +performance. Based on these insights, we introduce a novel way of creating +multimodal preference data called Bias-Driven Hallucination Sampling (BDHS) +that needs neither additional annotation nor external models, and show that it +can achieve competitive performance to previously published alignment work for +multimodal models across a range of benchmarks. + +
+
+
+
+
+ + ☆ SUPER: Seated Upper Body Pose Estimation using mmWave Radars + + +
+ In industrial countries, adults spend a considerable amount of time sedentary +each day at work, driving and during activities of daily living. Characterizing +seated upper body human poses using mmWave radars is an important, yet +under-studied topic with many applications in human-machine interaction, +transportation and road safety. In this work, we devise SUPER, a framework for +seated upper body human pose estimation that utilizes dual-mmWave radars in +close proximity. A novel masking algorithm is proposed to coherently fuse data +from the radars to generate intensity and Doppler point clouds with +complementary information for high-motion but small radar cross section areas +(e.g., upper extremities) and low-motion but large RCS areas (e.g., torso). A +lightweight neural network extracts both global and local features of the upper +body and outputs pose parameters for the Skinned Multi-Person Linear (SMPL) +model. Extensive leave-one-subject-out experiments on various motion sequences +from multiple subjects show that SUPER outperforms a state-of-the-art baseline +method by 30 -- 184%. We also demonstrate its utility in a simple downstream +task for hand-object interaction. + +&#x0D;
+
+
+
+
+ + ☆ Meta 3D AssetGen: Text-to-Mesh Generation with High-Quality Geometry, + Texture, and PBR Materials + + +
+ We present Meta 3D AssetGen (AssetGen), a significant advancement in +text-to-3D generation which produces faithful, high-quality meshes with texture +and material control. Compared to works that bake shading into the 3D object's +appearance, AssetGen outputs physically-based rendering (PBR) materials, +supporting realistic relighting. AssetGen first generates several views of the +object with factored shaded and albedo appearance channels, and then +reconstructs colours, metalness and roughness in 3D, using a deferred shading +loss for efficient supervision. It also uses a signed distance function to +represent the 3D shape more reliably and introduces a corresponding loss for direct +shape supervision. This is implemented using fused kernels for high memory +efficiency. After mesh extraction, a texture refinement transformer operating +in UV space significantly improves sharpness and details. AssetGen achieves 17% +improvement in Chamfer Distance and 40% in LPIPS over the best concurrent work +for few-view reconstruction, and a human preference of 72% over the best +industry competitors of comparable speed, including those that support PBR. +Project page with generated assets: https://assetgen.github.io + +&#x0D;
+
+ comment: Project Page: https://assetgen.github.io +
+
+
+
+
+ + ☆ Predicting Visual Attention in Graphic Design Documents + + +
+ We present a model for predicting visual attention during the free viewing of +graphic design documents. While existing works on this topic have aimed at +predicting static saliency of graphic designs, our work is the first attempt to +predict both spatial attention and dynamic temporal order in which the document +regions are fixated by gaze using a deep learning based model. We propose a +two-stage model for predicting dynamic attention on such documents, with +webpages being our primary choice of document design for demonstration. In the +first stage, we predict the saliency maps for each of the document components +(e.g. logos, banners, texts, etc. for webpages) conditioned on the type of +document layout. These component saliency maps are then jointly used to predict +the overall document saliency. In the second stage, we use these +layout-specific component saliency maps as the state representation for an +inverse reinforcement learning model of fixation scanpath prediction during +document viewing. To test our model, we collected a new dataset consisting of +eye movements from 41 people freely viewing 450 webpages (the largest dataset +of its kind). Experimental results show that our model outperforms existing +models in both saliency and scanpath prediction for webpages, and also +generalizes very well to other graphic design documents such as comics, +posters, mobile UIs, etc. and natural images. + +
+
+
+
+
+ + ☆ Parameter Matching Attack: Enhancing Practical Applicability of + Availability Attacks + + +
+ The widespread use of personal data for training machine learning models +raises significant privacy concerns, as individuals have limited control over +how their public data is subsequently utilized. Availability attacks have +emerged as a means for data owners to safeguard their data by designing +imperceptible perturbations that degrade model performance when incorporated +into training datasets. However, existing availability attacks exhibit +limitations in practical applicability, particularly when only a portion of the +data can be perturbed. To address this challenge, we propose a novel +availability attack approach termed Parameter Matching Attack (PMA). PMA is the +first availability attack that works when only a portion of data can be +perturbed. PMA optimizes perturbations so that when the model is trained on a +mixture of clean and perturbed data, the resulting model will approach a model +designed to perform poorly. Experimental results across four datasets +demonstrate that PMA outperforms existing methods, achieving significant model +performance degradation when a part of the training data is perturbed. Our code +is available in the supplementary. + +&#x0D;
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Meta 3D TextureGen: Fast and Consistent Texture Generation for 3D + Objects + + +
+ The recent availability and adaptability of text-to-image models has sparked +a new era in many related domains that benefit from the learned text priors as +well as high-quality and fast generation capabilities, one of which is texture +generation for 3D objects. Although recent texture generation methods achieve +impressive results by using text-to-image networks, the combination of global +consistency, quality, and speed, which is crucial for advancing texture +generation to real-world applications, remains elusive. To that end, we +introduce Meta 3D TextureGen: a new feedforward method comprised of two +sequential networks aimed at generating high-quality and globally consistent +textures for arbitrary geometries of any complexity degree in less than 20 +seconds. Our method achieves state-of-the-art results in quality and speed by +conditioning a text-to-image model on 3D semantics in 2D space and fusing them +into a complete and high-resolution UV texture map, as demonstrated by +extensive qualitative and quantitative evaluations. In addition, we introduce a +texture enhancement network that is capable of up-scaling any texture by an +arbitrary ratio, producing 4k pixel resolution textures. + +
+
+
+
+
+ + ☆ Close, But Not There: Boosting Geographic Distance Sensitivity in Visual + Place Recognition + + +
+ Visual Place Recognition (VPR) plays a critical role in many localization and +mapping pipelines. It consists of retrieving the closest sample to a query +image, in a certain embedding space, from a database of geotagged references. +The image embedding is learned to effectively describe a place despite +variations in visual appearance, viewpoint, and geometric changes. In this +work, we formulate how limitations in the Geographic Distance Sensitivity of +current VPR embeddings result in a high probability of incorrectly sorting the +top-k retrievals, negatively impacting the recall. In order to address this +issue in single-stage VPR, we propose a novel mining strategy, CliqueMining, +that selects positive and negative examples by sampling cliques from a graph of +visually similar images. Our approach boosts the sensitivity of VPR embeddings +at small distance ranges, significantly improving the state of the art on +relevant benchmarks. In particular, we raise recall@1 from 75% to 82% in MSLS +Challenge, and from 76% to 90% in Nordland. Models and code are available at +https://github.com/serizba/cliquemining. + +
+
+
+
+
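+ The mining step described above can be pictured with a small sketch: build a graph whose edges link visually similar database images, enumerate its cliques, and draw training batches from those cliques so the embedding is forced to order places that look alike by geographic distance. The similarity threshold and the batch-sampling rule below are illustrative assumptions, not the paper's exact procedure.
+ import itertools
+ import networkx as nx
+ import numpy as np
+
+ def build_similarity_graph(descriptors: np.ndarray, threshold: float = 0.8) -> nx.Graph:
+     # descriptors: (n_images, d), assumed L2-normalised image embeddings
+     sims = descriptors @ descriptors.T
+     graph = nx.Graph()
+     graph.add_nodes_from(range(len(descriptors)))
+     for i, j in itertools.combinations(range(len(descriptors)), 2):
+         if sims[i, j] >= threshold:
+             graph.add_edge(i, j)
+     return graph
+
+ def sample_clique_batch(graph: nx.Graph, rng: np.random.Generator, min_size: int = 4):
+     # pick one maximal clique of visually similar images as a hard training batch
+     cliques = [c for c in nx.find_cliques(graph) if len(c) >= min_size]
+     return list(cliques[rng.integers(len(cliques))]) if cliques else []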
+ + ☆ AXIAL: Attention-based eXplainability for Interpretable Alzheimer's + Localized Diagnosis using 2D CNNs on 3D MRI brain scans + + +
+ This study presents an innovative method for Alzheimer's disease diagnosis +using 3D MRI designed to enhance the explainability of model decisions. Our +approach adopts a soft attention mechanism, enabling 2D CNNs to extract +volumetric representations. At the same time, the importance of each slice in +decision-making is learned, allowing the generation of a voxel-level attention +map that produces an explainable MRI. To test our method and ensure the +reproducibility of our results, we chose a standardized collection of MRI data +from the Alzheimer's Disease Neuroimaging Initiative (ADNI). On this dataset, +our method significantly outperforms state-of-the-art methods in (i) +distinguishing AD from cognitively normal (CN) with an accuracy of 0.856 and +Matthew's correlation coefficient (MCC) of 0.712, representing improvements of +2.4\% and 5.3\% respectively over the second-best, and (ii) the prognostic +task of discerning stable from progressive mild cognitive impairment (MCI) with +an accuracy of 0.725 and MCC of 0.443, showing improvements of 10.2\% and +20.5\% respectively over the second-best. We achieved this prognostic result by +adopting a double transfer learning strategy, which enhanced sensitivity to +morphological changes and facilitated early-stage AD detection. With +voxel-level precision, our method identified which specific areas receive +attention, highlighting these predominant brain regions: the +\emph{hippocampus}, the \emph{amygdala}, the \emph{parahippocampal}, and the +\emph{inferior lateral ventricles}. All these areas are clinically associated +with AD development. Furthermore, our approach consistently found the same +AD-related areas across different cross-validation folds, proving its +robustness and precision in highlighting areas that align closely with known +pathological markers of the disease. + +&#x0D;
+
+ comment: 21 pages, 9 figures, 9 tables +
+
+
+
+
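+ A minimal sketch of the slice-attention idea described above: a 2D CNN encodes each axial slice, a learned soft attention over slices produces the volume-level representation, and the learned weights expose which slices drive the decision. The backbone, dimensions, and single-channel handling are illustrative assumptions, not the paper's exact architecture.
+ import torch
+ import torch.nn as nn
+ from torchvision.models import resnet18
+
+ class SliceAttentionNet(nn.Module):
+     def __init__(self, n_classes=2, feat_dim=512):
+         super().__init__()
+         backbone = resnet18(weights=None)
+         self.encoder = nn.Sequential(*list(backbone.children())[:-1])  # 2D CNN per slice
+         self.attn = nn.Linear(feat_dim, 1)
+         self.classifier = nn.Linear(feat_dim, n_classes)
+
+     def forward(self, volume):                              # (B, n_slices, 1, H, W) grayscale MRI
+         b, s = volume.shape[:2]
+         x = volume.flatten(0, 1).repeat(1, 3, 1, 1)         # to 3 channels for the backbone
+         feats = self.encoder(x).flatten(1).view(b, s, -1)   # (B, n_slices, feat_dim)
+         weights = torch.softmax(self.attn(feats), dim=1)    # learned importance of each slice
+         volume_feat = (weights * feats).sum(dim=1)
+         return self.classifier(volume_feat), weights.squeeze(-1)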
+ + ☆ Video Watermarking: Safeguarding Your Video from (Unauthorized) + Annotations by Video-based LLMs + + +
+ The advent of video-based Large Language Models (LLMs) has significantly +enhanced video understanding. However, it has also raised some safety concerns +regarding data protection, as videos can be more easily annotated, even without +authorization. This paper introduces Video Watermarking, a novel technique to +protect videos from unauthorized annotations by such video-based LLMs, +especially concerning the video content and description, in response to +specific queries. By imperceptibly embedding watermarks into key video frames +with multi-modal flow-based losses, our method preserves the viewing experience +while preventing misuse by video-based LLMs. Extensive experiments show that +Video Watermarking significantly reduces the comprehensibility of videos with +various video-based LLMs, demonstrating both stealth and robustness. In +essence, our method provides a solution for securing video content, ensuring +its integrity and confidentiality in the face of evolving video-based LLMs +technologies. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.13507 +
+
+
+
+
+ + ☆ Tiny-PULP-Dronets: Squeezing Neural Networks for Faster and Lighter + Inference on Multi-Tasking Autonomous Nano-Drones + + +
+ Pocket-sized autonomous nano-drones can revolutionize many robotic use cases, +such as visual inspection in narrow, constrained spaces, and ensure safer +human-robot interaction due to their tiny form factor and weight -- i.e., tens +of grams. This compelling vision is challenged by the high level of +intelligence needed aboard, which clashes against the limited computational and +storage resources available on PULP (parallel-ultra-low-power) MCU class +navigation and mission controllers that can be hosted aboard. This work starts +from PULP-Dronet, a state-of-the-art convolutional neural network for +autonomous navigation on nano-drones. We introduce Tiny-PULP-Dronet: a novel +methodology that squeezes the model size by more than one order of magnitude (50x +fewer parameters) and the number of operations required to run inference (27x +fewer multiply-and-accumulate operations) while keeping flight performance similar to PULP-Dronet. This +massive reduction paves the way towards affordable multi-tasking on +nano-drones, a fundamental requirement for achieving high-level intelligence. + +&#x0D;
+
+ comment: 3 Figures, 1 table. Accepted for publication at IEEE Artificial + Intelligence Circuits and Systems (AICAS), 2022 +
+
+
+
+
+ + ☆ Face Reconstruction Transfer Attack as Out-of-Distribution + Generalization ECCV2024 + + +
+ Understanding the vulnerability of face recognition systems to malicious +attacks is of critical importance. Previous works have focused on +reconstructing face images that can penetrate a targeted verification system. +Even in the white-box scenario, however, naively reconstructed images +misrepresent the identity information, hence the attacks are easily neutralized +once the face system is updated or changed. In this paper, we aim to +reconstruct face images which are capable of transferring face attacks to +unseen encoders. We term this problem Face Reconstruction Transfer Attack +(FRTA) and show that it can be formulated as an out-of-distribution (OOD) +generalization problem. Inspired by its OOD nature, we propose to solve FRTA by +Averaged Latent Search and Unsupervised Validation with pseudo target (ALSUV). +To strengthen the reconstruction attack on OOD unseen encoders, ALSUV +reconstructs the face by searching the latent of the amortized generator StyleGAN2 +through multiple latent optimization, latent optimization trajectory averaging, +and unsupervised validation with a pseudo target. We demonstrate the efficacy +and generalization of our method on widely used face datasets, accompanying it +with extensive ablation studies and visual, qualitative, and quantitative +analyses. The source code will be released. + +&#x0D;
+
+ comment: Accepted to ECCV2024 +
+
+
+
+
+ + ☆ Consistency Flow Matching: Defining Straight Flows with Velocity + Consistency + + +
+ Flow matching (FM) is a general framework for defining probability paths via +Ordinary Differential Equations (ODEs) to transform between noise and data +samples. Recent approaches attempt to straighten these flow trajectories to +generate high-quality samples with fewer function evaluations, typically +through iterative rectification methods or optimal transport solutions. In this +paper, we introduce Consistency Flow Matching (Consistency-FM), a novel FM +method that explicitly enforces self-consistency in the velocity field. +Consistency-FM directly defines straight flows starting from different times to +the same endpoint, imposing constraints on their velocity values. Additionally, +we propose a multi-segment training approach for Consistency-FM to enhance +expressiveness, achieving a better trade-off between sampling quality and +speed. Preliminary experiments demonstrate that our Consistency-FM +significantly improves training efficiency by converging 4.4x faster than +consistency models and 1.7x faster than rectified flow models while achieving +better generation quality. Our code is available at: +https://github.com/YangLing0818/consistency_flow_matching + +
+
+ comment: Code: https://github.com/YangLing0818/consistency_flow_matching +
+
+
+
+
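+ For readers unfamiliar with the notation, the standard flow matching objective and the straight-flow idea referenced above can be summarized as follows; this is a schematic reading of the abstract, not the paper's exact loss.
+ \[ \frac{dx_t}{dt} = v_\theta(x_t, t), \qquad \mathcal{L}_{\mathrm{FM}} = \mathbb{E}_{t,\,x_0,\,x_1}\,\big\| v_\theta(x_t, t) - (x_1 - x_0) \big\|^2, \qquad x_t = (1-t)\,x_0 + t\,x_1 . \]
+ A flow is straight when the predicted endpoint is identical from every starting time,
+ \[ x_t + (1 - t)\, v_\theta(x_t, t) \;=\; x_s + (1 - s)\, v_\theta(x_s, s) \qquad \text{for all } s, t \in [0, 1], \]
+ which in turn constrains the velocity values along the trajectory to be consistent.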
+ + ☆ Similarity Distance-Based Label Assignment for Tiny Object Detection + + +
+ Tiny object detection is becoming one of the most challenging tasks in +computer vision because of the limited object size and lack of information. The +label assignment strategy is a key factor affecting the accuracy of object +detection. Although there are some effective label assignment strategies for +tiny objects, most of them focus on reducing the sensitivity to the bounding +boxes to increase the number of positive samples, and have fixed +hyperparameters that need to be set. However, more positive samples may not necessarily +lead to better detection results; in fact, excessive positive samples may lead +to more false positives. In this paper, we introduce a simple but effective +strategy named the Similarity Distance (SimD) to evaluate the similarity +between bounding boxes. This proposed strategy not only considers both location +and shape similarity but also learns hyperparameters adaptively, ensuring that +it can adapt to different datasets and various object sizes in a dataset. Our +approach can be simply applied in common anchor-based detectors in place of the +IoU for label assignment and Non Maximum Suppression (NMS). Extensive +experiments on four mainstream tiny object detection datasets demonstrate the +superior performance of our method, which is 1.8 AP points higher overall and 4.1 AP points +higher on very tiny objects than the state-of-the-art competitors on AI-TOD. Code is +available at: \url{https://github.com/cszzshi/SimD}. + +&#x0D;
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ☆ TokenPacker: Efficient Visual Projector for Multimodal LLM + + +
+ The visual projector serves as an essential bridge between the visual encoder +and the Large Language Model (LLM) in a Multimodal LLM (MLLM). Typically, MLLMs +adopt a simple MLP to preserve all visual contexts via one-to-one +transformation. However, the visual tokens are redundant and can be +considerably increased when dealing with high-resolution images, impairing the +efficiency of MLLMs significantly. Some recent works have introduced a resampler +or abstractor to reduce the number of resulting visual tokens. Unfortunately, +they fail to capture finer details and undermine the visual reasoning +capabilities of MLLMs. In this work, we propose a novel visual projector, which +adopts a coarse-to-fine scheme to inject the enriched characteristics to +generate the condensed visual tokens. Specifically, we first interpolate the +visual features as a low-resolution point query, providing the overall visual +representation as the foundation. Then, we introduce a region-to-point +injection module that utilizes high-resolution, multi-level region-based cues +as fine-grained reference keys and values, allowing them to be fully absorbed +within the corresponding local context region. This step effectively updates +the coarse point query, transforming it into an enriched one for the subsequent +LLM reasoning. Extensive experiments demonstrate that our approach compresses +the visual tokens by 75%~89%, while achieving comparable or even better +performance across diverse benchmarks with significantly higher efficiency. The +source codes can be found at https://github.com/CircleRadon/TokenPacker. + +&#x0D;
+
+ comment: 16 pages, Codes:https://github.com/CircleRadon/TokenPacker +
+
+
+
+
+ + ☆ SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring + Expression Segmentation ECCV 2024 + + +
+ Referring Expression Segmentation (RES) aims to provide a segmentation mask +of the target object in an image referred to by the text (i.e., referring +expression). Existing methods require large-scale mask annotations. Moreover, +such approaches do not generalize well to unseen/zero-shot scenarios. To +address the aforementioned issues, we propose a weakly-supervised bootstrapping +architecture for RES with several new algorithmic innovations. To the best of +our knowledge, ours is the first approach that considers only a fraction of +both mask and box annotations (shown in Figure 1 and Table 1) for training. To +enable principled training of models in such low-annotation settings, improve +image-text region-level alignment, and further enhance spatial localization of +the target object in the image, we propose Cross-modal Fusion with Attention +Consistency module. For automatic pseudo-labeling of unlabeled samples, we +introduce a novel Mask Validity Filtering routine based on a spatially aware +zero-shot proposal scoring approach. Extensive experiments show that with just +30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to +58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR +respectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also +outperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a +fully-supervised setting and demonstrates strong generalization capabilities in +unseen/zero-shot tasks. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Real HSI-MSI-PAN image dataset for the + hyperspectral/multi-spectral/panchromatic image fusion and super-resolution + fields + + +
+ Nowadays, most of the hyperspectral image (HSI) fusion experiments are based +on simulated datasets to compare different fusion methods. However, most of the +spectral response functions and spatial downsampling functions used to create +the simulated datasets are not entirely accurate, resulting in deviations in +spatial and spectral features between the generated images for fusion and the +real images for fusion. This reduces the credibility of the fusion algorithm, +causing unfairness in the comparison between different algorithms and hindering +the development of the field of hyperspectral image fusion. Therefore, we +release a real HSI/MSI/PAN image dataset to promote the development of the +field of hyperspectral image fusion. These three images are spatially +registered, meaning fusion can be performed between HSI and MSI, HSI and PAN +image, MSI and PAN image, as well as among HSI, MSI, and PAN image. This real +dataset is available at https://aistudio.baidu.com/datasetdetail/281612. +The related code to process the data is available at +https://github.com/rs-lsl/CSSNet. + +&#x0D;
+
+
+
+
+ + ☆ OpenSlot: Mixed Open-set Recognition with Object-centric Learning + + +
+ Existing open-set recognition (OSR) studies typically assume that each image +contains only one class label, and the unknown test set (negative) has a +disjoint label space from the known test set (positive), a scenario termed +full-label shift. This paper introduces the mixed OSR problem, where test +images contain multiple class semantics, with known and unknown classes +co-occurring in negatives, leading to a more challenging super-label shift. +Addressing the mixed OSR requires classification models to accurately +distinguish different class semantics within images and measure their +"knowness". In this study, we propose the OpenSlot framework, built upon +object-centric learning. OpenSlot utilizes slot features to represent diverse +class semantics and produce class predictions. Through our proposed +anti-noise-slot (ANS) technique, we mitigate the impact of noise (invalid and +background) slots during classification training, effectively addressing the +semantic misalignment between class predictions and the ground truth. We +conduct extensive experiments with OpenSlot on mixed & conventional OSR +benchmarks. Without elaborate designs, OpenSlot not only exceeds existing OSR +studies in detecting super-label shifts across single & multi-label mixed OSR +tasks but also achieves state-of-the-art performance on conventional +benchmarks. Remarkably, our method can localize class objects without using +bounding boxes during training. The competitive performance in open-set object +detection demonstrates OpenSlot's ability to explicitly explain label shifts +and benefits in computational efficiency and generalization. + +
+
+ comment: This study is under IEEE TMM review +
+
+
+
+
+ + ☆ OpenVid-1M: A Large-Scale High-Quality Dataset for Text-to-video + Generation + + +
+ Text-to-video (T2V) generation has recently garnered significant attention +thanks to the large multi-modality model Sora. However, T2V generation still +faces two important challenges: 1) The lack of a precise, open-sourced, high-quality +dataset. The previously popular video datasets, e.g., WebVid-10M and Panda-70M, +are either of low quality or too large for most research institutions. +Therefore, it is challenging but crucial to collect precise, high-quality +text-video pairs for T2V generation. 2) Failure to fully utilize textual +information. Recent T2V methods have focused on vision transformers, using a +simple cross attention module for video generation, which falls short of +thoroughly extracting semantic information from the text prompt. To address these +issues, we introduce OpenVid-1M, a precise high-quality dataset with expressive +captions. This open-scenario dataset contains over 1 million text-video pairs, +facilitating research on T2V generation. Furthermore, we curate 433K 1080p +videos from OpenVid-1M to create OpenVidHD-0.4M, advancing high-definition +video generation. Additionally, we propose a novel Multi-modal Video Diffusion +Transformer (MVDiT) capable of mining both structure information from visual +tokens and semantic information from text tokens. Extensive experiments and +ablation studies verify the superiority of OpenVid-1M over previous datasets +and the effectiveness of our MVDiT. + +&#x0D;
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ Investigating Event-Based Cameras for Video Frame Interpolation in + Sports + + +
+ Slow-motion replays provide a thrilling perspective on pivotal moments within +sports games, offering a fresh and captivating visual experience. However, +capturing slow-motion footage typically demands high-tech, expensive cameras +and infrastructures. Deep learning Video Frame Interpolation (VFI) techniques +have emerged as a promising avenue, capable of generating high-speed footage +from regular camera feeds. Moreover, the utilization of event-based cameras has +recently gathered attention as they provide valuable motion information between +frames, further enhancing the VFI performances. In this work, we present a +first investigation of event-based VFI models for generating sports slow-motion +videos. Particularly, we design and implement a bi-camera recording setup, +including an RGB and an event-based camera to capture sports videos, to +temporally align and spatially register both cameras. Our experimental +validation demonstrates that TimeLens, an off-the-shelf event-based VFI model, +can effectively generate slow-motion footage for sports videos. This first +investigation underscores the practical utility of event-based cameras in +producing sports slow-motion content and lays the groundwork for future +research endeavors in this domain. + +
+
+
+
+
+ + ☆ GCF: Graph Convolutional Networks for Facial Expression Recognition + + +
+ Facial Expression Recognition (FER) is vital for understanding interpersonal +communication. However, existing classification methods often face challenges +such as vulnerability to noise, imbalanced datasets, overfitting, and +generalization issues. In this paper, we propose GCF, a novel approach that +utilizes Graph Convolutional Networks for FER. GCF integrates Convolutional +Neural Networks (CNNs) for feature extraction, using either custom +architectures or pretrained models. The extracted visual features are then +represented on a graph, enhancing local CNN features with global features via a +Graph Convolutional Neural Network layer. We evaluate GCF on benchmark datasets +including CK+, JAFFE, and FERG. The results show that GCF significantly +improves performance over state-of-the-art methods. For example, GCF enhances +the accuracy of ResNet18 from 92% to 98% on CK+, from 66% to 89% on JAFFE, and +from 94% to 100% on FERG. Similarly, GCF improves the accuracy of VGG16 from +89% to 97% on CK+, from 72% to 92% on JAFFE, and from 96% to 99.49% on FERG. We +provide a comprehensive analysis of our approach, demonstrating its +effectiveness in capturing nuanced facial expressions. By integrating graph +convolutions with CNNs, GCF significantly advances FER, offering improved +accuracy and robustness in real-world applications. + +
+
+
+
+
+ + ☆ Enable the Right to be Forgotten with Federated Client Unlearning in + Medical Imaging + + +
+ The right to be forgotten, as stated in most data regulations, poses an +underexplored challenge in federated learning (FL), leading to the development +of federated unlearning (FU). However, current FU approaches often face +trade-offs between efficiency, model performance, forgetting efficacy, and +privacy preservation. In this paper, we delve into the paradigm of Federated +Client Unlearning (FCU) to guarantee a client the right to erase the +contribution or the influence, introducing the first FU framework in medical +imaging. In the unlearning process of a client, the proposed model-contrastive +unlearning marks a pioneering step towards feature-level unlearning, and +frequency-guided memory preservation ensures smooth forgetting of local +knowledge while maintaining the generalizability of the trained global model, +thus avoiding performance compromises and guaranteeing rapid post-training. We +evaluated our FCU framework on two public medical image datasets, including +Intracranial hemorrhage diagnosis and skin lesion diagnosis, demonstrating that +our framework outperformed other state-of-the-art FU frameworks, with an +expected speed-up of 10-15 times compared with retraining from scratch. The +code and the organized datasets can be found at: +https://github.com/dzp2095/FCU. + +
+
+
+
+
+ + ☆ Conceptual Codebook Learning for Vision-Language Models + + +
+ In this paper, we propose Conceptual Codebook Learning (CoCoLe), a novel +fine-tuning method for vision-language models (VLMs) to address the challenge +of improving the generalization capability of VLMs while fine-tuning them on +downstream tasks in a few-shot setting. We recognize that visual concepts, such +as textures, shapes, and colors are naturally transferable across domains and +play a crucial role in generalization tasks. Motivated by this interesting +finding, we learn a conceptual codebook consisting of visual concepts as keys +and conceptual prompts as values, which serves as a link between the image +encoder's outputs and the text encoder's inputs. Specifically, for a given +image, we leverage the codebook to identify the most relevant conceptual +prompts associated with the class embeddings to perform the classification. +Additionally, we incorporate a handcrafted concept cache as a regularization to +alleviate the overfitting issues in low-shot scenarios. We observe that this +conceptual codebook learning method is able to achieve enhanced alignment +between visual and linguistic modalities. Extensive experimental results +demonstrate that our CoCoLe method remarkably outperforms the existing +state-of-the-art methods across various evaluation settings, including +base-to-new generalization, cross-dataset evaluation, and domain generalization +tasks. Detailed ablation studies further confirm the efficacy of each component +in CoCoLe. + +
+
+
+
+
+ + ☆ CALICO: Confident Active Learning with Integrated Calibration ICANN2024 + + +
+ The growing use of deep learning in safety-critical applications, such as +medical imaging, has raised concerns about limited labeled data, where this +demand is amplified as model complexity increases, posing hurdles for domain +experts to annotate data. In response to this, active learning (AL) is used to +efficiently train models with limited annotation costs. In the context of deep +neural networks (DNNs), AL often uses confidence or probability outputs as a +score for selecting the most informative samples. However, modern DNNs exhibit +unreliable confidence outputs, making calibration essential. We propose an AL +framework that self-calibrates the confidence used for sample selection during +the training process, referred to as Confident Active Learning with Integrated +CalibratiOn (CALICO). CALICO incorporates the joint training of a classifier +and an energy-based model, instead of the standard softmax-based classifier. +This approach allows for simultaneous estimation of the input data distribution +and the class probabilities during training, improving calibration without +needing an additional labeled dataset. Experimental results showcase improved +classification performance compared to a softmax-based classifier with fewer +labeled samples. Furthermore, the calibration stability of the model is +observed to depend on the prior class distribution of the data. + +
+
+ comment: Accepted to ICANN2024 +
+
+
+
+
+ + ☆ Why do LLaVA Vision-Language Models Reply to Images in English? + + +
+ We uncover a surprising multilingual bias occurring in a popular class of +multimodal vision-language models (VLMs). Including an image in the query to a +LLaVA-style VLM significantly increases the likelihood of the model returning +an English response, regardless of the language of the query. This paper +investigates the causes of this loss with a two-pronged approach that combines +extensive ablation of the design space with a mechanistic analysis of the +models' internal representations of image and text inputs. Both approaches +indicate that the issue stems in the language modelling component of the LLaVA +model. Statistically, we find that switching the language backbone for a +bilingual language model has the strongest effect on reducing this error. +Mechanistically, we provide compelling evidence that visual inputs are not +mapped to a similar space as text ones, and that intervening on intermediary +attention layers can reduce this bias. Our findings provide important insights +to researchers and engineers seeking to understand the crossover between +multimodal and multilingual spaces, and contribute to the goal of developing +capable and inclusive VLMs for non-English contexts. + +
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ MIGC++: Advanced Multi-Instance Generation Controller for Image + Synthesis + + +
+ We introduce the Multi-Instance Generation (MIG) task, which focuses on +generating multiple instances within a single image, each accurately placed at +predefined positions with attributes such as category, color, and shape, +strictly following user specifications. MIG faces three main challenges: +avoiding attribute leakage between instances, supporting diverse instance +descriptions, and maintaining consistency in iterative generation. To address +attribute leakage, we propose the Multi-Instance Generation Controller (MIGC). +MIGC generates multiple instances through a divide-and-conquer strategy, +breaking down multi-instance shading into single-instance tasks with singular +attributes, later integrated. To provide more types of instance descriptions, +we developed MIGC++. MIGC++ allows attribute control through text \& images and +position control through boxes \& masks. Lastly, we introduced the +Consistent-MIG algorithm to enhance the iterative MIG ability of MIGC and +MIGC++. This algorithm ensures consistency in unmodified regions during the +addition, deletion, or modification of instances, and preserves the identity of +instances when their attributes are changed. We introduce the COCO-MIG and +Multimodal-MIG benchmarks to evaluate these methods. Extensive experiments on +these benchmarks, along with the COCO-Position benchmark and DrawBench, +demonstrate that our methods substantially outperform existing techniques, +maintaining precise control over aspects including position, attribute, and +quantity. Project page: https://github.com/limuloo/MIGC. + +
+
+
+
+
+ + ☆ VFIMamba: Video Frame Interpolation with State Space Models + + +
+ Inter-frame modeling is pivotal in generating intermediate frames for video +frame interpolation (VFI). Current approaches predominantly rely on convolution +or attention-based models, which often either lack sufficient receptive fields +or entail significant computational overheads. Recently, Selective State Space +Models (S6) have emerged, tailored specifically for long sequence modeling, +offering both linear complexity and data-dependent modeling capabilities. In +this paper, we propose VFIMamba, a novel frame interpolation method for +efficient and dynamic inter-frame modeling by harnessing the S6 model. Our +approach introduces the Mixed-SSM Block (MSB), which initially rearranges +tokens from adjacent frames in an interleaved fashion and subsequently applies +multi-directional S6 modeling. This design facilitates the efficient +transmission of information across frames while upholding linear complexity. +Furthermore, we introduce a novel curriculum learning strategy that +progressively cultivates proficiency in modeling inter-frame dynamics across +varying motion magnitudes, fully unleashing the potential of the S6 model. +Experimental findings showcase that our method attains state-of-the-art +performance across diverse benchmarks, particularly excelling in +high-resolution scenarios. In particular, on the X-TEST dataset, VFIMamba +demonstrates a noteworthy improvement of 0.80 dB for 4K frames and 0.96 dB for +2K frames. + +
+
+
+
+
+ + ☆ Semantically Guided Representation Learning For Action Anticipation ECCV'24 + + +
+ Action anticipation is the task of forecasting future activity from a +partially observed sequence of events. However, this task is exposed to +intrinsic future uncertainty and the difficulty of reasoning upon +interconnected actions. Unlike previous works that focus on extrapolating +better visual and temporal information, we concentrate on learning action +representations that are aware of their semantic interconnectivity based on +prototypical action patterns and contextual co-occurrences. To this end, we +propose the novel Semantically Guided Representation Learning (S-GEAR) +framework. S-GEAR learns visual action prototypes and leverages language models +to structure their relationship, inducing semanticity. To gather insights on +S-GEAR's effectiveness, we test it on four action anticipation benchmarks, +obtaining improved results compared to previous works: +3.5, +2.7, and +3.5 +absolute points on Top-1 Accuracy on Epic-Kitchen 55, EGTEA Gaze+ and 50 +Salads, respectively, and +0.8 on Top-5 Recall on Epic-Kitchens 100. We further +observe that S-GEAR effectively transfers the geometric associations between +actions from language to visual prototypes. Finally, S-GEAR opens new research +frontiers in anticipation tasks by demonstrating the intricate impact of action +semantic interconnectivity. + +
+
+ comment: Accepted as a full paper at ECCV'24 with Paper ID #4140 +
+
+
+
+
+ + ☆ Rethinking Data Augmentation for Robust LiDAR Semantic Segmentation in + Adverse Weather ECCV 2024 + + +
+ Existing LiDAR semantic segmentation methods often struggle with performance
+declines in adverse weather conditions. Previous research has addressed this
+issue by simulating adverse weather or employing universal data augmentation
+during training. However, these methods lack a detailed analysis and
+understanding of how adverse weather negatively affects LiDAR semantic
+segmentation performance. Motivated by this issue, we identified key factors of
+adverse weather and conducted a toy experiment to pinpoint the main causes of
+performance degradation: (1) Geometric perturbation due to refraction caused by
+fog or droplets in the air and (2) Point drop due to energy absorption and
+occlusions. Based on these findings, we propose new strategic data augmentation
+techniques. First, we introduce Selective Jittering (SJ), which jitters points
+within a random range of depth (or angle) to mimic geometric perturbation.
+Additionally, we develop a Learnable Point Drop (LPD) that learns vulnerable
+erase patterns with a Deep Q-Learning Network to approximate the point drop
+phenomenon caused by adverse weather conditions. Without precise weather
+simulation, these techniques strengthen the LiDAR semantic segmentation model
+by exposing it to vulnerable conditions identified by our data-centric
+analysis. Experimental results confirmed the suitability of the proposed data
+augmentation methods for enhancing robustness against adverse weather
+conditions. Our method attains a remarkable 39.5 mIoU on the
+SemanticKITTI-to-SemanticSTF benchmark, surpassing the previous
+state-of-the-art by over 5.4%p and tripling the improvement over the baseline
+achieved by previous methods.
+
+
+ comment: 19 pages, 6 figures, accepted at ECCV 2024
+
+
+
+
+
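+ To make the Selective Jittering idea above concrete, here is a small,
+hypothetical NumPy sketch that perturbs only the points whose depth falls
+inside a randomly drawn window, mimicking refraction-like geometric noise. The
+window width, noise scale, and function name are illustrative assumptions and
+not the paper's hyperparameters; the learnable point-drop component is not
+shown.
+
+import numpy as np
+
+def selective_jitter(points, noise_std=0.05, seed=None):
+    """points: (N, 3) LiDAR coordinates; returns a jittered copy."""
+    rng = np.random.default_rng(seed)
+    depth = np.linalg.norm(points, axis=1)
+    lo = rng.uniform(depth.min(), depth.max())
+    hi = min(lo + rng.uniform(2.0, 10.0), depth.max())  # assumed window width
+    mask = (depth >= lo) & (depth <= hi)
+    jittered = points.copy()
+    jittered[mask] += rng.normal(0.0, noise_std, size=(int(mask.sum()), 3))
+    return jittered
+
+cloud = np.random.uniform(-40, 40, size=(1000, 3))
+print(selective_jitter(cloud, seed=0).shape)  # (1000, 3)
+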
+ + ☆ A Refreshed Similarity-based Upsampler for Direct High-Ratio Feature + Upsampling + + +
+ Feature upsampling is a fundamental and indispensable ingredient of almost
+all current network structures for image segmentation tasks. Recently, a
+popular similarity-based feature upsampling pipeline has been proposed, which
+utilizes a high-resolution feature as guidance to help upsample the
+low-resolution deep feature based on their local similarity. Albeit achieving
+promising performance, this pipeline has specific limitations: 1) HR query and
+LR key features are not well aligned; 2) the similarity between query-key
+features is computed based on the fixed inner product form; 3) neighbor
+selection is coarsely operated on LR features, resulting in mosaic artifacts.
+These shortcomings make the existing methods along this pipeline primarily
+applicable to hierarchical network architectures with iterative features as
+guidance, and they are not readily extended to a broader range of structures,
+especially for direct high-ratio upsampling. To address these issues, we
+meticulously optimize every methodological design. Specifically, we first
+propose an explicitly controllable query-key feature alignment from both
+semantic-aware and detail-aware perspectives, and then construct a
+parameterized paired central difference convolution block for flexibly
+calculating the similarity between the well-aligned query-key features.
+Besides, we develop a fine-grained neighbor selection strategy on HR features,
+which is simple yet effective for alleviating mosaic artifacts. Based on these
+careful designs, we systematically construct a refreshed similarity-based
+feature upsampling framework named ReSFU. Extensive experiments substantiate
+that our proposed ReSFU is finely applicable to various types of architectures
+in a direct high-ratio upsampling manner, and consistently achieves
+satisfactory performance on different segmentation applications, showing
+superior generality and ease of deployment.
+
+
+ comment: Codes are available at https://github.com/zmhhmz/ReSFU +
+
+
+
+
+ + ☆ FedIA: Federated Medical Image Segmentation with Heterogeneous + Annotation Completeness MICCAI 2024 + + +
+ Federated learning has emerged as a compelling paradigm for medical image
+segmentation, particularly in light of increasing privacy concerns. However,
+most of the existing research relies on relatively stringent assumptions
+regarding the uniformity and completeness of annotations across clients. In
+contrast, this paper highlights a prevalent challenge in medical practice:
+incomplete annotations. Such annotations can introduce incorrectly labeled
+pixels, potentially undermining the performance of neural networks in
+supervised learning. To tackle this issue, we introduce a novel solution, named
+FedIA. Our insight is to conceptualize incomplete annotations as noisy data
+(i.e., low-quality data), with a focus on mitigating their adverse effects. We
+begin by evaluating the completeness of annotations at the client level using a
+designed indicator. Subsequently, we enhance the influence of clients with more
+comprehensive annotations and implement corrections for incomplete ones,
+thereby ensuring that models are trained on accurate data. Our method's
+effectiveness is validated through its superior performance on two extensively
+used medical image segmentation datasets, outperforming existing solutions. The
+code is available at https://github.com/HUSTxyy/FedIA.
+
+
+ comment: Early accepted by MICCAI 2024 +
+
+
+
+
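+ The aggregation idea above (weighting clients by how complete their
+annotations appear) can be sketched in a few lines. The snippet below is an
+illustrative, simplified take on completeness-weighted federated averaging; the
+completeness scores, parameter layout, and function name are assumptions, and
+the correction step for incomplete annotations is not modelled.
+
+import numpy as np
+
+def completeness_weighted_avg(client_weights, completeness_scores):
+    """client_weights: list of {param_name: np.ndarray}; scores: floats in (0, 1]."""
+    scores = np.asarray(completeness_scores, dtype=float)
+    mix = scores / scores.sum()  # clients with more complete labels weigh more
+    return {
+        name: sum(m * cw[name] for m, cw in zip(mix, client_weights))
+        for name in client_weights[0]
+    }
+
+clients = [{"conv1": np.full((3, 3), v)} for v in (1.0, 2.0, 3.0)]
+print(completeness_weighted_avg(clients, [0.9, 0.5, 0.2])["conv1"][0, 0])
+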
+ + ☆ Aligning Human Motion Generation with Human Perceptions + + +
+ Human motion generation is a critical task with a wide range of applications. +Achieving high realism in generated motions requires naturalness, smoothness, +and plausibility. Despite rapid advancements in the field, current generation +methods often fall short of these goals. Furthermore, existing evaluation +metrics typically rely on ground-truth-based errors, simple heuristics, or +distribution distances, which do not align well with human perceptions of +motion quality. In this work, we propose a data-driven approach to bridge this +gap by introducing a large-scale human perceptual evaluation dataset, +MotionPercept, and a human motion critic model, MotionCritic, that capture +human perceptual preferences. Our critic model offers a more accurate metric +for assessing motion quality and could be readily integrated into the motion +generation pipeline to enhance generation quality. Extensive experiments +demonstrate the effectiveness of our approach in both evaluating and improving +the quality of generated human motions by aligning with human perceptions. Code +and data are publicly available at https://motioncritic.github.io/. + +
+
+ comment: Project page: https://motioncritic.github.io/ +
+
+
+
+
+ + ☆ SOAF: Scene Occlusion-aware Neural Acoustic Field + + +
+ This paper tackles the problem of novel view audio-visual synthesis along an
+arbitrary trajectory in an indoor scene, given the audio-video recordings from
+other known trajectories of the scene. Existing methods often overlook the
+effect of room geometry, particularly wall occlusion, on sound propagation,
+making them less accurate in multi-room environments. In this work, we propose
+a new approach called Scene Occlusion-aware Acoustic Field (SOAF) for accurate
+sound generation. Our approach derives a prior for the sound energy field using
+distance-aware parametric sound-propagation modelling and then transforms it
+based on scene transmittance learned from the input video. We extract features
+from the local acoustic field centred around the receiver using a Fibonacci
+Sphere to generate binaural audio for novel views with a direction-aware
+attention mechanism. Extensive experiments on the real dataset RWAVS and the
+synthetic dataset SoundSpaces demonstrate that our method outperforms previous
+state-of-the-art techniques in audio generation. Project page:
+https://github.com/huiyu-gao/SOAF/.
+
+
+
+
+
+ + ☆ Federated Distillation for Medical Image Classification: Towards + Trustworthy Computer-Aided Diagnosis + + +
+ Medical image classification plays a crucial role in computer-aided clinical +diagnosis. While deep learning techniques have significantly enhanced +efficiency and reduced costs, the privacy-sensitive nature of medical imaging +data complicates centralized storage and model training. Furthermore, +low-resource healthcare organizations face challenges related to communication +overhead and efficiency due to increasing data and model scales. This paper +proposes a novel privacy-preserving medical image classification framework +based on federated learning to address these issues, named FedMIC. The +framework enables healthcare organizations to learn from both global and local +knowledge, enhancing local representation of private data despite statistical +heterogeneity. It provides customized models for organizations with diverse +data distributions while minimizing communication overhead and improving +efficiency without compromising performance. Our FedMIC enhances robustness and +practical applicability under resource-constrained conditions. We demonstrate +FedMIC's effectiveness using four public medical image datasets for classical +medical image classification tasks. + +
+
+ comment: work in progress. arXiv admin note: text overlap with + arXiv:2401.01493 +
+
+
+
+
+ + ☆ Parameter-Selective Continual Test-Time Adaptation + + +
+ Continual Test-Time Adaptation (CTTA) aims to adapt a pretrained model to
+ever-changing environments during test time under continuous domain shifts.
+Most existing CTTA approaches are based on the Mean Teacher (MT) structure,
+which contains a student and a teacher model, where the student is updated
+using the pseudo-labels from the teacher model, and the teacher is then updated
+by an exponential moving average strategy. However, these methods update the MT
+model indiscriminately on all parameters of the model. That is, some critical
+parameters that carry knowledge shared across different domains may be erased,
+intensifying error accumulation and catastrophic forgetting. In this paper, we
+introduce the Parameter-Selective Mean Teacher (PSMT) method, which is capable
+of effectively updating the critical parameters within the MT network under
+domain shifts. First, we introduce a selective distillation mechanism in the
+student model, which utilizes past knowledge to regularize novel knowledge,
+thereby mitigating the impact of error accumulation. Second, to avoid
+catastrophic forgetting, in the teacher model, we create a mask through Fisher
+information to selectively update parameters via exponential moving average,
+with preservation measures applied to crucial parameters. Extensive
+experimental results verify that PSMT outperforms state-of-the-art methods
+across multiple benchmark datasets. Our code is available at
+https://github.com/JiaxuTian/PSMT.
+
+
+ comment: 17 pages, 4 figures
+
+
+
+
+
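+ The Fisher-masked teacher update described above can be illustrated with a
+short sketch: parameters whose (approximate) Fisher information is highest are
+treated as critical and left untouched, while the rest follow the usual
+exponential moving average. The keep ratio, thresholding rule, and names below
+are assumptions for illustration rather than the PSMT implementation.
+
+import numpy as np
+
+def fisher_mask(fisher, keep_ratio=0.2):
+    """Mark the top `keep_ratio` highest-Fisher entries as protected (True)."""
+    flat = fisher.ravel()
+    k = max(1, int(keep_ratio * flat.size))
+    thresh = np.partition(flat, -k)[-k]
+    return fisher >= thresh
+
+def selective_ema_update(teacher, student, fisher, alpha=0.999, keep_ratio=0.2):
+    protected = fisher_mask(fisher, keep_ratio)
+    ema = alpha * teacher + (1.0 - alpha) * student
+    return np.where(protected, teacher, ema)  # protected parameters stay fixed
+
+t, s = np.random.randn(4, 4), np.random.randn(4, 4)
+f = np.abs(np.random.randn(4, 4))  # stand-in Fisher information estimate
+print(selective_ema_update(t, s, f).shape)  # (4, 4)
+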
+ + ☆ GlyphDraw2: Automatic Generation of Complex Glyph Posters with Diffusion + Models and Large Language Models + + +
+ Posters play a crucial role in marketing and advertising, contributing +significantly to industrial design by enhancing visual communication and brand +visibility. With recent advances in controllable text-to-image diffusion +models, more concise research is now focusing on rendering text within +synthetic images. Despite improvements in text rendering accuracy, the field of +end-to-end poster generation remains underexplored. This complex task involves +striking a balance between text rendering accuracy and automated layout to +produce high-resolution images with variable aspect ratios. To tackle this +challenge, we propose an end-to-end text rendering framework employing a triple +cross-attention mechanism rooted in align learning, designed to create precise +poster text within detailed contextual backgrounds. Additionally, we introduce +a high-resolution dataset that exceeds 1024 pixels in image resolution. Our +approach leverages the SDXL architecture. Extensive experiments validate the +ability of our method to generate poster images featuring intricate and +contextually rich backgrounds. Codes will be available at +https://github.com/OPPO-Mente-Lab/GlyphDraw2. + +
+
+
+
+
+ + ☆ EvolBA: Evolutionary Boundary Attack under Hard-label Black Box + condition + + +
+ Research has shown that deep neural networks (DNNs) have vulnerabilities that
+can lead to the misrecognition of Adversarial Examples (AEs) with specifically
+designed perturbations. Various adversarial attack methods have been proposed
+to detect vulnerabilities under hard-label black box (HL-BB) conditions in the
+absence of loss gradients and confidence scores. However, these methods tend to
+converge to local optima because they search only local regions of the search
+space. Therefore, this study proposes an adversarial attack method named EvolBA
+to generate AEs using the Covariance Matrix Adaptation Evolution Strategy
+(CMA-ES) under the HL-BB condition, where only the class label predicted by the
+target DNN model is available. Inspired by formula-driven supervised learning,
+the proposed method introduces domain-independent operators for the
+initialization process and a jump operation that enhances search exploration.
+Experimental results confirmed that the proposed method can find AEs with
+smaller perturbations than previous methods on images where those methods have
+difficulty.
+
+
+
+
+
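+ To give a feel for the hard-label black-box setting above, the toy sketch
+below runs a simplified (1+lambda) evolution strategy that only observes the
+predicted class and gradually shrinks a perturbation while keeping the
+prediction flipped. It is a stand-in for the CMA-ES search the abstract refers
+to; the dummy model, step-size schedule, and initialization are assumptions,
+and the formula-driven operators and jump step are not reproduced.
+
+import numpy as np
+
+def toy_model(x):
+    return int(x.sum() > 0)  # stand-in hard-label black box
+
+def hard_label_attack(x, target_label, steps=200, pop=8, sigma=0.5, seed=0):
+    rng = np.random.default_rng(seed)
+    best = rng.normal(0.0, 2.0, size=x.shape)      # initial adversarial offset
+    while toy_model(x + best) != target_label:     # resample until it flips
+        best = rng.normal(0.0, 2.0, size=x.shape)
+    for _ in range(steps):
+        candidates = best + sigma * rng.normal(size=(pop,) + x.shape)
+        valid = [c for c in candidates if toy_model(x + c) == target_label]
+        if valid:
+            smallest = min(valid, key=np.linalg.norm)
+            if np.linalg.norm(smallest) < np.linalg.norm(best):
+                best = smallest
+        sigma *= 0.97                              # simple step-size decay
+    return best
+
+x = np.full(16, -0.1)
+delta = hard_label_attack(x, target_label=1)
+print(toy_model(x + delta), round(float(np.linalg.norm(delta)), 3))
+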
+ + ☆ Sign Language Recognition Based On Facial Expression and Hand Skeleton + + +
+ Sign language is a visual language used by the deaf and hard-of-hearing
+community to communicate. However, for most recognition methods based on
+monocular cameras, the recognition accuracy is low and the robustness is poor.
+Even when a method performs well on some data, it may perform poorly on other
+data with different interference because it cannot extract effective features.
+To solve these problems, we propose a sign language recognition network that
+integrates skeleton features of the hands and facial expression. Specifically,
+we propose a hand skeleton feature extraction method based on coordinate
+transformation to describe the shape of the hand more accurately. Moreover, by
+incorporating facial expression information, the accuracy and robustness of
+sign language recognition are ultimately improved, as verified on A Dataset for
+Argentinian Sign Language and SEU's Chinese Sign Language Recognition Database
+(SEUCSLRD).
+
+
+ comment: 2023 38th Youth Academic Annual Conference of Chinese Association of + Automation (YAC) +
+
+
+
+
+ + ☆ LaMoD: Latent Motion Diffusion Model For Myocardial Strain Generation + + +
+ Motion and deformation analysis of cardiac magnetic resonance (CMR) imaging
+videos is crucial for assessing myocardial strain of patients with abnormal
+heart functions. Recent advances in deep learning-based image registration
+algorithms have shown promising results in predicting motion fields from
+routinely acquired CMR sequences. However, their accuracy often diminishes in
+regions with subtle appearance changes, with errors propagating over time.
+Advanced imaging techniques, such as displacement encoding with stimulated
+echoes (DENSE) CMR, offer highly accurate and reproducible motion data but
+require additional image acquisition, which poses challenges in busy clinical
+workflows. In this paper, we introduce a novel Latent Motion Diffusion model
+(LaMoD) to predict highly accurate DENSE motions from standard CMR videos. More
+specifically, our method first employs an encoder from a pre-trained
+registration network that learns latent motion features (also considered as
+deformation-based shape features) from image sequences. Supervised by the
+ground-truth motion provided by DENSE, LaMoD then leverages a probabilistic
+latent diffusion model to reconstruct accurate motion from these extracted
+features. Experimental results demonstrate that our proposed method, LaMoD,
+significantly improves the accuracy of motion analysis in standard CMR images,
+hence improving myocardial strain analysis in clinical settings for cardiac
+patients. Our code will be made publicly available upon acceptance.
+
+
+
+
+
+ + ☆ MTMamba: Enhancing Multi-Task Dense Scene Understanding by Mamba-Based + Decoders + + +
+ Multi-task dense scene understanding, which learns a model for multiple dense +prediction tasks, has a wide range of application scenarios. Modeling +long-range dependency and enhancing cross-task interactions are crucial to +multi-task dense prediction. In this paper, we propose MTMamba, a novel +Mamba-based architecture for multi-task scene understanding. It contains two +types of core blocks: self-task Mamba (STM) block and cross-task Mamba (CTM) +block. STM handles long-range dependency by leveraging Mamba, while CTM +explicitly models task interactions to facilitate information exchange across +tasks. Experiments on NYUDv2 and PASCAL-Context datasets demonstrate the +superior performance of MTMamba over Transformer-based and CNN-based methods. +Notably, on the PASCAL-Context dataset, MTMamba achieves improvements of +2.08, ++5.01, and +4.90 over the previous best method in the tasks of semantic +segmentation, human parsing, and object boundary detection, respectively. The +code is available at \url{https://github.com/EnVision-Research/MTMamba}. + +
+
+
+
+
+ + ☆ Detecting Driver Fatigue With Eye Blink Behavior + + +
+ Traffic accidents, causing millions of deaths and billions of dollars in
+economic losses each year globally, have become a significant issue. One of the
+main causes of these accidents is drivers being sleepy or fatigued. Recently,
+various studies have focused on detecting drivers' sleep/wake states using
+camera-based solutions that do not require physical contact with the driver,
+thereby enhancing ease of use. In this study, besides eye blink frequency, a
+driver-adaptive eye blink behavior feature set has been evaluated to detect
+fatigue status. The results show that eye blink behavior carries useful
+information for fatigue detection. The developed image-based system provides a
+solution that can work adaptively to the physical characteristics of the
+drivers and their positions in the vehicle.
+
+
+ comment: 9 pages, 4 figures, 3 tables
+
+
+
+
+
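+ The blink-behavior features mentioned above can be illustrated with a short
+sketch that turns a per-frame eye-openness signal (for example an eye aspect
+ratio) into blink frequency and duration statistics. The threshold, frame rate,
+and feature names are assumptions for illustration; the study's driver-adaptive
+normalization is not reproduced here.
+
+import numpy as np
+
+def blink_features(eye_openness, fps=30.0, closed_thresh=0.2):
+    closed = np.asarray(eye_openness) < closed_thresh
+    # Blink segments are runs of consecutive "closed" frames.
+    edges = np.diff(closed.astype(int))
+    starts = np.where(edges == 1)[0] + 1
+    ends = np.where(edges == -1)[0] + 1
+    if closed[0]:
+        starts = np.insert(starts, 0, 0)
+    if closed[-1]:
+        ends = np.append(ends, len(closed))
+    durations = (ends - starts) / fps
+    minutes = len(closed) / fps / 60.0
+    return {
+        "blinks_per_minute": len(durations) / minutes,
+        "mean_blink_duration_s": float(durations.mean()) if len(durations) else 0.0,
+    }
+
+signal = np.clip(np.sin(np.linspace(0, 20 * np.pi, 1800)) + 0.5, 0, 1)
+print(blink_features(signal))  # roughly ten blinks over this one-minute toy signal
+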
+ + ☆ Multi-Modal Video Dialog State Tracking in the Wild ECCV 2024 + + +
+ We present MST-MIXER, a novel video dialog model operating over a generic
+multi-modal state tracking scheme. Current models that claim to perform
+multi-modal state tracking fall short in two major aspects: (1) they either
+track only one modality (mostly the visual input), or (2) they target synthetic
+datasets that do not reflect the complexity of real-world, in-the-wild
+scenarios. Our model addresses these two limitations in an attempt to close
+this crucial research gap. Specifically, MST-MIXER first tracks the most
+important constituents of each input modality. Then, it predicts the missing
+underlying structure of the selected constituents of each modality by learning
+local latent graphs using a novel multi-modal graph structure learning method.
+Subsequently, the learned local graphs and features are parsed together to form
+a global graph operating on the mix of all modalities, which further refines
+its structure and node embeddings. Finally, the fine-grained graph node
+features are used to enhance the hidden states of the backbone Vision-Language
+Model (VLM). MST-MIXER achieves new state-of-the-art results on five
+challenging benchmarks.
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Research on Reliable and Safe Occupancy Grid Prediction in Underground + Parking Lots + + +
+ Against the backdrop of advancing science and technology, autonomous vehicle
+technology has emerged as a focal point of intense scrutiny within the academic
+community. Nevertheless, the challenge persists in guaranteeing the safety and
+reliability of this technology when navigating intricate scenarios. While a
+substantial portion of autonomous driving research is dedicated to testing in
+open-air environments, such as urban roads and highways, where the myriad
+variables at play are meticulously examined, enclosed indoor spaces like
+underground parking lots have, to a significant extent, been overlooked in the
+scholarly discourse. This discrepancy highlights a gap in understanding the
+unique challenges these confined settings pose for autonomous navigation
+systems.
+ This study tackles indoor autonomous driving, particularly in overlooked
+spaces like underground parking lots. Using CARLA's simulation platform, a
+realistic parking model is created for data gathering. An occupancy grid
+network then processes this data to predict vehicle paths and obstacles,
+enhancing the system's perception in complex indoor environments. Ultimately,
+this strategy improves safety in autonomous parking operations. The paper
+meticulously evaluates the model's predictive capabilities, validating its
+efficacy in the context of underground parking. Our findings confirm that the
+proposed strategy successfully enhances autonomous vehicle performance in these
+complex indoor settings. It equips autonomous systems with improved adaptation
+to underground lots, reinforcing safety measures and dependability. This work
+paves the way for future advancements and applications by addressing the
+research shortfall concerning indoor parking environments, serving as a pivotal
+reference point.
+
+
+ comment: 15 pages, 19 figures +
+
+
+
+
+ + ☆ Structure-Aware Consensus Network on Graphs with Few Labeled Nodes + + +
+ Graph node classification with few labeled nodes presents significant
+challenges due to limited supervision. Conventional methods often exploit the
+graph in a transductive learning manner. They fail to effectively utilize the
+abundant unlabeled data and the structural information inherent in graphs. To
+address these issues, we introduce a Structure-Aware Consensus Network (SACN)
+from three perspectives. Firstly, SACN leverages a novel structure-aware
+consensus learning strategy between two strongly augmented views. The proposed
+strategy can fully exploit the potentially useful information of the unlabeled
+nodes and the structural information of the entire graph. Secondly, SACN
+uniquely integrates the graph's structural information to achieve
+strong-to-strong consensus learning, improving the utilization of unlabeled
+data while maintaining multiview learning. Thirdly, unlike two-branch graph
+neural network-based methods, SACN is designed for multiview feature learning
+within a single-branch architecture. Furthermore, a class-aware pseudolabel
+selection strategy helps address class imbalance and achieve effective
+weak-to-strong supervision. Extensive experiments on three benchmark datasets
+demonstrate SACN's superior performance in node classification tasks,
+particularly at very low label rates, outperforming state-of-the-art methods
+while maintaining computational simplicity. The source code is available at
+https://github.com/kunzhan/SACN.
+
+
+ comment: under review +
+
+
+
+
+ + ☆ Virtually Objective Quantification of in vitro Wound Healing Scratch + Assays with the Segment Anything Model + + +
+ The in vitro scratch assay is a widely used assay in cell biology to assess +the rate of wound closure related to a variety of therapeutic interventions. +While manual measurement is subjective and vulnerable to intra- and +interobserver variability, computer-based tools are theoretically objective, +but in practice often contain parameters which are manually adjusted +(individually per image or data set) and thereby provide a source for +subjectivity. Modern deep learning approaches typically require large annotated +training data which complicates instant applicability. In this paper, we make +use of the segment anything model, a deep foundation model based on interactive +point-prompts, which enables class-agnostic segmentation without tuning the +network's parameters based on domain specific training data. The proposed +method clearly outperformed a semi-objective baseline method that required +manual inspection and, if necessary, adjustment of parameters per image. Even +though the point prompts of the proposed approach are theoretically also a +source for subjectivity, results attested very low intra- and interobserver +variability, even compared to manual segmentation of domain experts. + +
+
+
+
+
+ + ☆ Occlusion-Aware Seamless Segmentation ECCV 2024 + + +
+ Panoramic images can broaden the Field of View (FoV), occlusion-aware +prediction can deepen the understanding of the scene, and domain adaptation can +transfer across viewing domains. In this work, we introduce a novel task, +Occlusion-Aware Seamless Segmentation (OASS), which simultaneously tackles all +these three challenges. For benchmarking OASS, we establish a new +human-annotated dataset for Blending Panoramic Amodal Seamless Segmentation, +i.e., BlendPASS. Besides, we propose the first solution UnmaskFormer, aiming at +unmasking the narrow FoV, occlusions, and domain gaps all at once. +Specifically, UnmaskFormer includes the crucial designs of Unmasking Attention +(UA) and Amodal-oriented Mix (AoMix). Our method achieves state-of-the-art +performance on the BlendPASS dataset, reaching a remarkable mAPQ of 26.58% and +mIoU of 43.66%. On public panoramic semantic segmentation datasets, i.e., +SynPASS and DensePASS, our method outperforms previous methods and obtains +45.34% and 48.08% in mIoU, respectively. The fresh BlendPASS dataset and our +source code will be made publicly available at +https://github.com/yihong-97/OASS. + +
+
+ comment: Accepted to ECCV 2024. The fresh dataset and the source code will be + made publicly available at https://github.com/yihong-97/OASS +
+
+
+
+
+ + ☆ BeNeRF: Neural Radiance Fields from a Single Blurry Image and Event + Stream ECCV 2024 + + +
+ Neural implicit representation of visual scenes has attracted a lot of +attention in recent research of computer vision and graphics. Most prior +methods focus on how to reconstruct 3D scene representation from a set of +images. In this work, we demonstrate the possibility to recover the neural +radiance fields (NeRF) from a single blurry image and its corresponding event +stream. We model the camera motion with a cubic B-Spline in SE(3) space. Both +the blurry image and the brightness change within a time interval, can then be +synthesized from the 3D scene representation given the 6-DoF poses interpolated +from the cubic B-Spline. Our method can jointly learn both the implicit neural +scene representation and recover the camera motion by minimizing the +differences between the synthesized data and the real measurements without +pre-computed camera poses from COLMAP. We evaluate the proposed method with +both synthetic and real datasets. The experimental results demonstrate that we +are able to render view-consistent latent sharp images from the learned NeRF +and bring a blurry image alive in high quality. Code and data are available at +https://github.com/WU-CVGL/BeNeRF. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ RETINA: a hardware-in-the-loop optical facility with reduced optical + aberrations + + +
+ The increasing interest in spacecraft autonomy and the complex tasks to be +accomplished by the spacecraft raise the need for a trustworthy approach to +perform Verification & Validation of Guidance, Navigation, and Control +algorithms. In the context of autonomous operations, vision-based navigation +algorithms have established themselves as effective solutions to determine the +spacecraft state in orbit with low-cost and versatile sensors. Nevertheless, +detailed testing must be performed on ground to understand the algorithm's +robustness and performance on flight hardware. Given the impossibility of +testing directly on orbit these algorithms, a dedicated simulation framework +must be developed to emulate the orbital environment in a laboratory setup. +This paper presents the design of a low-aberration optical facility called +RETINA to perform this task. RETINA is designed to accommodate cameras with +different characteristics (e.g., sensor size and focal length) while ensuring +the correct stimulation of the camera detector. A preliminary design is +performed to identify the range of possible components to be used in the +facility according to the facility requirements. Then, a detailed optical +design is performed in Zemax OpticStudio to optimize the number and +characteristics of the lenses composing the facility's optical systems. The +final design is compared against the preliminary design to show the superiority +of the optical performance achieved with this approach. This work presents also +a calibration procedure to estimate the misalignment and the centering errors +in the facility. These estimated parameters are used in a dedicated +compensation algorithm, enabling the stimulation of the camera at tens of +arcseconds of precision. Finally, two different applications are presented to +show the versatility of RETINA in accommodating different cameras and in +simulating different mission scenarios. + +
+
+
+
+
+ + ☆ WildAvatar: Web-scale In-the-wild Video Dataset for 3D Avatar Creation + + +
+ Existing human datasets for avatar creation are typically limited to +laboratory environments, wherein high-quality annotations (e.g., SMPL +estimation from 3D scans or multi-view images) can be ideally provided. +However, their annotating requirements are impractical for real-world images or +videos, posing challenges toward real-world applications on current avatar +creation methods. To this end, we propose the WildAvatar dataset, a web-scale +in-the-wild human avatar creation dataset extracted from YouTube, with +$10,000+$ different human subjects and scenes. WildAvatar is at least +$10\times$ richer than previous datasets for 3D human avatar creation. We +evaluate several state-of-the-art avatar creation methods on our dataset, +highlighting the unexplored challenges in real-world applications on avatar +creation. We also demonstrate the potential for generalizability of avatar +creation methods, when provided with data at scale. We will publicly release +our data source links and annotations, to push forward 3D human avatar creation +and other related fields for real-world applications. + +
+
+
+
+
+ + ☆ SparseSSP: 3D Subcellular Structure Prediction from Sparse-View + Transmitted Light Images ECCV 2024 + + +
+ Traditional fluorescence staining is phototoxic to live cells, slow, and +expensive; thus, the subcellular structure prediction (SSP) from transmitted +light (TL) images is emerging as a label-free, faster, low-cost alternative. +However, existing approaches utilize 3D networks for one-to-one voxel level +dense prediction, which necessitates a frequent and time-consuming Z-axis +imaging process. Moreover, 3D convolutions inevitably lead to significant +computation and GPU memory overhead. Therefore, we propose an efficient +framework, SparseSSP, predicting fluorescent intensities within the target +voxel grid in an efficient paradigm instead of relying entirely on 3D +topologies. In particular, SparseSSP makes two pivotal improvements to prior +works. First, SparseSSP introduces a one-to-many voxel mapping paradigm, which +permits the sparse TL slices to reconstruct the subcellular structure. +Secondly, we propose a hybrid dimensions topology, which folds the Z-axis +information into channel features, enabling the 2D network layers to tackle SSP +under low computational cost. We conduct extensive experiments to validate the +effectiveness and advantages of SparseSSP on diverse sparse imaging ratios, and +our approach achieves a leading performance compared to pure 3D topologies. +SparseSSP reduces imaging frequencies compared to previous dense-view SSP +(i.e., the number of imaging is reduced up to 87.5% at most), which is +significant in visualizing rapid biological dynamics on low-cost devices and +samples. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ UltraPixel: Advancing Ultra-High-Resolution Image Synthesis to New Peaks + + +
+ Ultra-high-resolution image generation poses great challenges, such as +increased semantic planning complexity and detail synthesis difficulties, +alongside substantial training resource demands. We present UltraPixel, a novel +architecture utilizing cascade diffusion models to generate high-quality images +at multiple resolutions (\textit{e.g.}, 1K to 6K) within a single model, while +maintaining computational efficiency. UltraPixel leverages semantics-rich +representations of lower-resolution images in the later denoising stage to +guide the whole generation of highly detailed high-resolution images, +significantly reducing complexity. Furthermore, we introduce implicit neural +representations for continuous upsampling and scale-aware normalization layers +adaptable to various resolutions. Notably, both low- and high-resolution +processes are performed in the most compact space, sharing the majority of +parameters with less than 3$\%$ additional parameters for high-resolution +outputs, largely enhancing training and inference efficiency. Our model +achieves fast training with reduced data requirements, producing +photo-realistic high-resolution images and demonstrating state-of-the-art +performance in extensive experiments. + +
+
+
+
+
+ + ☆ FineCLIPER: Multi-modal Fine-grained CLIP for Dynamic Facial Expression + Recognition with AdaptERs + + +
+ Dynamic Facial Expression Recognition (DFER) is crucial for understanding +human behavior. However, current methods exhibit limited performance mainly due +to the scarcity of high-quality data, the insufficient utilization of facial +dynamics, and the ambiguity of expression semantics, etc. To this end, we +propose a novel framework, named Multi-modal Fine-grained CLIP for Dynamic +Facial Expression Recognition with AdaptERs (FineCLIPER), incorporating the +following novel designs: 1) To better distinguish between similar facial +expressions, we extend the class labels to textual descriptions from both +positive and negative aspects, and obtain supervision by calculating the +cross-modal similarity based on the CLIP model; 2) Our FineCLIPER adopts a +hierarchical manner to effectively mine useful cues from DFE videos. +Specifically, besides directly embedding video frames as input (low semantic +level), we propose to extract the face segmentation masks and landmarks based +on each frame (middle semantic level) and utilize the Multi-modal Large +Language Model (MLLM) to further generate detailed descriptions of facial +changes across frames with designed prompts (high semantic level). +Additionally, we also adopt Parameter-Efficient Fine-Tuning (PEFT) to enable +efficient adaptation of large pre-trained models (i.e., CLIP) for this task. +Our FineCLIPER achieves SOTA performance on the DFEW, FERV39k, and MAFW +datasets in both supervised and zero-shot settings with few tunable parameters. +Analysis and ablation studies further validate its effectiveness. + +
+
+ comment: Project Page: https://haroldchen19.github.io/FineCLIPER-Page/ +
+
+
+
+
+ + ☆ VRBiom: A New Periocular Dataset for Biometric Applications of HMD + + +
+ With advancements in hardware, high-quality HMD devices are being developed +by numerous companies, driving increased consumer interest in AR, VR, and MR +applications. In this work, we present a new dataset, called VRBiom, of +periocular videos acquired using a Virtual Reality headset. The VRBiom, +targeted at biometric applications, consists of 900 short videos acquired from +25 individuals recorded in the NIR spectrum. These 10s long videos have been +captured using the internal tracking cameras of Meta Quest Pro at 72 FPS. To +encompass real-world variations, the dataset includes recordings under three +gaze conditions: steady, moving, and partially closed eyes. We have also +ensured an equal split of recordings without and with glasses to facilitate the +analysis of eye-wear. These videos, characterized by non-frontal views of the +eye and relatively low spatial resolutions (400 x 400), can be instrumental in +advancing state-of-the-art research across various biometric applications. The +VRBiom dataset can be utilized to evaluate, train, or adapt models for +biometric use-cases such as iris and/or periocular recognition and associated +sub-tasks such as detection and semantic segmentation. + In addition to data from real individuals, we have included around 1100 PA +constructed from 92 PA instruments. These PAIs fall into six categories +constructed through combinations of print attacks (real and synthetic +identities), fake 3D eyeballs, plastic eyes, and various types of masks and +mannequins. These PA videos, combined with genuine (bona-fide) data, can be +utilized to address concerns related to spoofing, which is a significant threat +if these devices are to be used for authentication. + The VRBiom dataset is publicly available for research purposes related to +biometric applications only. + +
+
+
+
+
+ + ☆ Hybrid Feature Collaborative Reconstruction Network for Few-Shot + Fine-Grained Image Classification + + +
+ Our research focuses on few-shot fine-grained image classification, which
+faces two major challenges: appearance similarity of fine-grained objects and
+the limited number of samples. To preserve the appearance details of images,
+traditional feature reconstruction networks usually enhance the representation
+ability of key features by spatial feature reconstruction and minimizing the
+reconstruction error. However, we find that relying solely on a single type of
+feature is insufficient for accurately capturing inter-class differences of
+fine-grained objects in scenarios with limited samples. In contrast, the
+introduction of channel features provides additional information dimensions,
+aiding in better understanding and distinguishing the inter-class differences
+of fine-grained objects. Therefore, in this paper, we design a new Hybrid
+Feature Collaborative Reconstruction Network (HFCR-Net) for few-shot
+fine-grained image classification, which includes a Hybrid Feature Fusion
+Process (HFFP) and a Hybrid Feature Reconstruction Process (HFRP). In HFFP, we
+fuse the channel features and the spatial features. Through dynamic weight
+adjustment, we aggregate the spatial dependencies between any two positions and
+the correlations between different channels of each image to increase the
+inter-class differences. Additionally, we introduce the reconstruction of the
+channel dimension in HFRP. Through the collaborative reconstruction of the
+channel dimension and the spatial dimension, the inter-class differences are
+further increased in the process of support-to-query reconstruction, while the
+intra-class differences are reduced in the process of query-to-support
+reconstruction. Ultimately, our extensive experiments on three widely used
+fine-grained datasets demonstrate the effectiveness and superiority of our
+approach.
+
+
+
+
+
+ + ☆ HRSAM: Efficiently Segment Anything in High-Resolution Images + + +
+ The Segment Anything Model (SAM) has significantly advanced interactive +segmentation but struggles with high-resolution images crucial for +high-precision segmentation. This is primarily due to the quadratic space +complexity of SAM-implemented attention and the length extrapolation issue in +common global attention. This study proposes HRSAM that integrates Flash +Attention and incorporates Plain, Shifted and newly proposed Cycle-scan Window +(PSCWin) attention to address these issues. The shifted window attention is +redesigned with padding to maintain consistent window sizes, enabling effective +length extrapolation. The cycle-scan window attention adopts the recently +developed State Space Models (SSMs) to ensure global information exchange with +minimal computational overhead. Such window-based attention allows HRSAM to +perform effective attention computations on scaled input images while +maintaining low latency. Moreover, we further propose HRSAM++ that additionally +employs a multi-scale strategy to enhance HRSAM's performance. The experiments +on the high-precision segmentation datasets HQSeg44K and DAVIS show that +high-resolution inputs enable the SAM-distilled HRSAM models to outperform the +teacher model while maintaining lower latency. Compared to the SOTAs, HRSAM +achieves a 1.56 improvement in interactive segmentation's NoC95 metric with +only 31% of the latency. HRSAM++ further enhances the performance, achieving a +1.63 improvement in NoC95 with just 38% of the latency. + +
+
+
+
+
+ + ☆ Joint-Dataset Learning and Cross-Consistent Regularization for + Text-to-Motion Retrieval + + +
+ Pose-estimation methods enable extracting human motion from common videos in +the structured form of 3D skeleton sequences. Despite great application +opportunities, effective content-based access to such spatio-temporal motion +data is a challenging problem. In this paper, we focus on the recently +introduced text-motion retrieval tasks, which aim to search for database +motions that are the most relevant to a specified natural-language textual +description (text-to-motion) and vice-versa (motion-to-text). Despite recent +efforts to explore these promising avenues, a primary challenge remains the +insufficient data available to train robust text-motion models effectively. To +address this issue, we propose to investigate joint-dataset learning - where we +train on multiple text-motion datasets simultaneously - together with the +introduction of a Cross-Consistent Contrastive Loss function (CCCL), which +regularizes the learned text-motion common space by imposing uni-modal +constraints that augment the representation ability of the trained network. To +learn a proper motion representation, we also introduce a transformer-based +motion encoder, called MoT++, which employs spatio-temporal attention to +process sequences of skeleton data. We demonstrate the benefits of the proposed +approaches on the widely-used KIT Motion-Language and HumanML3D datasets. We +perform detailed experimentation on joint-dataset learning and cross-dataset +scenarios, showing the effectiveness of each introduced module in a carefully +conducted ablation study and, in turn, pointing out the limitations of +state-of-the-art methods. + +
+
+
+
+
+ + ☆ DM3D: Distortion-Minimized Weight Pruning for Lossless 3D Object + Detection + + +
+ Applying deep neural networks to 3D point cloud processing has attracted +increasing attention due to its advanced performance in many areas, such as +AR/VR, autonomous driving, and robotics. However, as neural network models and +3D point clouds expand in size, it becomes a crucial challenge to reduce the +computational and memory overhead to meet latency and energy constraints in +real-world applications. Although existing approaches have proposed to reduce +both computational cost and memory footprint, most of them only address the +spatial redundancy in inputs, i.e. removing the redundancy of background points +in 3D data. In this paper, we propose a novel post-training weight pruning +scheme for 3D object detection that is (1) orthogonal to all existing point +cloud sparsifying methods, which determines redundant parameters in the +pretrained model that lead to minimal distortion in both locality and +confidence (detection distortion); and (2) a universal plug-and-play pruning +framework that works with arbitrary 3D detection model. This framework aims to +minimize detection distortion of network output to maximally maintain detection +precision, by identifying layer-wise sparsity based on second-order Taylor +approximation of the distortion. Albeit utilizing second-order information, we +introduced a lightweight scheme to efficiently acquire Hessian information, and +subsequently perform dynamic programming to solve the layer-wise sparsity. +Extensive experiments on KITTI, Nuscenes and ONCE datasets demonstrate that our +approach is able to maintain and even boost the detection precision on pruned +model under noticeable computation reduction (FLOPs). Noticeably, we achieve +over 3.89x, 3.72x FLOPs reduction on CenterPoint and PVRCNN model, +respectively, without mAP decrease, significantly improving the +state-of-the-art. + +
+
+
+
+
+ + ☆ MARLIN: A Cloud Integrated Robotic Solution to Support Intralogistics in + Retail + + +
+ In this paper, we present the service robot MARLIN and its integration with +the K4R platform, a cloud system for complex AI applications in retail. At its +core, this platform contains so-called semantic digital twins, a semantically +annotated representation of the retail store. MARLIN continuously exchanges +data with the K4R platform, improving the robot's capabilities in perception, +autonomous navigation, and task planning. We exploit these capabilities in a +retail intralogistics scenario, specifically by assisting store employees in +stocking shelves. We demonstrate that MARLIN is able to update the digital +representation of the retail store by detecting and classifying obstacles, +autonomously planning and executing replenishment missions, adapting to +unforeseen changes in the environment, and interacting with store employees. +Experiments are conducted in simulation, in a laboratory environment, and in a +real store. We also describe and evaluate a novel algorithm for autonomous +navigation of articulated tractor-trailer systems. The algorithm outperforms +the manufacturer's proprietary navigation approach and improves MARLIN's +navigation capabilities in confined spaces. + +
+
+
+
+
+ + ☆ Hierarchical Temporal Context Learning for Camera-based Semantic Scene + Completion ECCV 2024 + + +
+ Camera-based 3D semantic scene completion (SSC) is pivotal for predicting +complicated 3D layouts with limited 2D image observations. The existing +mainstream solutions generally leverage temporal information by roughly +stacking history frames to supplement the current frame, such straightforward +temporal modeling inevitably diminishes valid clues and increases learning +difficulty. To address this problem, we present HTCL, a novel Hierarchical +Temporal Context Learning paradigm for improving camera-based semantic scene +completion. The primary innovation of this work involves decomposing temporal +context learning into two hierarchical steps: (a) cross-frame affinity +measurement and (b) affinity-based dynamic refinement. Firstly, to separate +critical relevant context from redundant information, we introduce the pattern +affinity with scale-aware isolation and multiple independent learners for +fine-grained contextual correspondence modeling. Subsequently, to dynamically +compensate for incomplete observations, we adaptively refine the feature +sampling locations based on initially identified locations with high affinity +and their neighboring relevant regions. Our method ranks $1^{st}$ on the +SemanticKITTI benchmark and even surpasses LiDAR-based methods in terms of mIoU +on the OpenOccupancy benchmark. Our code is available on +https://github.com/Arlo0o/HTCL. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Label Anything: Multi-Class Few-Shot Semantic Segmentation with Visual + Prompts + + +
+ We present Label Anything, an innovative neural network architecture designed +for few-shot semantic segmentation (FSS) that demonstrates remarkable +generalizability across multiple classes with minimal examples required per +class. Diverging from traditional FSS methods that predominantly rely on masks +for annotating support images, Label Anything introduces varied visual prompts +-- points, bounding boxes, and masks -- thereby enhancing the framework's +versatility and adaptability. Unique to our approach, Label Anything is +engineered for end-to-end training across multi-class FSS scenarios, +efficiently learning from diverse support set configurations without +retraining. This approach enables a "universal" application to various FSS +challenges, ranging from $1$-way $1$-shot to complex $N$-way $K$-shot +configurations while remaining agnostic to the specific number of class +examples. This innovative training strategy reduces computational requirements +and substantially improves the model's adaptability and generalization across +diverse segmentation tasks. Our comprehensive experimental validation, +particularly achieving state-of-the-art results on the COCO-$20^i$ benchmark, +underscores Label Anything's robust generalization and flexibility. The source +code is publicly available at: https://github.com/pasqualedem/LabelAnything. + +
+
+
+
+
+ + ☆ LPViT: Low-Power Semi-structured Pruning for Vision Transformers + + +
+ Vision transformers have emerged as a promising alternative to convolutional +neural networks for various image analysis tasks, offering comparable or +superior performance. However, one significant drawback of ViTs is their +resource-intensive nature, leading to increased memory footprint, computation +complexity, and power consumption. To democratize this high-performance +technology and make it more environmentally friendly, it is essential to +compress ViT models, reducing their resource requirements while maintaining +high performance. In this paper, we introduce a new block-structured pruning to +address the resource-intensive issue for ViTs, offering a balanced trade-off +between accuracy and hardware acceleration. Unlike unstructured pruning or +channel-wise structured pruning, block pruning leverages the block-wise +structure of linear layers, resulting in more efficient matrix multiplications. +To optimize this pruning scheme, our paper proposes a novel hardware-aware +learning objective that simultaneously maximizes speedup and minimizes power +consumption during inference, tailored to the block sparsity structure. This +objective eliminates the need for empirical look-up tables and focuses solely +on reducing parametrized layer connections. Moreover, our paper provides a +lightweight algorithm to achieve post-training pruning for ViTs, utilizing +second-order Taylor approximation and empirical optimization to solve the +proposed hardware-aware objective. Extensive experiments on ImageNet are +conducted across various ViT architectures, including DeiT-B and DeiT-S, +demonstrating competitive performance with other pruning methods and achieving +a remarkable balance between accuracy preservation and power savings. +Especially, we achieve up to 3.93x and 1.79x speedups on dedicated hardware and +GPUs respectively for DeiT-B, and also observe an inference power reduction by +1.4x on real-world GPUs. + +
+
+
+
+
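+ The block-structured pruning discussed above is easy to sketch: a linear
+layer's weight matrix is tiled into fixed-size blocks, each block is scored,
+and the lowest-scoring blocks are zeroed so that whole sub-matrices drop out of
+the matrix multiplication. The block size, L2-norm scoring, and sparsity level
+below are illustrative assumptions; the hardware-aware objective and
+second-order Taylor scoring from the abstract are not shown.
+
+import numpy as np
+
+def block_prune(weight, block=(4, 4), sparsity=0.5):
+    h, w = weight.shape
+    bh, bw = block
+    assert h % bh == 0 and w % bw == 0, "weight must tile evenly into blocks"
+    blocks = weight.reshape(h // bh, bh, w // bw, bw)
+    scores = np.linalg.norm(blocks, axis=(1, 3))  # one score per block
+    k = int(sparsity * scores.size)
+    cutoff = np.partition(scores.ravel(), k)[k] if k else -np.inf
+    mask = (scores >= cutoff)[:, None, :, None]   # keep high-norm blocks
+    return (blocks * mask).reshape(h, w)
+
+pruned = block_prune(np.random.randn(16, 16))
+print(float((pruned == 0).mean()))  # close to the requested block sparsity
+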
+ + ☆ CountFormer: Multi-View Crowd Counting Transformer ECCV2024 + + +
+ Multi-view counting (MVC) methods have shown their superiority over
+single-view counterparts, particularly in situations characterized by heavy
+occlusion and severe perspective distortions. However, hand-crafted heuristic
+features and identical camera layout requirements in conventional MVC methods
+limit their applicability and scalability in real-world scenarios. In this
+work, we propose a concise 3D MVC framework called CountFormer to elevate
+multi-view image-level features to a scene-level volume representation and
+estimate the 3D density map based on the volume features. By incorporating a
+camera encoding strategy, CountFormer successfully embeds camera parameters
+into the volume query and image-level features, enabling it to handle various
+camera layouts with significant differences. Furthermore, we introduce a
+feature lifting module that capitalizes on the attention mechanism to transform
+image-level features into a 3D volume representation for each camera view.
+Subsequently, the multi-view volume aggregation module attentively aggregates
+various multi-view volumes to create a comprehensive scene-level volume
+representation, allowing CountFormer to handle images captured by arbitrary
+dynamic camera layouts. The proposed method performs favorably against the
+state-of-the-art approaches across various widely used datasets, demonstrating
+its greater suitability for real-world deployment compared to conventional MVC
+frameworks.
+
+
+ comment: Accepted By ECCV2024 +
+
+
+
+
+ + ☆ ScaleDreamer: Scalable Text-to-3D Synthesis with Asynchronous Score + Distillation ECCV 2024 + + +
+ By leveraging the text-to-image diffusion priors, score distillation can +synthesize 3D contents without paired text-3D training data. Instead of +spending hours of online optimization per text prompt, recent studies have been +focused on learning a text-to-3D generative network for amortizing multiple +text-3D relations, which can synthesize 3D contents in seconds. However, +existing score distillation methods are hard to scale up to a large amount of +text prompts due to the difficulties in aligning pretrained diffusion prior +with the distribution of rendered images from various text prompts. Current +state-of-the-arts such as Variational Score Distillation finetune the +pretrained diffusion model to minimize the noise prediction error so as to +align the distributions, which are however unstable to train and will impair +the model's comprehension capability to numerous text prompts. Based on the +observation that the diffusion models tend to have lower noise prediction +errors at earlier timesteps, we propose Asynchronous Score Distillation (ASD), +which minimizes the noise prediction error by shifting the diffusion timestep +to earlier ones. ASD is stable to train and can scale up to 100k prompts. It +reduces the noise prediction error without changing the weights of pre-trained +diffusion model, thus keeping its strong comprehension capability to prompts. +We conduct extensive experiments across different 2D diffusion models, +including Stable Diffusion and MVDream, and text-to-3D generators, including +Hyper-iNGP, 3DConv-Net and Triplane-Transformer. The results demonstrate ASD's +effectiveness in stable 3D generator training, high-quality 3D content +synthesis, and its superior prompt-consistency, especially under large prompt +corpus. + +
+
+ comment: Accepted by ECCV 2024. Code available at + https://github.com/theEricMa/ScaleDreamer +
+
+
+
+
+ + ☆ Camera-LiDAR Cross-modality Gait Recognition + + +
+ Gait recognition is a crucial biometric identification technique. +Camera-based gait recognition has been widely applied in both research and +industrial fields. LiDAR-based gait recognition has also begun to evolve most +recently, due to the provision of 3D structural information. However, in +certain applications, cameras fail to recognize persons, such as in low-light +environments and long-distance recognition scenarios, where LiDARs work well. +On the other hand, the deployment cost and complexity of LiDAR systems limit +its wider application. Therefore, it is essential to consider cross-modality +gait recognition between cameras and LiDARs for a broader range of +applications. In this work, we propose the first cross-modality gait +recognition framework between Camera and LiDAR, namely CL-Gait. It employs a +two-stream network for feature embedding of both modalities. This poses a +challenging recognition task due to the inherent matching between 3D and 2D +data, exhibiting significant modality discrepancy. To align the feature spaces +of the two modalities, i.e., camera silhouettes and LiDAR points, we propose a +contrastive pre-training strategy to mitigate modality discrepancy. To make up +for the absence of paired camera-LiDAR data for pre-training, we also introduce +a strategy for generating data on a large scale. This strategy utilizes +monocular depth estimated from single RGB images and virtual cameras to +generate pseudo point clouds for contrastive pre-training. Extensive +experiments show that the cross-modality gait recognition is very challenging +but still contains potential and feasibility with our proposed model and +pre-training strategy. To the best of our knowledge, this is the first work to +address cross-modality gait recognition. + +
+
+
+
+
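+ The pseudo point cloud generation mentioned above boils down to standard
+pinhole back-projection of a monocular depth map. The sketch below is a minimal
+version of that step; the intrinsics and depth values are made up for
+illustration, and the silhouette extraction and contrastive pre-training are
+not shown.
+
+import numpy as np
+
+def depth_to_points(depth, fx, fy, cx, cy):
+    """depth: (H, W) metric depth map; returns (H*W, 3) camera-frame points."""
+    h, w = depth.shape
+    u, v = np.meshgrid(np.arange(w), np.arange(h))
+    x = (u - cx) * depth / fx
+    y = (v - cy) * depth / fy
+    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)
+
+depth = np.random.uniform(1.0, 5.0, size=(120, 160))
+points = depth_to_points(depth, fx=200.0, fy=200.0, cx=80.0, cy=60.0)
+print(points.shape)  # (19200, 3) pseudo point cloud
+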
+ + ☆ TrAME: Trajectory-Anchored Multi-View Editing for Text-Guided 3D + Gaussian Splatting Manipulation + + +
+ Despite significant strides in the field of 3D scene editing, current methods
+encounter substantial challenges, particularly in preserving 3D consistency
+during the multi-view editing process. To tackle this challenge, we propose a
+progressive 3D editing strategy that ensures multi-view consistency via a
+Trajectory-Anchored Scheme (TAS) with a dual-branch editing mechanism.
+Specifically, TAS facilitates a tightly coupled iterative process between 2D
+view editing and 3D updating, preventing error accumulation arising from the
+text-to-image process. Additionally, we explore the relationship between
+optimization-based methods and reconstruction-based methods, offering a unified
+perspective for selecting superior design choices and supporting the rationale
+behind the designed TAS. We further present a tuning-free View-Consistent
+Attention Control (VCAC) module that leverages cross-view semantic and
+geometric references from the source branch to yield aligned views from the
+target branch during the editing of 2D views. To validate the effectiveness of
+our method, we analyze 2D examples to demonstrate the improved consistency with
+the VCAC module. Further extensive quantitative and qualitative results in
+text-guided 3D scene editing indicate that our method achieves superior editing
+quality compared to state-of-the-art methods. We will make the complete
+codebase publicly available following the conclusion of the double-blind review
+process.
+
+
+
+
+
+ + ☆ Multi-Grained Contrast for Data-Efficient Unsupervised Representation + Learning + + +
+ Existing contrastive learning methods mainly focus on single-grained
+representation learning, e.g., part-level, object-level or scene-level
+representations, thus inevitably neglecting the transferability of
+representations to other granularity levels. In this paper, we aim to learn
+multi-grained representations, which can effectively describe the image at
+various granularity levels, thus improving generalization on extensive
+downstream tasks. To this end, we propose a novel Multi-Grained Contrast
+method (MGC) for unsupervised representation learning. Specifically, we
+construct delicate multi-grained correspondences between positive views and
+then conduct multi-grained contrast over these correspondences to learn more
+general unsupervised representations.
+ Without pretraining on a large-scale dataset, our method significantly
+outperforms the existing state-of-the-art methods on extensive downstream
+tasks, including object detection, instance segmentation, scene parsing,
+semantic segmentation and keypoint detection. Moreover, experimental results
+confirm the data efficiency and excellent representation transferability of
+our method. The source code and trained weights are available at
+\url{https://github.com/visresearch/mgc}.
+
+
+
+
+
+
+ + ☆ SAVE: Segment Audio-Visual Easy way using Segment Anything Model + + +
+ The primary aim of Audio-Visual Segmentation (AVS) is to precisely identify +and locate auditory elements within visual scenes by accurately predicting +segmentation masks at the pixel level. Achieving this involves comprehensively +considering data and model aspects to address this task effectively. This study +presents a lightweight approach, SAVE, which efficiently adapts the pre-trained +segment anything model (SAM) to the AVS task. By incorporating an image encoder +adapter into the transformer blocks to better capture the distinct dataset +information and proposing a residual audio encoder adapter to encode the audio +features as a sparse prompt, our proposed model achieves effective audio-visual +fusion and interaction during the encoding stage. Our proposed method +accelerates the training and inference speed by reducing the input resolution +from 1024 to 256 pixels while achieving higher performance compared with the +previous SOTA. Extensive experimentation validates our approach, demonstrating +that our proposed model outperforms other SOTA methods significantly. Moreover, +leveraging the pre-trained model on synthetic data enhances performance on real +AVSBench data, achieving 84.59 mIoU on the S4 (V1S) subset and 70.28 mIoU on +the MS3 (V1M) set with only 256 pixels for input images. This increases up to +86.16 mIoU on the S4 (V1S) and 70.83 mIoU on the MS3 (V1M) with inputs of 1024 +pixels. + +
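+ The adapter idea described above can be sketched with a generic bottleneck
+adapter inserted into the frozen transformer blocks of the image encoder.
+This is an illustrative module only; SAVE's actual adapter layout,
+dimensions, and the residual audio-prompt adapter are not reproduced here.
+
+import torch.nn as nn
+
+class ResidualAdapter(nn.Module):
+    """Lightweight residual adapter added inside a frozen transformer block."""
+    def __init__(self, dim, bottleneck=64):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)
+        self.act = nn.GELU()
+        self.up = nn.Linear(bottleneck, dim)
+        nn.init.zeros_(self.up.weight)  # start as an identity mapping for stable tuning
+        nn.init.zeros_(self.up.bias)
+
+    def forward(self, x):
+        # Only the small adapter weights are trained; the backbone stays frozen.
+        return x + self.up(self.act(self.down(x)))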
+
+
+
+
+ + ☆ ViG-Bias: Visually Grounded Bias Discovery and Mitigation ECCV 2024 + + +
+ The proliferation of machine learning models in critical decision-making
+processes has underscored the need for bias discovery and mitigation
+strategies. Identifying the reasons behind a biased system is not
+straightforward, since on many occasions they are associated with hidden
+spurious correlations that are not easy to spot. Standard approaches rely on
+bias audits performed by analyzing model performance in pre-defined subgroups
+of data samples, usually characterized by common attributes like gender or
+ethnicity when it comes to people, or other specific attributes defining
+semantically coherent groups of images. However, it is not always possible to
+know a priori the specific attributes defining the failure modes of visual
+recognition systems. Recent approaches propose to discover these groups by
+leveraging large vision language models, which enable the extraction of
+cross-modal embeddings and the generation of textual descriptions to
+characterize the subgroups where a certain model is underperforming. In this
+work, we argue that incorporating visual explanations (e.g., heatmaps
+generated via GradCAM or other approaches) can boost the performance of such
+bias discovery and mitigation frameworks. To this end, we introduce Visually
+Grounded Bias Discovery and Mitigation (ViG-Bias), a simple yet effective
+technique that can be integrated into a variety of existing frameworks to
+improve both discovery and mitigation performance. Our comprehensive
+evaluation shows that incorporating visual explanations enhances existing
+techniques like DOMINO, FACTS and Bias-to-Text across several challenging
+datasets, including CelebA, Waterbirds, and NICO++.
+
+
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ AHMsys: An Automated HVAC Modeling System for BIM Project + + +
+ This paper presents a novel system, named AHMsys, designed to automate the
+process of generating 3D Heating, Ventilation, and Air Conditioning (HVAC)
+models from 2D Computer-Aided Design (CAD) drawings, a key component of
+Building Information Modeling (BIM). By automatically preprocessing and
+extracting essential HVAC object information and then creating detailed 3D
+models, our proposed AHMsys reduced the work schedule of the BIM process at
+Akila by 20 percent. This advancement highlights the essential impact of
+integrating AI technologies into managing the lifecycle of a digital
+representation of a building.
+
+
+
+
+
+
+ + ☆ SADL: An Effective In-Context Learning Method for Compositional Visual + QA + + +
+ Large vision-language models (LVLMs) offer a novel capability for performing +in-context learning (ICL) in Visual QA. When prompted with a few demonstrations +of image-question-answer triplets, LVLMs have demonstrated the ability to +discern underlying patterns and transfer this latent knowledge to answer new +questions about unseen images without the need for expensive supervised +fine-tuning. However, designing effective vision-language prompts, especially +for compositional questions, remains poorly understood. Adapting language-only +ICL techniques may not necessarily work because we need to bridge the +visual-linguistic semantic gap: Symbolic concepts must be grounded in visual +content, which does not share the syntactic linguistic structures. This paper +introduces SADL, a new visual-linguistic prompting framework for the task. SADL +revolves around three key components: SAmpling, Deliberation, and +Pseudo-Labeling of image-question pairs. Given an image-question query, we +sample image-question pairs from the training data that are in semantic +proximity to the query. To address the compositional nature of questions, the +deliberation step decomposes complex questions into a sequence of subquestions. +Finally, the sequence is progressively annotated one subquestion at a time to +generate a sequence of pseudo-labels. We investigate the behaviors of SADL +under OpenFlamingo on large-scale Visual QA datasets, namely GQA, GQA-OOD, +CLEVR, and CRIC. The evaluation demonstrates the critical roles of sampling in +the neighborhood of the image, the decomposition of complex questions, and the +accurate pairing of the subquestions and labels. These findings do not always +align with those found in language-only ICL, suggesting fresh insights in +vision-language settings. + +
+
+
+
+
+ + ☆ Pseudo-Labeling by Multi-Policy Viewfinder Network for Image Cropping + + +
+ Automatic image cropping models predict reframing boxes to enhance image
+aesthetics. Yet, the scarcity of labeled data hinders the progress of this
+task. To overcome this limitation, we explore the possibility of utilizing
+both labeled and unlabeled data together to expand the scale of training data
+for image cropping models. This idea can be implemented in a pseudo-labeling
+way: producing pseudo labels for unlabeled data with a teacher model and
+training a student model with these pseudo labels. However, the student may
+learn from the teacher's mistakes. To address this issue, we propose the
+multi-policy viewfinder network (MPV-Net) that offers diverse refining
+policies to rectify the mistakes in the original pseudo labels from the
+teacher. The most reliable policy is selected to generate trusted pseudo
+labels. The reliability of policies is evaluated via their robustness against
+box jittering. The efficacy of our method can be evaluated by the improvement
+over the supervised baseline, which only uses labeled data. Notably, our
+MPV-Net outperforms off-the-shelf pseudo-labeling methods, yielding the most
+substantial improvement over the supervised baseline. Furthermore, our
+approach achieves state-of-the-art results on both the FCDB and FLMS
+datasets, signifying the superiority of our approach.
+
+
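+ The jitter-based reliability test described above can be sketched as
+follows. This is a hypothetical illustration (the policy interface, jitter
+scale, and number of trials are assumptions, not values from the paper): the
+policy whose refined box changes least when the input box is perturbed is
+treated as the most reliable one.
+
+import numpy as np
+
+def jitter(box, scale=0.05, rng=None):
+    """Randomly perturb an (x1, y1, x2, y2) box proportionally to its size."""
+    rng = rng or np.random.default_rng()
+    x1, y1, x2, y2 = box
+    w, h = x2 - x1, y2 - y1
+    return box + rng.normal(0.0, scale, size=4) * np.array([w, h, w, h])
+
+def select_reliable_policy(policies, pseudo_box, n_trials=8):
+    """policies: callables mapping a (4,) numpy box to a refined (4,) box."""
+    spreads = []
+    for policy in policies:
+        ref = policy(pseudo_box)
+        spread = np.mean([np.abs(policy(jitter(pseudo_box)) - ref).mean()
+                          for _ in range(n_trials)])
+        spreads.append(spread)           # smaller spread = more robust to jittering
+    best = int(np.argmin(spreads))
+    return best, policies[best](pseudo_box)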
+
+ comment: 18 pages, 8 figures
+
+
+
+
+
+ + ☆ Unleash the Power of Local Representations for Few-Shot Classification + + +
+ Generalizing to novel classes unseen during training is a key challenge of
+few-shot classification. Recent metric-based methods try to address this with
+local representations. However, they are unable to take full advantage of
+them due to (i) improper supervision for pretraining the feature extractor,
+and (ii) a lack of adaptability in the metric for handling various possible
+compositions of local feature sets. In this work, we unleash the power of
+local representations in improving novel-class generalization. For the
+feature extractor, we design a novel pretraining paradigm that learns from
+randomly cropped patches with soft labels. It utilizes the class-level
+diversity of patches while diminishing the impact of their semantic
+misalignment with hard labels. To align network output with soft labels, we
+also propose a UniCon KL-Divergence that emphasizes the equal contribution of
+each base class in describing "non-base" patches. For the metric, we
+formulate measuring local feature sets as an entropy-regularized optimal
+transport problem to introduce the ability to handle sets consisting of
+homogeneous elements. Furthermore, we design a Modulate Module to endow the
+metric with the necessary adaptability. Our method achieves new
+state-of-the-art performance on three popular benchmarks. Moreover, it
+exceeds state-of-the-art transductive and cross-modal methods in the
+fine-grained scenario.
+
+
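+ The entropy-regularized optimal transport measurement mentioned above can be
+illustrated with a plain Sinkhorn iteration between two sets of local
+features. This sketch assumes uniform marginals and a cosine cost, and omits
+the paper's Modulate Module; it is not the authors' implementation.
+
+import torch
+
+def sinkhorn_set_distance(f_a, f_b, eps=0.1, n_iter=50):
+    """f_a: (N, D), f_b: (M, D) L2-normalized local features; returns the OT cost."""
+    cost = 1.0 - f_a @ f_b.t()                         # (N, M) cosine cost matrix
+    mu = torch.full((f_a.size(0),), 1.0 / f_a.size(0), device=f_a.device)
+    nu = torch.full((f_b.size(0),), 1.0 / f_b.size(0), device=f_b.device)
+    K = torch.exp(-cost / eps)                         # Gibbs kernel
+    u = torch.ones_like(mu)
+    for _ in range(n_iter):                            # Sinkhorn-Knopp updates
+        v = nu / (K.t() @ u)
+        u = mu / (K @ v)
+    plan = u.unsqueeze(1) * K * v.unsqueeze(0)         # transport plan
+    return (plan * cost).sum()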
+
+
+
+
+ + ☆ Zero-shot Video Restoration and Enhancement Using Pre-Trained Image + Diffusion Model + + +
+ Diffusion-based zero-shot image restoration and enhancement models have +achieved great success in various image restoration and enhancement tasks +without training. However, directly applying them to video restoration and +enhancement results in severe temporal flickering artifacts. In this paper, we +propose the first framework for zero-shot video restoration and enhancement +based on a pre-trained image diffusion model. By replacing the self-attention +layer with the proposed cross-previous-frame attention layer, the pre-trained +image diffusion model can take advantage of the temporal correlation between +neighboring frames. We further propose temporal consistency guidance, +spatial-temporal noise sharing, and an early stopping sampling strategy for +better temporally consistent sampling. Our method is a plug-and-play module +that can be inserted into any diffusion-based zero-shot image restoration or +enhancement methods to further improve their performance. Experimental results +demonstrate the superiority of our proposed method in producing temporally +consistent videos with better fidelity. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ FlowTrack: Point-level Flow Network for 3D Single Object Tracking IROS2024 + + +
+ 3D single object tracking (SOT) is a crucial task in fields of mobile +robotics and autonomous driving. Traditional motion-based approaches achieve +target tracking by estimating the relative movement of target between two +consecutive frames. However, they usually overlook local motion information of +the target and fail to exploit historical frame information effectively. To +overcome the above limitations, we propose a point-level flow method with +multi-frame information for 3D SOT task, called FlowTrack. Specifically, by +estimating the flow for each point in the target, our method could capture the +local motion details of target, thereby improving the tracking performance. At +the same time, to handle scenes with sparse points, we present a learnable +target feature as the bridge to efficiently integrate target information from +past frames. Moreover, we design a novel Instance Flow Head to transform dense +point-level flow into instance-level motion, effectively aggregating local +motion information to obtain global target motion. Finally, our method achieves +competitive performance with improvements of 5.9% on the KITTI dataset and 2.9% +on NuScenes. The code will be made publicly available soon. + +
+
+ comment: Accepted by IROS2024 +
+
+
+
+
+ + ☆ Indoor 3D Reconstruction with an Unknown Camera-Projector Pair + + +
+ Structured light-based methods with a camera-projector pair (CPP) play a
+vital role in indoor 3D reconstruction, especially for scenes with weak
+textures. Previous methods usually assume known intrinsics, which are
+pre-calibrated from known objects, or self-calibrated from multi-view
+observations. It is still challenging to reliably recover CPP intrinsics from
+only two views without any known objects. In this paper, we provide a simple
+yet reliable solution. We demonstrate that, for the first time, sufficient
+constraints on CPP intrinsics can be derived from an unknown cuboid corner
+(C2), e.g., a room's corner, which is a common structure in indoor scenes. In
+addition, with only the camera principal point known, the complex
+multi-variable estimation of all CPP intrinsics can be simplified to a simple
+univariable optimization problem, leading to reliable calibration and thus
+direct 3D reconstruction with an unknown CPP. Extensive results have
+demonstrated the superiority of the proposed method over both traditional and
+learning-based counterparts. Furthermore, the proposed method also
+demonstrates impressive potential to solve similar tasks without active
+lighting, such as sparse-view structure from motion.
+
+
+
+
+
+
+ + ☆ Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and + Aleatoric Awareness + + +
+ The ability to acknowledge the inevitable uncertainty in their knowledge and
+reasoning is a prerequisite for AI systems to be truly truthful and reliable.
+In this paper, we present a taxonomy of uncertainty specific to
+vision-language AI systems, distinguishing between epistemic uncertainty
+(arising from a lack of information) and aleatoric uncertainty (due to
+inherent unpredictability), and further explore finer categories within.
+Based on this taxonomy, we synthesize a benchmark dataset,
+CertainlyUncertain, featuring 178K visual question answering (VQA) samples as
+contrastive pairs. This is achieved by 1) inpainting images to turn
+previously answerable questions into unanswerable ones; and 2) using image
+captions to prompt large language models for both answerable and unanswerable
+questions. Additionally, we introduce a new metric, confidence-weighted
+accuracy, which is well correlated with both accuracy and calibration error,
+to address the shortcomings of existing metrics.
+
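+ The abstract does not give the exact formula for confidence-weighted
+accuracy, so the snippet below is only one plausible realization (a
+hypothetical sketch, not the paper's definition): each answer's contribution
+is weighted by the model's stated confidence, so calibrated confidence is
+rewarded and confident mistakes are penalized.
+
+def confidence_weighted_accuracy(records):
+    """records: list of (is_correct: bool, confidence: float in [0, 1]) pairs."""
+    records = list(records)
+    if not records:
+        return 0.0
+    score = sum(conf if correct else 1.0 - conf for correct, conf in records)
+    return score / len(records)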
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Probabilistic 3D Correspondence Prediction from Sparse Unsegmented + Images + + +
+ The study of physiology demonstrates that the form (shape) of anatomical
+structures dictates their functions, and analyzing the form of anatomies
+plays a crucial role in clinical research. Statistical shape modeling (SSM)
+is a widely used tool for quantitative analysis of the forms of anatomies,
+aiding in characterizing and identifying differences within a population of
+subjects. Despite its utility, the conventional SSM construction pipeline is
+often complex and time-consuming. Additionally, reliance on linearity
+assumptions further limits the model from capturing clinically relevant
+variations. Recent advancements in deep learning solutions enable the direct
+inference of SSM from unsegmented medical images, streamlining the process
+and improving accessibility. However, these new image-based SSM methods do
+not adequately account for situations where the imaging data quality is poor
+or where only sparse information is available. Moreover, quantifying
+aleatoric uncertainty, which represents inherent data variability, is crucial
+when deploying deep learning for clinical tasks to ensure reliable model
+predictions and robust decision-making, especially in challenging imaging
+conditions. Therefore, we propose SPI-CorrNet, a unified model that predicts
+3D correspondences from sparse imaging data. It leverages a teacher network
+to regularize feature learning and quantifies data-dependent aleatoric
+uncertainty by adapting the network to predict intrinsic input variances.
+Experiments on the LGE MRI left atrium dataset and the Abdomen CT-1K liver
+dataset demonstrate that our technique enhances the accuracy and robustness
+of sparse image-driven SSM.
+
+
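+ Predicting data-dependent aleatoric uncertainty is commonly done by letting
+the network output a variance alongside each prediction and training with a
+Gaussian negative log-likelihood. The sketch below illustrates that generic
+objective only; SPI-CorrNet's teacher regularization and exact loss are not
+reproduced here.
+
+import torch
+
+def aleatoric_nll(pred_points, pred_log_var, target_points):
+    """pred_points, target_points: (B, N, 3); pred_log_var: (B, N, 3)."""
+    inv_var = torch.exp(-pred_log_var)
+    sq_err = (pred_points - target_points) ** 2
+    # Residuals are down-weighted where the predicted variance is high, while the
+    # log-variance term keeps the network from inflating the variance everywhere.
+    return 0.5 * (inv_var * sq_err + pred_log_var).mean()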
+
+
+
+
+ + ☆ Self-Cooperation Knowledge Distillation for Novel Class Discovery ECCV2024 + + +
+ Novel Class Discovery (NCD) aims to discover unknown and novel classes in an
+unlabeled set by leveraging knowledge already learned about known classes.
+Existing works focus on instance-level or class-level knowledge
+representation and build a shared representation space to achieve performance
+improvements. However, a long-neglected issue is the potentially imbalanced
+number of samples from known and novel classes, which pushes the model
+towards the dominant classes. Therefore, these methods suffer from a
+challenging trade-off between reviewing known classes and discovering novel
+classes. Based on this observation, we propose a Self-Cooperation Knowledge
+Distillation (SCKD) method that utilizes each training sample (whether known
+or novel, labeled or unlabeled) for both review and discovery. Specifically,
+the model's feature representations of known and novel classes are used to
+construct two disjoint representation spaces. Through spatial mutual
+information, we design a self-cooperation learning scheme that encourages the
+model to learn from its own two feature representation spaces. Extensive
+experiments on six datasets demonstrate that our method can achieve
+significant performance improvements, achieving state-of-the-art performance.
+
+
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ SymPoint Revolutionized: Boosting Panoptic Symbol Spotting with Layer + Feature Enhancement + + +
+ SymPoint is an initial attempt that utilizes point set representation to +solve the panoptic symbol spotting task on CAD drawing. Despite its +considerable success, it overlooks graphical layer information and suffers from +prohibitively slow training convergence. To tackle this issue, we introduce +SymPoint-V2, a robust and efficient solution featuring novel, streamlined +designs that overcome these limitations. In particular, we first propose a +Layer Feature-Enhanced module (LFE) to encode the graphical layer information +into the primitive feature, which significantly boosts the performance. We also +design a Position-Guided Training (PGT) method to make it easier to learn, +which accelerates the convergence of the model in the early stages and further +promotes performance. Extensive experiments show that our model achieves better +performance and faster convergence than its predecessor SymPoint on the public +benchmark. Our code and trained models are available at +https://github.com/nicehuster/SymPointV2. + +
+
+ comment: code at https://github.com/nicehuster/SymPointV2 +
+
+
+
+
+ + ☆ Chemical Shift Encoding based Double Bonds Quantification in + Triglycerides using Deep Image Prior + + +
+ This study evaluated a deep learning-based method using Deep Image Prior +(DIP) to quantify triglyceride double bonds from chemical-shift encoded +multi-echo gradient echo images without network training. We employed a cost +function based on signal constraints to iteratively update the neural network +on a single dataset. The method was validated using phantom experiments and in +vivo scans. Results showed close alignment between measured and reference +double bond values, with phantom experiments yielding a Pearson correlation +coefficient of 0.96 (p = .0005). In vivo results demonstrated good agreement in +subcutaneous fat. We conclude that Deep Image Prior shows feasibility for +quantifying double bonds and fatty acid content from chemical-shift encoded +multi-echo MRI. + +
+
+
+
+
+ + ☆ Looking From the Future: Multi-order Iterations Can Enhance Adversarial + Attack Transferability + + +
+ Various methods try to enhance adversarial transferability by improving
+generalization from different perspectives. In this paper, we rethink the
+optimization process and propose a novel sequence optimization concept, named
+Looking From the Future (LFF). LFF makes use of the original optimization
+process to refine the very first local optimization choice. Adapting the LFF
+concept to the adversarial attack task, we further propose an LFF attack as
+well as an MLFF attack with better generalization ability. Furthermore,
+guided by the LFF concept, we propose an $LFF^{\mathcal{N}}$ attack, which
+extends the LFF attack to a multi-order attack and further enhances the
+transfer attack ability. All our proposed methods can be directly applied to
+iteration-based attack methods. We evaluate our proposed method on the
+ImageNet1k dataset by applying several SOTA adversarial attack methods under
+four kinds of tasks. Experimental results show that our proposed method can
+greatly enhance attack transferability. Ablation experiments are also
+conducted to verify the effectiveness of each component. The source code will
+be released after this paper is accepted.
+
+
+
+
+
+
+ + ☆ GVDIFF: Grounded Text-to-Video Generation with Diffusion Models + + +
+ Text-to-video (T2V) generation has attracted significant attention, yet
+unifying discrete and continuous grounding conditions in T2V generation
+remains under-explored. This paper proposes a Grounded text-to-Video
+generation framework, termed GVDIFF. First, we inject the grounding condition
+into the self-attention through an uncertainty-based representation to
+explicitly guide the focus of the network. Second, we introduce a
+spatial-temporal grounding layer that connects the grounding condition with
+target objects and equips the model with grounded generation capacity in the
+spatial-temporal domain. Third, our dynamic gate network adaptively skips the
+redundant grounding process to selectively extract grounding information and
+semantics while improving efficiency. We extensively evaluate the grounded
+generation capacity of GVDIFF and demonstrate its versatility in
+applications, including long-range video generation, sequential prompts, and
+object-specific editing.
+
+
+
+
+
+
+ + ☆ To Forget or Not? Towards Practical Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) trained on extensive corpora inevitably retain +sensitive data, such as personal privacy information and copyrighted material. +Recent advancements in knowledge unlearning involve updating LLM parameters to +erase specific knowledge. However, current unlearning paradigms are mired in +vague forgetting boundaries, often erasing knowledge indiscriminately. In this +work, we introduce KnowUnDo, a benchmark containing copyrighted content and +user privacy domains to evaluate if the unlearning process inadvertently erases +essential knowledge. Our findings indicate that existing unlearning methods +often suffer from excessive unlearning. To address this, we propose a simple +yet effective method, MemFlex, which utilizes gradient information to precisely +target and unlearn sensitive parameters. Experimental results show that MemFlex +is superior to existing methods in both precise knowledge unlearning and +general knowledge retaining of LLMs. Code and dataset will be released at +https://github.com/zjunlp/KnowUnDo. + +
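+ The gradient-guided selection of sensitive parameters described above can be
+sketched as follows. This is an illustrative outline in the spirit of the
+method, not the released code: the ascent update, the keep_ratio threshold,
+and the per-parameter mask are assumptions made for the example.
+
+import torch
+
+def selective_unlearning_step(model, forget_loss, lr=1e-5, keep_ratio=0.01):
+    """Update only the parameters whose forget-loss gradients are largest."""
+    model.zero_grad()
+    forget_loss.backward()
+    mags = torch.cat([p.grad.abs().flatten()
+                      for p in model.parameters() if p.grad is not None])
+    k = max(1, int(keep_ratio * mags.numel()))
+    threshold = torch.topk(mags, k).values.min()         # magnitude cutoff
+    with torch.no_grad():
+        for p in model.parameters():
+            if p.grad is None:
+                continue
+            mask = (p.grad.abs() >= threshold).float()   # sensitive parameters only
+            p.add_(mask * p.grad, alpha=lr)              # gradient ascent on the forget loss
+    model.zero_grad()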
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Method to Facilitate Membership Inference Attacks in Deep Learning + Models NDSS'25 + + +
+ Modern machine learning (ML) ecosystems offer a surging number of ML +frameworks and code repositories that can greatly facilitate the development of +ML models. Today, even ordinary data holders who are not ML experts can apply +off-the-shelf codebase to build high-performance ML models on their data, many +of which are sensitive in nature (e.g., clinical records). + In this work, we consider a malicious ML provider who supplies model-training +code to the data holders, does not have access to the training process, and has +only black-box query access to the resulting model. In this setting, we +demonstrate a new form of membership inference attack that is strictly more +powerful than prior art. Our attack empowers the adversary to reliably +de-identify all the training samples (average >99% attack TPR@0.1% FPR), and +the compromised models still maintain competitive performance as their +uncorrupted counterparts (average <1% accuracy drop). Moreover, we show that +the poisoned models can effectively disguise the amplified membership leakage +under common membership privacy auditing, which can only be revealed by a set +of secret samples known by the adversary. + Overall, our study not only points to the worst-case membership privacy +leakage, but also unveils a common pitfall underlying existing privacy auditing +methods, which calls for future efforts to rethink the current practice of +auditing membership privacy in machine learning models. + +
+
+ comment: NDSS'25 (a shorter version of this paper will appear in the + conference proceeding) +
+
+
+
+
+ + ☆ Efficient Stochastic Differential Equation for DEM Super Resolution with + Void Filling + + +
+ The Digital Elevation Model (DEM) plays a fundamental role in remote sensing
+and photogrammetry. Enhancing the quality of DEMs is crucial for various
+applications. Although multiple types of defects may appear simultaneously in
+the same DEM, they are commonly addressed separately. Most existing
+approaches only aim to fill DEM voids, or apply super-resolution to an intact
+DEM. This paper introduces a unified generative model that simultaneously
+addresses the void and low-resolution problems, rather than taking two
+separate measures. The proposed approach presents the DEM Stochastic
+Differential Equation (DEM-SDE) for unified DEM quality enhancement. DEM
+degradation, i.e., downsampling and the addition of random voids, is modeled
+as the forward SDE process, and restoration is achieved by simulating the
+corresponding reverse process. Conditioned on terrain features, and adopting
+efficient submodules with lightweight channel attention, DEM-SDE enhances DEM
+quality with an efficient training process. The experiments show that the
+DEM-SDE method achieves highly competitive performance in simultaneous
+super-resolution and void filling compared to state-of-the-art work. DEM-SDE
+also remains robust on larger DEM patches.
+
+
+
+
+
+
+ + ☆ The Solution for the ICCV 2023 Perception Test Challenge 2023 -- Task 6 + -- Grounded videoQA + + +
+ In this paper, we introduce a grounded video question-answering solution.
+Our research reveals that the fixed official baseline method for video
+question answering involves two main steps: visual grounding and object
+tracking. However, a significant challenge emerges during the initial step,
+where selected frames may lack clearly identifiable target objects.
+Furthermore, single images cannot address questions like "Track the container
+from which the person pours the first time." To tackle this issue, we propose
+an alternative two-stage approach: (1) we first leverage the VALOR model to
+answer questions based on video information; (2) we then concatenate the
+answered questions with their respective answers. Finally, we employ TubeDETR
+to generate bounding boxes for the targets.
+
+
+
+
+
+
+ + ☆ Enhancing Multi-Class Anomaly Detection via Diffusion Refinement with + Dual Conditioning + + +
+ Anomaly detection, the technique of identifying abnormal samples using only
+normal samples, has attracted widespread interest in industry. Existing
+one-model-per-category methods often struggle with limited generalization
+capabilities due to their focus on a single category, and can fail when
+encountering variations in products. Recent feature reconstruction methods,
+as representatives of one-model-all-categories schemes, face challenges
+including the reconstruction of anomalous samples and blurry reconstructions.
+In this paper, we creatively combine a diffusion model and a transformer for
+multi-class anomaly detection. This approach leverages diffusion to obtain
+high-frequency information for refinement, greatly alleviating the blurry
+reconstruction problem while maintaining the sampling efficiency of the
+reverse diffusion process. The task is transformed into image inpainting to
+disconnect the input-output correlation, thereby mitigating the "identical
+shortcuts" problem and preventing the model from reconstructing anomalous
+samples. Besides, we introduce category-awareness using dual conditioning to
+ensure the accuracy of prediction and reconstruction in the reverse diffusion
+process, preventing excessive deviation from the target category and thus
+effectively enabling multi-class anomaly detection. Furthermore,
+spatio-temporal fusion is also employed to fuse heatmaps predicted at
+different timesteps and scales, enhancing the performance of multi-class
+anomaly detection. Extensive experiments on benchmark datasets demonstrate
+the superior performance and exceptional multi-class anomaly detection
+capabilities of our proposed method compared to others.
+
+
+
+
+
+
+ + ☆ Text-Aware Diffusion for Policy Learning + + +
+ Training an agent to achieve particular goals or perform desired behaviors is +often accomplished through reinforcement learning, especially in the absence of +expert demonstrations. However, supporting novel goals or behaviors through +reinforcement learning requires the ad-hoc design of appropriate reward +functions, which quickly becomes intractable. To address this challenge, we +propose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a +pretrained, frozen text-conditioned diffusion model to compute dense zero-shot +reward signals for text-aligned policy learning. We hypothesize that +large-scale pretrained generative models encode rich priors that can supervise +a policy to behave not only in a text-aligned manner, but also in alignment +with a notion of naturalness summarized from internet-scale training data. In +our experiments, we demonstrate that TADPoLe is able to learn policies for +novel goal-achievement and continuous locomotion behaviors specified by natural +language, in both Humanoid and Dog environments. The behaviors are learned +zero-shot without ground-truth rewards or expert demonstrations, and are +qualitatively more natural according to human evaluation. We further show that +TADPoLe performs competitively when applied to robotic manipulation tasks in +the Meta-World environment. + +
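+ The dense text-conditioned reward described above can be illustrated with a
+simple denoising-error score from a frozen diffusion model. This is a
+hypothetical sketch (the reward shape, normalization, and naturalness term
+used in TADPoLe are not reproduced): a rendered frame that the
+text-conditioned prior can denoise well receives a higher reward.
+
+import torch
+
+@torch.no_grad()
+def diffusion_text_reward(eps_model, alphas_cumprod, frame, text_emb, t):
+    """frame: (B, C, H, W) rendered observation; t: (B,) long diffusion timesteps."""
+    noise = torch.randn_like(frame)
+    a_t = alphas_cumprod[t].view(-1, 1, 1, 1)
+    x_t = a_t.sqrt() * frame + (1.0 - a_t).sqrt() * noise
+    eps_pred = eps_model(x_t, t, text_emb)
+    # Lower denoising error under the prompt -> better text alignment -> higher reward.
+    return -((eps_pred - noise) ** 2).mean().item()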
+
+
+
+
+ + ☆ Adaptive Modality Balanced Online Knowledge Distillation for + Brain-Eye-Computer based Dim Object Detection + + +
+ Advanced cognition can be extracted from the human brain using brain-computer +interfaces. Integrating these interfaces with computer vision techniques, which +possess efficient feature extraction capabilities, can achieve more robust and +accurate detection of dim targets in aerial images. However, existing target +detection methods primarily concentrate on homogeneous data, lacking efficient +and versatile processing capabilities for heterogeneous multimodal data. In +this paper, we first build a brain-eye-computer based object detection system +for aerial images under few-shot conditions. This system detects suspicious +targets using region proposal networks, evokes the event-related potential +(ERP) signal in electroencephalogram (EEG) through the eye-tracking-based slow +serial visual presentation (ESSVP) paradigm, and constructs the EEG-image data +pairs with eye movement data. Then, an adaptive modality balanced online +knowledge distillation (AMBOKD) method is proposed to recognize dim objects +with the EEG-image data. AMBOKD fuses EEG and image features using a multi-head +attention module, establishing a new modality with comprehensive features. To +enhance the performance and robust capability of the fusion modality, +simultaneous training and mutual learning between modalities are enabled by +end-to-end online knowledge distillation. During the learning process, an +adaptive modality balancing module is proposed to ensure multimodal equilibrium +by dynamically adjusting the weights of the importance and the training +gradients across various modalities. The effectiveness and superiority of our +method are demonstrated by comparing it with existing state-of-the-art methods. +Additionally, experiments conducted on public datasets and system validations +in real-world scenarios demonstrate the reliability and practicality of the +proposed system and the designed method. + +
+
+ comment: 18 pages,15 figures +
+
+
+
+
+ + ☆ PO-MSCKF: An Efficient Visual-Inertial Odometry by Reconstructing the + Multi-State Constrained Kalman Filter with the Pose-only Theory + + +
+ Efficient Visual-Inertial Odometry (VIO) is crucial for payload-constrained +robots. Though modern optimization-based algorithms have achieved superior +accuracy, the MSCKF-based VIO algorithms are still widely demanded for their +efficient and consistent performance. As MSCKF is built upon the conventional +multi-view geometry, the measured residuals are not only related to the state +errors but also related to the feature position errors. To apply EKF fusion, a +projection process is required to remove the feature position error from the +observation model, which can lead to model and accuracy degradation. To obtain +an efficient visual-inertial fusion model, while also preserving the model +consistency, we propose to reconstruct the MSCKF VIO with the novel Pose-Only +(PO) multi-view geometry description. In the newly constructed filter, we have +modeled PO reprojection residuals, which are solely related to the motion +states and thus overcome the requirements of space projection. Moreover, the +new filter does not require any feature position information, which removes the +computational cost and linearization errors brought in by the 3D reconstruction +procedure. We have conducted comprehensive experiments on multiple datasets, +where the proposed method has shown accuracy improvements and consistent +performance in challenging sequences. + +
+
+
+
+
+ + ☆ EIT-1M: One Million EEG-Image-Text Pairs for Human Visual-textual + Recognition and More + + +
+ Recently, electroencephalography (EEG) signals have been actively +incorporated to decode brain activity to visual or textual stimuli and achieve +object recognition in multi-modal AI. Accordingly, endeavors have been focused +on building EEG-based datasets from visual or textual single-modal stimuli. +However, these datasets offer limited EEG epochs per category, and the complex +semantics of stimuli presented to participants compromise their quality and +fidelity in capturing precise brain activity. The study in neuroscience unveils +that the relationship between visual and textual stimulus in EEG recordings +provides valuable insights into the brain's ability to process and integrate +multi-modal information simultaneously. Inspired by this, we propose a novel +large-scale multi-modal dataset, named EIT-1M, with over 1 million +EEG-image-text pairs. Our dataset is superior in its capacity of reflecting +brain activities in simultaneously processing multi-modal information. To +achieve this, we collected data pairs while participants viewed alternating +sequences of visual-textual stimuli from 60K natural images and +category-specific texts. Common semantic categories are also included to elicit +better reactions from participants' brains. Meanwhile, response-based stimulus +timing and repetition across blocks and sessions are included to ensure data +diversity. To verify the effectiveness of EIT-1M, we provide an in-depth +analysis of EEG data captured from multi-modal stimuli across different +categories and participants, along with data quality scores for transparency. +We demonstrate its validity on two tasks: 1) EEG recognition from visual or +textual stimuli or both and 2) EEG-to-visual generation. + +
+
+
+
+
+ + ♻ ☆ ImageFlowNet: Forecasting Multiscale Trajectories of Disease Progression + with Irregularly-Sampled Longitudinal Medical Images + + +
+ The forecasting of disease progression from images is a holy grail for +clinical decision making. However, this task is complicated by the inherent +high dimensionality, temporal sparsity and sampling irregularity in +longitudinal image acquisitions. Existing methods often rely on extracting +hand-crafted features and performing time-series analysis in this vector space, +leading to a loss of rich spatial information within the images. To overcome +these challenges, we introduce ImageFlowNet, a novel framework that learns +latent-space flow fields that evolve multiscale representations in joint +embedding spaces using neural ODEs and SDEs to model disease progression in the +image domain. Notably, ImageFlowNet learns multiscale joint representation +spaces by combining cohorts of patients together so that information can be +transferred between the patient samples. The dynamics then provide plausible +trajectories of progression, with the SDE providing alternative trajectories +from the same starting point. We provide theoretical insights that support our +formulation of ODEs, and motivate our regularizations involving high-level +visual features, latent space organization, and trajectory smoothness. We then +demonstrate ImageFlowNet's effectiveness through empirical evaluations on three +longitudinal medical image datasets depicting progression in retinal geographic +atrophy, multiple sclerosis, and glioblastoma. + +
+
+ comment: Included reference to codebase. Added acknowledgements +
+
+
+
+
+ + ♻ ☆ Benchmarking bias: Expanding clinical AI model card to incorporate bias + reporting of social and non-social factors + + +
+ Clinical AI model reporting cards should be expanded to incorporate a broad +bias reporting of both social and non-social factors. Non-social factors +consider the role of other factors, such as disease dependent, anatomic, or +instrument factors on AI model bias, which are essential to ensure safe +deployment. + +
+
+
+
+
+ + ♻ ☆ Enhancing Deep Neural Network Training Efficiency and Performance + through Linear Prediction + + +
+ Deep neural networks (DNN) have achieved remarkable success in various
+fields, including computer vision and natural language processing. However,
+training an effective DNN model still poses challenges. This paper proposes a
+method to optimize the training effectiveness of DNNs, with the goal of
+improving model performance. Firstly, based on the observation that DNN
+parameters change according to certain laws during the training process, the
+potential of parameter prediction for improving model training efficiency and
+performance is identified. Secondly, considering the magnitude of DNN model
+parameters, hardware limitations and the noise tolerance of Stochastic
+Gradient Descent (SGD), a Parameter Linear Prediction (PLP) method is
+exploited to perform DNN parameter prediction. Finally, validations are
+carried out on several representative backbones. Experimental results show
+that, compared with normal training under the same training conditions and
+number of epochs, the proposed PLP method yields on average about a 1%
+accuracy improvement and a 0.01 top-1/top-5 error reduction for VGG16,
+ResNet18 and GoogLeNet on the CIFAR-100 dataset, which demonstrates the
+effectiveness of the proposed method on different DNN structures and
+validates its capacity to enhance DNN training efficiency and performance.
+
+
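+ The parameter prediction step can be pictured as a simple linear
+extrapolation from two earlier checkpoints, as sketched below. This is an
+illustrative outline only; when and how often the prediction is applied, and
+the extrapolation horizon, are design choices of the paper that are assumed
+here.
+
+import torch
+
+@torch.no_grad()
+def linear_parameter_prediction(model, params_e1, params_e2, steps_ahead=1.0):
+    """params_e1, params_e2: dicts of parameter tensors saved at two earlier epochs
+    (e1 < e2); the model is moved along the observed linear trend."""
+    for name, p in model.named_parameters():
+        delta = params_e2[name] - params_e1[name]       # observed per-parameter trend
+        p.copy_(params_e2[name] + steps_ahead * delta)  # extrapolate forward in time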
+
+
+
+
+ + ♻ ☆ ColorizeDiffusion: Adjustable Sketch Colorization with Reference Image + and Text + + +
+ Diffusion models have recently demonstrated their effectiveness in generating +extremely high-quality images and are now utilized in a wide range of +applications, including automatic sketch colorization. Although many methods +have been developed for guided sketch colorization, there has been limited +exploration of the potential conflicts between image prompts and sketch inputs, +which can lead to severe deterioration in the results. Therefore, this paper +exhaustively investigates reference-based sketch colorization models that aim +to colorize sketch images using reference color images. We specifically +investigate two critical aspects of reference-based diffusion models: the +"distribution problem", which is a major shortcoming compared to text-based +counterparts, and the capability in zero-shot sequential text-based +manipulation. We introduce two variations of an image-guided latent diffusion +model utilizing different image tokens from the pre-trained CLIP image encoder +and propose corresponding manipulation methods to adjust their results +sequentially using weighted text inputs. We conduct comprehensive evaluations +of our models through qualitative and quantitative experiments as well as a +user study. + +
+
+
+
+
+ + ♻ ☆ Forward Learning for Gradient-based Black-box Saliency Map Generation + + +
+ Gradient-based saliency maps are widely used to explain deep neural network
+decisions. However, as models become deeper and more black-box, such as in
+closed-source APIs like ChatGPT, computing gradients becomes challenging,
+hindering conventional explanation methods. In this work, we introduce a
+novel unified framework for estimating gradients in black-box settings and
+generating saliency maps to interpret model decisions. We employ the
+likelihood ratio method to estimate output-to-input gradients and utilize
+them for saliency map generation. Additionally, we propose blockwise
+computation techniques to enhance estimation accuracy. Extensive experiments
+in black-box settings validate the effectiveness of our method, demonstrating
+accurate gradient estimation and the explainability of generated saliency
+maps. Furthermore, we showcase the scalability of our approach by applying it
+to explain GPT-Vision, revealing the continued relevance of gradient-based
+explanation methods in the era of large, closed-source, and black-box models.
+
+
+
+ comment: The evaluation is based on small datasets and limited models, of + which bias leads to misleading conclusions +
+
+
+
+
+ + ♻ ☆ SINCERE: Supervised Information Noise-Contrastive Estimation REvisited + + +
+ The information noise-contrastive estimation (InfoNCE) loss function
+provides the basis of many self-supervised deep learning methods due to its
+strong empirical results and theoretic motivation. Previous work suggests a
+supervised contrastive (SupCon) loss to extend InfoNCE to learn from
+available class labels. This SupCon loss has been widely used due to reports
+of good empirical performance. However, in this work we find that the prior
+SupCon loss formulation has questionable justification because it can
+encourage some images from the same class to repel one another in the learned
+embedding space. This problematic intra-class repulsion gets worse as the
+number of images sharing one class label increases. We propose the Supervised
+InfoNCE REvisited (SINCERE) loss as a theoretically justified supervised
+extension of InfoNCE that eliminates intra-class repulsion. Experiments show
+that SINCERE leads to better separation of embeddings from different classes
+and improves transfer learning classification accuracy. We additionally
+utilize probabilistic modeling to derive an information-theoretic bound that
+relates the SINCERE loss to the symmetrized KL divergence between the
+data-generating distributions for a target class and all other classes.
+
+
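+ One way to picture a supervised InfoNCE variant without intra-class
+repulsion is sketched below: for each anchor-positive pair, other same-class
+samples are simply excluded from the softmax denominator, so they are never
+pushed apart. This is an illustrative sketch in the spirit of the loss, not
+the authors' reference implementation.
+
+import torch
+import torch.nn.functional as F
+
+def sincere_style_loss(features, labels, temperature=0.1):
+    """features: (B, D) embeddings; labels: (B,) integer class labels."""
+    z = F.normalize(features, dim=-1)
+    sim = z @ z.t() / temperature
+    B = z.size(0)
+    eye = torch.eye(B, dtype=torch.bool, device=z.device)
+    same = labels.unsqueeze(0) == labels.unsqueeze(1)         # same-class mask
+    losses = []
+    for i in range(B):
+        positives = (same[i] & ~eye[i]).nonzero(as_tuple=True)[0]
+        negatives = ~same[i]                                   # only other-class samples
+        for j in positives:
+            logits = torch.cat([sim[i, j].view(1), sim[i, negatives]])
+            target = torch.zeros(1, dtype=torch.long, device=z.device)
+            losses.append(F.cross_entropy(logits.unsqueeze(0), target))
+    return torch.stack(losses).mean() if losses else sim.new_zeros(())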
+
+
+
+
+ + ♻ ☆ StructLDM: Structured Latent Diffusion for 3D Human Generation ECCV 2024 + + +
+ Recent 3D human generative models have achieved remarkable progress by +learning 3D-aware GANs from 2D images. However, existing 3D human generative +methods model humans in a compact 1D latent space, ignoring the articulated +structure and semantics of human body topology. In this paper, we explore more +expressive and higher-dimensional latent space for 3D human modeling and +propose StructLDM, a diffusion-based unconditional 3D human generative model, +which is learned from 2D images. StructLDM solves the challenges imposed due to +the high-dimensional growth of latent space with three key designs: 1) A +semantic structured latent space defined on the dense surface manifold of a +statistical human body template. 2) A structured 3D-aware auto-decoder that +factorizes the global latent space into several semantic body parts +parameterized by a set of conditional structured local NeRFs anchored to the +body template, which embeds the properties learned from the 2D training data +and can be decoded to render view-consistent humans under different poses and +clothing styles. 3) A structured latent diffusion model for generative human +appearance sampling. Extensive experiments validate StructLDM's +state-of-the-art generation performance and illustrate the expressiveness of +the structured latent space over the well-adopted 1D latent space. Notably, +StructLDM enables different levels of controllable 3D human generation and +editing, including pose/view/shape control, and high-level tasks including +compositional generations, part-aware clothing editing, 3D virtual try-on, etc. +Our project page is at: https://taohuumd.github.io/projects/StructLDM/. + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://taohuumd.github.io/projects/StructLDM/ +
+
+
+
+
+ + ♻ ☆ Greedy-DiM: Greedy Algorithms for Unreasonably Effective Face Morphs + + +
+ Morphing attacks, which aim to create a single image that contains the
+biometric information of multiple identities, are an emerging threat to
+state-of-the-art Face Recognition (FR) systems. Diffusion Morphs (DiM) are a
+recently proposed morphing attack that has achieved state-of-the-art
+performance for representation-based morphing attacks. However, none of the
+existing research on DiMs has leveraged the iterative nature of DiMs, leaving
+the DiM model as a black box and treating it no differently than one would a
+Generative Adversarial Network (GAN) or Variational AutoEncoder (VAE). We
+propose a greedy strategy on the iterative sampling process of DiM models
+which searches for an optimal step guided by an identity-based heuristic
+function. We compare our proposed algorithm against ten other
+state-of-the-art morphing algorithms using the open-source SYN-MAD 2022
+competition dataset. We find that our proposed algorithm is unreasonably
+effective, fooling all of the tested FR systems with an MMPMR of 100%,
+outperforming all other compared morphing algorithms.
+
+
+
+ comment: Accepted as a conference paper at IJCB 2024 +
+
+
+
+
+ + ♻ ☆ SERNet-Former: Semantic Segmentation by Efficient Residual Network with + Attention-Boosting Gates and Attention-Fusion Networks + + +
+ Improving the efficiency of state-of-the-art methods in semantic segmentation +requires overcoming the increasing computational cost as well as issues such as +fusing semantic information from global and local contexts. Based on the recent +success and problems that convolutional neural networks (CNNs) encounter in +semantic segmentation, this research proposes an encoder-decoder architecture +with a unique efficient residual network, Efficient-ResNet. Attention-boosting +gates (AbGs) and attention-boosting modules (AbMs) are deployed by aiming to +fuse the equivariant and feature-based semantic information with the equivalent +sizes of the output of global context of the efficient residual network in the +encoder. Respectively, the decoder network is developed with the additional +attention-fusion networks (AfNs) inspired by AbM. AfNs are designed to improve +the efficiency in the one-to-one conversion of the semantic information by +deploying additional convolution layers in the decoder part. Our network is +tested on the challenging CamVid and Cityscapes datasets, and the proposed +methods reveal significant improvements on the residual networks. To the best +of our knowledge, the developed network, SERNet-Former, achieves +state-of-the-art results (84.62 % mean IoU) on CamVid dataset and challenging +results (87.35 % mean IoU) on Cityscapes validation dataset. + +
+
+
+
+
+ + ♻ ☆ xLSTM-UNet can be an Effective 2D & 3D Medical Image Segmentation + Backbone with Vision-LSTM (ViL) better than its Mamba Counterpart + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViT) have been
+pivotal in biomedical image segmentation, yet their ability to manage
+long-range dependencies remains constrained by inherent locality and
+computational overhead. To overcome these challenges, in this technical
+report we first propose xLSTM-UNet, a UNet-structured deep neural network
+that leverages Vision-LSTM (xLSTM) as its backbone for medical image
+segmentation. xLSTM was recently proposed as the successor of Long Short-Term
+Memory (LSTM) networks and has demonstrated superior performance compared to
+Transformers and State Space Models (SSMs) like Mamba in Natural Language
+Processing (NLP) and image classification (as demonstrated in the
+Vision-LSTM, or ViL, implementation). Here, the xLSTM-UNet we designed
+extends this success to the biomedical image segmentation domain. By
+integrating the local feature extraction strengths of convolutional layers
+with the long-range dependency capturing abilities of xLSTM, xLSTM-UNet
+offers a robust solution for comprehensive image analysis. We validate the
+efficacy of xLSTM-UNet through experiments. Our findings demonstrate that
+xLSTM-UNet consistently surpasses the performance of leading CNN-based,
+Transformer-based, and Mamba-based segmentation networks on multiple
+biomedical segmentation datasets, including organs in abdominal MRI,
+instruments in endoscopic images, and cells in microscopic images. With
+comprehensive experiments performed, this technical report highlights the
+potential of xLSTM-based architectures in advancing biomedical image analysis
+in both 2D and 3D. The code, models, and datasets are publicly available at
+http://tianrun-chen.github.io/xLSTM-UNet/
+
+
+
+
+
+
+ + ♻ ☆ Steerable Pyramid Transform Enables Robust Left Ventricle Quantification + + +
+ Predicting cardiac indices has long been a focal point in the medical imaging +community. While various deep learning models have demonstrated success in +quantifying cardiac indices, they remain susceptible to mild input +perturbations, e.g., spatial transformations, image distortions, and +adversarial attacks. This vulnerability undermines confidence in using +learning-based automated systems for diagnosing cardiovascular diseases. In +this work, we describe a simple yet effective method to learn robust models for +left ventricle (LV) quantification, encompassing cavity and myocardium areas, +directional dimensions, and regional wall thicknesses. Our success hinges on +employing the biologically inspired steerable pyramid transform (SPT) for fixed +front-end processing, which offers three main benefits. First, the basis +functions of SPT align with the anatomical structure of LV and the geometric +features of the measured indices. Second, SPT facilitates weight sharing across +different orientations as a form of parameter regularization and naturally +captures the scale variations of LV. Third, the residual highpass subband can +be conveniently discarded, promoting robust feature learning. Extensive +experiments on the Cardiac-Dig benchmark show that our SPT-augmented model not +only achieves reasonable prediction accuracy compared to state-of-the-art +methods, but also exhibits significantly improved robustness against input +perturbations. + +
+
+ comment: Code is available at https://github.com/yangyangyang127/RobustLV +
+
+
+
+
+ + ♻ ☆ Diffusion Forcing: Next-token Prediction Meets Full-Sequence Diffusion + + +
+ This paper presents Diffusion Forcing, a new training paradigm where a +diffusion model is trained to denoise a set of tokens with independent +per-token noise levels. We apply Diffusion Forcing to sequence generative +modeling by training a causal next-token prediction model to generate one or +several future tokens without fully diffusing past ones. Our approach is shown +to combine the strengths of next-token prediction models, such as +variable-length generation, with the strengths of full-sequence diffusion +models, such as the ability to guide sampling to desirable trajectories. Our +method offers a range of additional capabilities, such as (1) rolling-out +sequences of continuous tokens, such as video, with lengths past the training +horizon, where baselines diverge and (2) new sampling and guiding schemes that +uniquely profit from Diffusion Forcing's variable-horizon and causal +architecture, and which lead to marked performance gains in decision-making and +planning tasks. In addition to its empirical success, our method is proven to +optimize a variational lower bound on the likelihoods of all subsequences of +tokens drawn from the true joint distribution. Project website: +https://boyuan.space/diffusion-forcing/ + +
+
+ comment: Project website: https://boyuan.space/diffusion-forcing/ +
+
+
+
+
+ + ♻ ☆ ReXTime: A Benchmark Suite for Reasoning-Across-Time in Videos + + +
+ We introduce ReXTime, a benchmark designed to rigorously test AI models' +ability to perform temporal reasoning within video events. Specifically, +ReXTime focuses on reasoning across time, i.e. human-like understanding when +the question and its corresponding answer occur in different video segments. +This form of reasoning, requiring advanced understanding of cause-and-effect +relationships across video segments, poses significant challenges to even the +frontier multimodal large language models. To facilitate this evaluation, we +develop an automated pipeline for generating temporal reasoning question-answer +pairs, significantly reducing the need for labor-intensive manual annotations. +Our benchmark includes 921 carefully vetted validation samples and 2,143 test +samples, each manually curated for accuracy and relevance. Evaluation results +show that while frontier large language models outperform academic models, they +still lag behind human performance by a significant 14.3% accuracy gap. +Additionally, our pipeline creates a training dataset of 9,695 machine +generated samples without manual effort, which empirical studies suggest can +enhance the across-time reasoning via fine-tuning. + +
+
+ comment: Project page: https://rextime.github.io/ +
+
+
+
+
+ + ♻ ☆ SMERF: Streamable Memory Efficient Radiance Fields for Real-Time + Large-Scene Exploration + + +
+ Recent techniques for real-time view synthesis have rapidly advanced in +fidelity and speed, and modern methods are capable of rendering +near-photorealistic scenes at interactive frame rates. At the same time, a +tension has arisen between explicit scene representations amenable to +rasterization and neural fields built on ray marching, with state-of-the-art +instances of the latter surpassing the former in quality while being +prohibitively expensive for real-time applications. In this work, we introduce +SMERF, a view synthesis approach that achieves state-of-the-art accuracy among +real-time methods on large scenes with footprints up to 300 m$^2$ at a +volumetric resolution of 3.5 mm$^3$. Our method is built upon two primary +contributions: a hierarchical model partitioning scheme, which increases model +capacity while constraining compute and memory consumption, and a distillation +training strategy that simultaneously yields high fidelity and internal +consistency. Our approach enables full six degrees of freedom (6DOF) navigation +within a web browser and renders in real-time on commodity smartphones and +laptops. Extensive experiments show that our method exceeds the current +state-of-the-art in real-time novel view synthesis by 0.78 dB on standard +benchmarks and 1.78 dB on large scenes, renders frames three orders of +magnitude faster than state-of-the-art radiance field models, and achieves +real-time performance across a wide variety of commodity devices, including +smartphones. We encourage readers to explore these models interactively at our +project website: https://smerf-3d.github.io. + +
+
+ comment: Camera Ready. Project website: https://smerf-3d.github.io +
+
+
+
+
+ + ♻ ☆ SignAvatars: A Large-scale 3D Sign Language Holistic Motion Dataset and + Benchmark ECCV2024 + + +
+ We present SignAvatars, the first large-scale, multi-prompt 3D sign language (SL) motion dataset designed to bridge the communication gap for Deaf and hard-of-hearing individuals. While there has been an exponentially growing body of research on digital communication, the majority of existing communication technologies primarily cater to spoken or written languages, instead of SL, the essential communication method for Deaf and hard-of-hearing communities. Existing SL datasets, dictionaries, and sign language production (SLP) methods are typically limited to 2D, as annotating 3D models and avatars for SL is usually an entirely manual and labor-intensive process conducted by SL experts, often resulting in unnatural avatars. In response to these challenges, we compile and curate the SignAvatars dataset, which comprises 70,000 videos from 153 signers, totaling 8.34 million frames, covering both isolated signs and continuous, co-articulated signs, with multiple prompts including HamNoSys, spoken language, and words. To yield 3D holistic annotations, including meshes and biomechanically-valid poses of body, hands, and face, as well as 2D and 3D keypoints, we introduce an automated annotation pipeline operating on our large corpus of SL videos. SignAvatars facilitates various tasks such as 3D sign language recognition (SLR) and the novel 3D SL production (SLP) from diverse inputs like text scripts, individual words, and HamNoSys notation. Hence, to evaluate the potential of SignAvatars, we further propose a unified benchmark of 3D SL holistic motion production. We believe that this work is a significant step towards bringing the digital world to the Deaf and hard-of-hearing communities as well as people interacting with them. +
+
+ comment: ECCV2024 14 pages; Project page available at + https://signavatars.github.io/ +
+
+
+
+
+ + ♻ ☆ SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech + Recognition + + +
+ Audio-visual speech recognition (AVSR) is a multimodal extension of automatic speech recognition (ASR), using video as a complement to audio. In AVSR, considerable effort has been directed at datasets for facial features such as lip reading, but these often fall short in evaluating image comprehension capabilities in broader contexts. In this paper, we construct SlideAVSR, an AVSR dataset using scientific paper explanation videos. SlideAVSR provides a new benchmark where models transcribe speech utterances using the texts on the slides in the presentation recordings. As technical terms that are frequent in paper explanations are notoriously challenging to transcribe without reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR problems. As a simple yet effective baseline, we propose DocWhisper, an AVSR model that can refer to textual information from slides, and confirm its effectiveness on SlideAVSR. +
+
+ comment: 3rd Workshop on Advances in Language and Vision Research (ALVR 2024) +
+
+
+
+
+ + ♻ ☆ ColPali: Efficient Document Retrieval with Vision Language Models + + +
+ Documents are visually rich structures that convey information through text, +as well as tables, figures, page layouts, or fonts. While modern document +retrieval systems exhibit strong performance on query-to-text matching, they +struggle to exploit visual cues efficiently, hindering their performance on +practical document retrieval applications such as Retrieval Augmented +Generation. To benchmark current systems on visually rich document retrieval, +we introduce the Visual Document Retrieval Benchmark ViDoRe, composed of +various page-level retrieving tasks spanning multiple domains, languages, and +settings. The inherent shortcomings of modern systems motivate the introduction +of a new retrieval model architecture, ColPali, which leverages the document +understanding capabilities of recent Vision Language Models to produce +high-quality contextualized embeddings solely from images of document pages. +Combined with a late interaction matching mechanism, ColPali largely +outperforms modern document retrieval pipelines while being drastically faster +and end-to-end trainable. + +
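The late interaction matching mentioned above is, in ColBERT-style retrievers, a sum of per-query-token maximum similarities over the page's patch embeddings; a minimal sketch follows (shapes and normalization are assumptions, not the released ColPali code):

    import numpy as np

    def late_interaction_score(query_emb, page_emb):
        """ColBERT-style MaxSim scoring between a query and one page image.

        query_emb: (n_query_tokens, dim) embeddings of the text query.
        page_emb:  (n_patches, dim) embeddings of the document page image.
        """
        q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
        p = page_emb / np.linalg.norm(page_emb, axis=1, keepdims=True)
        sim = q @ p.T                    # (n_query_tokens, n_patches)
        return sim.max(axis=1).sum()     # best patch per query token, summed

Pages are then ranked by this score, which keeps indexing cheap (embed pages once) while the matching stays token-level.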
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ AI Age Discrepancy: A Novel Parameter for Frailty Assessment in Kidney + Tumor Patients + + +
+ Kidney cancer is a global health concern, and accurate assessment of patient +frailty is crucial for optimizing surgical outcomes. This paper introduces AI +Age Discrepancy, a novel metric derived from machine learning analysis of +preoperative abdominal CT scans, as a potential indicator of frailty and +postoperative risk in kidney cancer patients. This retrospective study of 599 +patients from the 2023 Kidney Tumor Segmentation (KiTS) challenge dataset found +that a higher AI Age Discrepancy is significantly associated with longer +hospital stays and lower overall survival rates, independent of established +factors. This suggests that AI Age Discrepancy may provide valuable insights +into patient frailty and could thus inform clinical decision-making in kidney +cancer treatment. + +
+
+ comment: 10 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Vision-LSTM: xLSTM as Generic Vision Backbone + + +
+ Transformers are widely used as generic backbones in computer vision, despite being initially introduced for natural language processing. Recently, the Long Short-Term Memory (LSTM) has been extended to a scalable and performant architecture - the xLSTM - which overcomes long-standing LSTM limitations via exponential gating and a parallelizable matrix memory structure. In this report, we introduce Vision-LSTM (ViL), an adaptation of the xLSTM building blocks to computer vision. ViL comprises a stack of xLSTM blocks where odd blocks process the sequence of patch tokens from top to bottom while even blocks go from bottom to top. Experiments show that ViL holds promise to be further deployed as a new generic backbone for computer vision architectures. +
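The alternating scan direction can be illustrated with a minimal sketch (an illustration only, not the authors' code; the xLSTM block itself is left as a placeholder and the exact ordering convention is an assumption):

    import torch
    import torch.nn as nn

    class AlternatingBlockStack(nn.Module):
        """Sketch of ViL's alternating scans: one block reads the patch tokens
        top-to-bottom, the next reads them bottom-to-top. `block_fn` stands in
        for an xLSTM block (not implemented here)."""

        def __init__(self, block_fn, depth):
            super().__init__()
            self.blocks = nn.ModuleList([block_fn() for _ in range(depth)])

        def forward(self, tokens):            # tokens: (batch, seq_len, dim)
            for i, block in enumerate(self.blocks):
                if i % 2 == 1:                # every second block scans bottom-to-top
                    tokens = torch.flip(tokens, dims=[1])
                tokens = block(tokens)
                if i % 2 == 1:                # restore the original token order (assumed)
                    tokens = torch.flip(tokens, dims=[1])
            return tokens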
+
+
+
+
+ + ♻ ☆ Universal Semi-Supervised Learning for Medical Image Classification + + +
+ Semi-supervised learning (SSL) has attracted much attention since it reduces the expensive cost of collecting adequate well-labeled training data, especially for deep learning methods. However, traditional SSL is built upon the assumption that labeled and unlabeled data come from the same distribution, \textit{e.g.,} the same classes and domains. In practical scenarios, however, unlabeled data may come from unseen classes or unseen domains, and it remains challenging for existing SSL methods to exploit them. Therefore, in this paper, we propose a unified framework to leverage these unseen unlabeled data for open-scenario semi-supervised medical image classification. We first design a novel scoring mechanism, called dual-path outliers estimation, to identify samples from unseen classes. Meanwhile, to extract unseen-domain samples, we then apply an effective variational autoencoder (VAE) pre-training. After that, we conduct domain adaptation to fully exploit the value of the detected unseen-domain samples to boost semi-supervised training. We evaluated our proposed framework on dermatology and ophthalmology tasks. Extensive experiments demonstrate that our model can achieve superior classification performance in various medical SSL scenarios. The code implementations are accessible at: https://github.com/PyJulie/USSL4MIC. +
+
+
+
+
+ + ♻ ☆ GraphBEV: Towards Robust BEV Feature Alignment for Multi-Modal 3D Object + Detection + + +
+ Integrating LiDAR and camera information into Bird's-Eye-View (BEV) +representation has emerged as a crucial aspect of 3D object detection in +autonomous driving. However, existing methods are susceptible to the inaccurate +calibration relationship between LiDAR and the camera sensor. Such inaccuracies +result in errors in depth estimation for the camera branch, ultimately causing +misalignment between LiDAR and camera BEV features. In this work, we propose a +robust fusion framework called Graph BEV. Addressing errors caused by +inaccurate point cloud projection, we introduce a Local Align module that +employs neighbor-aware depth features via Graph matching. Additionally, we +propose a Global Align module to rectify the misalignment between LiDAR and +camera BEV features. Our Graph BEV framework achieves state-of-the-art +performance, with an mAP of 70.1\%, surpassing BEV Fusion by 1.6\% on the +nuscenes validation set. Importantly, our Graph BEV outperforms BEV Fusion by +8.3\% under conditions with misalignment noise. + +
+
+
+
+
+ + ♻ ☆ Weighted Intersection over Union (wIoU) for Evaluating Image + Segmentation + + +
+ In recent years, many semantic segmentation methods have been proposed to predict the label of each pixel in a scene. In general, area prediction errors or boundary prediction errors are measured to compare methods. However, there is no intuitive evaluation metric that evaluates both aspects. In this work, we propose a new evaluation measure called weighted Intersection over Union (wIoU) for semantic segmentation. First, it builds a weight map generated from a boundary distance map, allowing weighted evaluation for each pixel based on a boundary importance factor. The proposed wIoU can evaluate both contour and region by setting a boundary importance factor. We validated the effectiveness of wIoU on a dataset of 33 scenes and demonstrated its flexibility. We expect that the proposed metric will enable more flexible and intuitive evaluation in the semantic segmentation field. +
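One way such a boundary-weighted evaluation could look is sketched below (a rough illustration only; the exact weight map construction and the role of the boundary importance factor alpha are assumptions, not the paper's formula):

    import numpy as np
    from scipy import ndimage

    def weighted_iou(pred, gt, alpha=0.5):
        """Sketch of a boundary-weighted IoU for one binary class.

        pred, gt: boolean masks of the same shape.
        alpha:    boundary importance factor; larger values emphasize pixels
                  near the ground-truth contour (assumed weighting).
        """
        boundary = gt ^ ndimage.binary_erosion(gt)         # ground-truth contour
        dist = ndimage.distance_transform_edt(~boundary)   # distance to contour
        weight = 1.0 + alpha * np.exp(-dist)               # assumed weight map
        inter = weight[pred & gt].sum()
        union = weight[pred | gt].sum()
        return inter / union

Setting alpha to zero recovers the ordinary IoU, while larger alpha penalizes contour errors more heavily, which is the "both contour and region" behaviour described above.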
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A reproducible 3D convolutional neural network with dual attention + module (3D-DAM) for Alzheimer's disease classification + + +
+ Alzheimer's disease is one of the most common types of neurodegenerative +disease, characterized by the accumulation of amyloid-beta plaque and tau +tangles. Recently, deep learning approaches have shown promise in Alzheimer's +disease diagnosis. In this study, we propose a reproducible model that utilizes +a 3D convolutional neural network with a dual attention module for Alzheimer's +disease classification. We trained the model in the ADNI database and verified +the generalizability of our method in two independent datasets (AIBL and +OASIS1). Our method achieved state-of-the-art classification performance, with +an accuracy of 91.94% for MCI progression classification and 96.30% for +Alzheimer's disease classification on the ADNI dataset. Furthermore, the model +demonstrated good generalizability, achieving an accuracy of 86.37% on the AIBL +dataset and 83.42% on the OASIS1 dataset. These results indicate that our +proposed approach has competitive performance and generalizability when +compared to recent studies in the field. + +
+
+
+
+
+ + ♻ ☆ Deep Imbalanced Regression to Estimate Vascular Age from PPG Data: a + Novel Digital Biomarker for Cardiovascular Health + + +
+ Photoplethysmography (PPG) is emerging as a crucial tool for monitoring human +hemodynamics, with recent studies highlighting its potential in assessing +vascular aging through deep learning. However, real-world age distributions are +often imbalanced, posing significant challenges for deep learning models. In +this paper, we introduce a novel, simple, and effective loss function named the +Dist Loss to address deep imbalanced regression tasks. We trained a +one-dimensional convolutional neural network (Net1D) incorporating the Dist +Loss on the extensive UK Biobank dataset (n=502,389) to estimate vascular age +from PPG signals and validate its efficacy in characterizing cardiovascular +health. The model's performance was validated on a 40% held-out test set, +achieving state-of-the-art results, especially in regions with small sample +sizes. Furthermore, we divided the population into three subgroups based on the +difference between predicted vascular age and chronological age: less than -10 +years, between -10 and 10 years, and greater than 10 years. We analyzed the +relationship between predicted vascular age and several cardiovascular events +over a follow-up period of up to 10 years, including death, coronary heart +disease, and heart failure. Our results indicate that the predicted vascular +age has significant potential to reflect an individual's cardiovascular health +status. Our code will be available at https://github.com/Ngk03/AI-vascular-age. + +
+
+
+
+
+ + ♻ ☆ Reconstruction of Cardiac Cine MRI Using Motion-Guided Deformable + Alignment and Multi-Resolution Fusion + + +
+ Cardiac cine magnetic resonance imaging (MRI) is one of the important means of assessing cardiac function and vascular abnormalities. Mitigating artifacts arising during image reconstruction and accelerating cardiac cine MRI acquisition to obtain high-quality images is important. A novel end-to-end deep learning network is developed to improve cardiac cine MRI reconstruction. First, a U-Net is adopted to obtain the initial reconstructed images in k-space. Then, to remove motion artifacts, the motion-guided deformable alignment (MGDA) module with second-order bidirectional propagation is introduced to align adjacent cine MRI frames by maximizing spatial-temporal information. Finally, the multi-resolution fusion (MRF) module is designed to correct the blur and artifacts generated by the alignment operation and obtain the final high-quality reconstructed cardiac images. At an 8$\times$ acceleration rate, the numerical measurements on the ACDC dataset are structural similarity index (SSIM) of 78.40%$\pm$.57%, peak signal-to-noise ratio (PSNR) of 30.46$\pm$1.22dB, and normalized mean squared error (NMSE) of 0.0468$\pm$0.0075. On the ACMRI dataset, the results are SSIM of 87.65%$\pm$4.20%, PSNR of 30.04$\pm$1.18dB, and NMSE of 0.0473$\pm$0.0072. The proposed method exhibits high-quality results with richer details and fewer artifacts for cardiac cine MRI reconstruction at different acceleration rates. +
+
+ comment: 28 pages, 5 tables, 11 figures +
+
+
+
+
+ + ♻ ☆ Instant Photorealistic Neural Radiance Fields Stylization ICASSP2024 + + +
+ We present Instant Neural Radiance Fields Stylization, a novel approach for multi-view image stylization of 3D scenes. Our approach models a neural radiance field based on neural graphics primitives, which use a hash table-based position encoder for position embedding. We split the position encoder into two parts, the content and style sub-branches, and train the network for normal novel view image synthesis with the content and style targets. In the inference stage, we apply AdaIN to the output features of the position encoder, with content and style voxel grid features as reference. With the adjusted features, the stylization of novel view images can be obtained. Our method extends the style target from style images to image sets of scenes and does not require additional network training for stylization. Given a set of images of 3D scenes and a style target (a style image or another set of 3D scenes), our method can generate stylized novel views with a consistent appearance at various view angles in less than 10 minutes on modern GPU hardware. Extensive experimental results demonstrate the validity and superiority of our method. +
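For reference, AdaIN itself is the standard feature-statistic transfer shown below; how it is wired into the hash-grid position encoder and the voxel grid features is specific to this paper and not shown here:

    import torch

    def adain(content_feat, style_feat, eps=1e-5):
        """Adaptive Instance Normalization: re-scale the content features to
        match the channel-wise mean/std of the style features.

        content_feat, style_feat: (batch, channels, n) feature tensors.
        """
        c_mean = content_feat.mean(dim=-1, keepdim=True)
        c_std = content_feat.std(dim=-1, keepdim=True) + eps
        s_mean = style_feat.mean(dim=-1, keepdim=True)
        s_std = style_feat.std(dim=-1, keepdim=True) + eps
        return s_std * (content_feat - c_mean) / c_std + s_mean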
+
+ comment: Accepted by ICASSP2024. Code: + https://github.com/lishaoxu1994/Instant-NeRF-Stylization +
+
+
+
+
+ + ♻ ☆ Masked Attribute Description Embedding for Cloth-Changing Person + Re-identification + + +
+ Cloth-changing person re-identification (CC-ReID) aims to match persons who change clothes over long periods. The key challenge in CC-ReID is to extract clothing-independent features, such as face, hairstyle, body shape, and gait. Current research mainly focuses on modeling body shape using multi-modal biological features (such as silhouettes and sketches). However, it does not fully leverage the personal description information hidden in the original RGB image. Considering that certain attribute descriptions remain unchanged after a change of clothes, we propose a Masked Attribute Description Embedding (MADE) method that unifies personal visual appearance and attribute description for CC-ReID. Specifically, variable clothing-sensitive information, such as color and type, is challenging to model effectively. To address this, we mask the clothing and color information in the personal attribute description extracted through an attribute detection model. The masked attribute description is then connected and embedded into Transformer blocks at various levels, fusing it with the low-level to high-level features of the image. This approach compels the model to discard clothing information. Experiments are conducted on several CC-ReID benchmarks, including PRCC, LTCC, Celeb-reID-light, and LaST. Results demonstrate that MADE effectively utilizes attribute description, enhancing cloth-changing person re-identification performance, and compares favorably with state-of-the-art methods. The code is available at https://github.com/moon-wh/MADE. +
+
+
+
+
+ + ♻ ☆ GLAD: Towards Better Reconstruction with Global and Local Adaptive + Diffusion Models for Unsupervised Anomaly Detection ECCV 2024 + + +
+ Diffusion models have shown superior performance on unsupervised anomaly +detection tasks. Since trained with normal data only, diffusion models tend to +reconstruct normal counterparts of test images with certain noises added. +However, these methods treat all potential anomalies equally, which may cause +two main problems. From the global perspective, the difficulty of +reconstructing images with different anomalies is uneven. Therefore, instead of +utilizing the same setting for all samples, we propose to predict a particular +denoising step for each sample by evaluating the difference between image +contents and the priors extracted from diffusion models. From the local +perspective, reconstructing abnormal regions differs from normal areas even in +the same image. Theoretically, the diffusion model predicts a noise for each +step, typically following a standard Gaussian distribution. However, due to the +difference between the anomaly and its potential normal counterpart, the +predicted noise in abnormal regions will inevitably deviate from the standard +Gaussian distribution. To this end, we propose introducing synthetic abnormal +samples in training to encourage the diffusion models to break through the +limitation of standard Gaussian distribution, and a spatial-adaptive feature +fusion scheme is utilized during inference. With the above modifications, we +propose a global and local adaptive diffusion model (abbreviated to GLAD) for +unsupervised anomaly detection, which introduces appealing flexibility and +achieves anomaly-free reconstruction while retaining as much normal information +as possible. Extensive experiments are conducted on three commonly used anomaly +detection datasets (MVTec-AD, MPDD, and VisA) and a printed circuit board +dataset (PCB-Bank) we integrated, showing the effectiveness of the proposed +method. + +
+
+ comment: Accepted by ECCV 2024, code and models: + https://github.com/hyao1/GLAD. Due to the limitation "The abstract field + cannot be longer than 1,920 characters", the abstract here is shorter than + that in the PDF file +
+
+
+
+
+ + ♻ ☆ Multi-level Reliable Guidance for Unpaired Multi-view Clustering + + +
+ In this paper, we address the challenging problem of unpaired multi-view +clustering (UMC), aiming to perform effective joint clustering using unpaired +observed samples across multiple views. Commonly, traditional incomplete +multi-view clustering (IMC) methods often depend on paired samples to capture +complementary information between views. However, the strategy becomes +impractical in UMC due to the absence of paired samples. Although some +researchers have attempted to tackle the issue by preserving consistent cluster +structures across views, they frequently neglect the confidence of these +cluster structures, especially for boundary samples and uncertain cluster +structures during the initial training. Therefore, we propose a method called +Multi-level Reliable Guidance for UMC (MRG-UMC), which leverages multi-level +clustering to aid in learning a trustworthy cluster structure across +inner-view, cross-view, and common-view, respectively. Specifically, within +each view, multi-level clustering fosters a trustworthy cluster structure +across different levels and reduces clustering error. In cross-view learning, +reliable view guidance enhances the confidence of the cluster structures in +other views. Similarly, within the multi-level framework, the incorporation of +a common view aids in aligning different views, thereby reducing the clustering +error and uncertainty of cluster structure. Finally, as evidenced by extensive +experiments, our method for UMC demonstrates significant efficiency +improvements compared to 20 state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ DiffStyler: Diffusion-based Localized Image Style Transfer + + +
+ Image style transfer aims to imbue digital imagery with the distinctive attributes of style targets, such as colors, brushstrokes, and shapes, while preserving the semantic integrity of the content. Despite the advancements in arbitrary style transfer methods, a prevalent challenge remains the delicate equilibrium between content semantics and style attributes. Recent developments in large-scale text-to-image diffusion models have heralded unprecedented synthesis capabilities, albeit at the expense of relying on extensive and often imprecise textual descriptions to delineate artistic styles. Addressing these limitations, this paper introduces DiffStyler, a novel approach that facilitates efficient and precise arbitrary image style transfer. At the core of DiffStyler lies the utilization of a text-to-image Stable Diffusion model-based LoRA to encapsulate the essence of style targets. This approach, coupled with strategic cross-LoRA feature and attention injection, guides the style transfer process. The foundation of our methodology is rooted in the observation that LoRA maintains the spatial feature consistency of UNet, a discovery that further inspired the development of a mask-wise style transfer technique. This technique employs masks extracted through a pre-trained FastSAM model, utilizing mask prompts to facilitate feature fusion during the denoising process, thereby enabling localized style transfer that preserves the original image's unaffected regions. Moreover, our approach accommodates multiple style targets through the use of corresponding masks. Through extensive experimentation, we demonstrate that DiffStyler surpasses previous methods in achieving a more harmonious balance between content preservation and style integration. +
+
+ comment: https://github.com/lishaoxu1994/DiffStyler +
+
+
+
+
+ + ♻ ☆ Towards Robust Cardiac Segmentation using Graph Convolutional Networks + + +
+ Fully automatic cardiac segmentation can be a fast and reproducible method to extract clinical measurements from an echocardiography examination. The U-Net architecture is the current state-of-the-art deep learning architecture for medical segmentation and can segment cardiac structures in real-time with average errors comparable to inter-observer variability. However, this architecture still generates large outliers that are often anatomically incorrect. This work uses the concept of graph convolutional neural networks that predict the contour points of the structures of interest instead of labeling each pixel. We propose a graph architecture that uses two convolutional rings based on cardiac anatomy and show that this eliminates anatomically incorrect multi-structure segmentations on the publicly available CAMUS dataset. Additionally, this work contributes an ablation study on the graph convolutional architecture and an evaluation of clinical measurements on the clinical HUNT4 dataset. Finally, we propose to use the inter-model agreement of the U-Net and the graph network as a predictor of both the input and segmentation quality. We show that this predictor can detect out-of-distribution and unsuitable input images in real-time. Source code is available online: https://github.com/gillesvntnu/GCN_multistructure +
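The inter-model agreement idea can be sketched very simply (an illustration, assuming a Dice-style overlap as the agreement measure and an arbitrary threshold; the paper may define both differently):

    import numpy as np

    def inter_model_agreement(unet_mask, gcn_mask):
        """Dice overlap between the U-Net and graph-network segmentations,
        used as a proxy for input/segmentation quality: low agreement flags
        out-of-distribution or unsuitable frames."""
        inter = np.logical_and(unet_mask, gcn_mask).sum()
        total = unet_mask.sum() + gcn_mask.sum()
        return 2.0 * inter / max(total, 1)

    # Example use: flag a frame when the two models disagree too much.
    # if inter_model_agreement(unet_mask, gcn_mask) < 0.8:   # threshold assumed
    #     flag_frame_for_review()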
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Tracking Object Positions in Reinforcement Learning: A Metric for + Keypoint Detection (extended version) + + +
+ Reinforcement learning (RL) for robot control typically requires a detailed +representation of the environment state, including information about +task-relevant objects not directly measurable. Keypoint detectors, such as +spatial autoencoders (SAEs), are a common approach to extracting a +low-dimensional representation from high-dimensional image data. SAEs aim at +spatial features such as object positions, which are often useful +representations in robotic RL. However, whether an SAE is actually able to +track objects in the scene and thus yields a spatial state representation well +suited for RL tasks has rarely been examined due to a lack of established +metrics. In this paper, we propose to assess the performance of an SAE instance +by measuring how well keypoints track ground truth objects in images. We +present a computationally lightweight metric and use it to evaluate common +baseline SAE architectures on image data from a simulated robot task. We find +that common SAEs differ substantially in their spatial extraction capability. +Furthermore, we validate that SAEs that perform well in our metric achieve +superior performance when used in downstream RL. Thus, our metric is an +effective and lightweight indicator of RL performance before executing +expensive RL training. Building on these insights, we identify three key +modifications of SAE architectures to improve tracking performance. + +
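A lightweight metric of this kind could look like the following sketch (a rough illustration under assumed inputs; the paper's exact normalization and aggregation may differ):

    import numpy as np

    def tracking_error(keypoints, object_positions):
        """Sketch of a keypoint-tracking metric: for every frame and every
        ground-truth object, take the distance to the closest keypoint, then
        average over objects and frames. Lower is better.

        keypoints:        (frames, n_keypoints, 2) predicted image coordinates.
        object_positions: (frames, n_objects, 2) ground-truth object coordinates.
        """
        dists = np.linalg.norm(
            object_positions[:, :, None, :] - keypoints[:, None, :, :], axis=-1
        )                                     # (frames, n_objects, n_keypoints)
        return dists.min(axis=2).mean()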
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Fixed-length Dense Descriptor for Efficient Fingerprint Matching + + +
+ In fingerprint matching, fixed-length descriptors generally offer greater +efficiency compared to minutiae set, but the recognition accuracy is not as +good as that of the latter. Although much progress has been made in deep +learning based fixed-length descriptors recently, they often fall short when +dealing with incomplete or partial fingerprints, diverse fingerprint poses, and +significant background noise. In this paper, we propose a three-dimensional +representation called Fixed-length Dense Descriptor (FDD) for efficient +fingerprint matching. FDD features great spatial properties, enabling it to +capture the spatial relationships of the original fingerprints, thereby +enhancing interpretability and robustness. Our experiments on various +fingerprint datasets reveal that FDD outperforms other fixed-length +descriptors, especially in matching fingerprints of different areas, +cross-modal fingerprint matching, and fingerprint matching with background +noise. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Preserving Full Degradation Details for Blind Image Super-Resolution + + +
+ The performance of image super-resolution relies heavily on the accuracy of degradation information, especially under blind settings. Due to the absence of true degradation models in real-world scenarios, previous methods learn distinct representations by distinguishing different degradations in a batch. However, the most significant degradation differences may provide shortcuts for the learning of representations, such that subtle differences may be discarded. In this paper, we propose an alternative that learns degradation representations by reproducing degraded low-resolution (LR) images. By guiding the degrader to reconstruct input LR images, full degradation information can be encoded into the representations. In addition, we develop an energy distance loss to facilitate the learning of the degradation representations by introducing a bounded constraint. Experiments show that our representations can extract accurate and highly robust degradation information. Moreover, evaluations on both synthetic and real images demonstrate that our ReDSR achieves state-of-the-art performance on blind SR tasks. +
+
+ comment: 18 pages, 11 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Learning Neural Volumetric Pose Features for Camera Localization ECCV 2024 + + +
+ We introduce a novel neural volumetric pose feature, termed PoseMap, designed +to enhance camera localization by encapsulating the information between images +and the associated camera poses. Our framework leverages an Absolute Pose +Regression (APR) architecture, together with an augmented NeRF module. This +integration not only facilitates the generation of novel views to enrich the +training dataset but also enables the learning of effective pose features. +Additionally, we extend our architecture for self-supervised online alignment, +allowing our method to be used and fine-tuned for unlabelled images within a +unified framework. Experiments demonstrate that our method achieves 14.28% and +20.51% performance gain on average in indoor and outdoor benchmark scenes, +outperforming existing APR methods with state-of-the-art accuracy. + +
+
comment: 14 pages, 9 figures. Accepted at ECCV 2024. Project page coming soon +
+
+
+
+
+ + ♻ ☆ 4D-Rotor Gaussian Splatting: Towards Efficient Novel View Synthesis for + Dynamic Scenes SIGGRAPH + + +
+ We consider the problem of novel-view synthesis (NVS) for dynamic scenes. +Recent neural approaches have accomplished exceptional NVS results for static +3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior +efforts often encode dynamics by learning a canonical space plus implicit or +explicit deformation fields, which struggle in challenging scenarios like +sudden movements or generating high-fidelity renderings. In this paper, we +introduce 4D Gaussian Splatting (4DRotorGS), a novel method that represents +dynamic scenes with anisotropic 4D XYZT Gaussians, inspired by the success of +3D Gaussian Splatting in static scenes. We model dynamics at each timestamp by +temporally slicing the 4D Gaussians, which naturally compose dynamic 3D +Gaussians and can be seamlessly projected into images. As an explicit +spatial-temporal representation, 4DRotorGS demonstrates powerful capabilities +for modeling complicated dynamics and fine details--especially for scenes with +abrupt motions. We further implement our temporal slicing and splatting +techniques in a highly optimized CUDA acceleration framework, achieving +real-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and +583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions +showcase the superior efficiency and effectiveness of 4DRotorGS, which +consistently outperforms existing methods both quantitatively and +qualitatively. + +
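As a rough illustration of temporal slicing, a 4D Gaussian can be conditioned on the query time $t$ using standard multivariate Gaussian identities (a generic sketch only; the paper's rotor-based parameterization of the 4D covariance differs in detail). Writing $\mu = (\mu_{xyz}, \mu_t)$ and $\Sigma = \begin{pmatrix} \Sigma_{xyz} & \Sigma_{xyz,t} \\ \Sigma_{t,xyz} & \sigma_t^2 \end{pmatrix}$, the sliced 3D Gaussian at time $t$ has
\[ \mu_{xyz\mid t} = \mu_{xyz} + \Sigma_{xyz,t}\,\sigma_t^{-2}\,(t - \mu_t), \qquad \Sigma_{xyz\mid t} = \Sigma_{xyz} - \Sigma_{xyz,t}\,\sigma_t^{-2}\,\Sigma_{t,xyz}, \]
and its contribution can be weighted by the temporal marginal $\exp\!\big(-(t-\mu_t)^2 / (2\sigma_t^2)\big)$, so that Gaussians naturally fade in and out over time before being splatted as ordinary 3D Gaussians.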
+
+ comment: Proc. SIGGRAPH, 2024 +
+
+
+
+
+ + ♻ ☆ PIN-SLAM: LiDAR SLAM Using a Point-Based Implicit Neural Representation + for Achieving Global Map Consistency + + +
+ Accurate and robust localization and mapping are essential components for +most autonomous robots. In this paper, we propose a SLAM system for building +globally consistent maps, called PIN-SLAM, that is based on an elastic and +compact point-based implicit neural map representation. Taking range +measurements as input, our approach alternates between incremental learning of +the local implicit signed distance field and the pose estimation given the +current local map using a correspondence-free, point-to-implicit model +registration. Our implicit map is based on sparse optimizable neural points, +which are inherently elastic and deformable with the global pose adjustment +when closing a loop. Loops are also detected using the neural point features. +Extensive experiments validate that PIN-SLAM is robust to various environments +and versatile to different range sensors such as LiDAR and RGB-D cameras. +PIN-SLAM achieves pose estimation accuracy better or on par with the +state-of-the-art LiDAR odometry or SLAM systems and outperforms the recent +neural implicit SLAM approaches while maintaining a more consistent, and highly +compact implicit map that can be reconstructed as accurate and complete meshes. +Finally, thanks to the voxel hashing for efficient neural points indexing and +the fast implicit map-based registration without closest point association, +PIN-SLAM can run at the sensor frame rate on a moderate GPU. Codes will be +available at: https://github.com/PRBonn/PIN_SLAM. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Empowering 3D Visual Grounding with Reasoning Capabilities ECCV 2024 + + +
+ Although great progress has been made in 3D visual grounding, current models still rely on explicit textual descriptions for grounding and lack the ability to reason about human intentions from implicit instructions. We propose a new task called 3D reasoning grounding and introduce a new benchmark, ScanReason, which provides over 10K question-answer-location pairs from five reasoning types that require the synergy of reasoning and grounding. We further design our approach, ReGround3D, composed of a visual-centric reasoning module empowered by a Multi-modal Large Language Model (MLLM) and a 3D grounding module to obtain accurate object locations by looking back to the enhanced geometry and fine-grained details from the 3D scenes. A chain-of-grounding mechanism is proposed to further boost the performance with interleaved reasoning and grounding steps during inference. Extensive experiments on the proposed benchmark validate the effectiveness of our proposed approach. +
+
+ comment: Accepted by ECCV 2024. A comprehensive and hierarchical 3D reasoning + grounding benchmark in the era of foundation models. Project page: + https://zcmax.github.io/projects/ScanReason +
+
+
+
+
+ + ♻ ☆ Natural Language Can Help Bridge the Sim2Real Gap + + +
+ The main challenge in learning image-conditioned robotic policies is +acquiring a visual representation conducive to low-level control. Due to the +high dimensionality of the image space, learning a good visual representation +requires a considerable amount of visual data. However, when learning in the +real world, data is expensive. Sim2Real is a promising paradigm for overcoming +data scarcity in the real-world target domain by using a simulator to collect +large amounts of cheap data closely related to the target task. However, it is +difficult to transfer an image-conditioned policy from sim to real when the +domains are very visually dissimilar. To bridge the sim2real visual gap, we +propose using natural language descriptions of images as a unifying signal +across domains that captures the underlying task-relevant semantics. Our key +insight is that if two image observations from different domains are labeled +with similar language, the policy should predict similar action distributions +for both images. We demonstrate that training the image encoder to predict the +language description or the distance between descriptions of a sim or real +image serves as a useful, data-efficient pretraining step that helps learn a +domain-invariant image representation. We can then use this image encoder as +the backbone of an IL policy trained simultaneously on a large amount of +simulated and a handful of real demonstrations. Our approach outperforms widely +used prior sim2real methods and strong vision-language pretraining baselines +like CLIP and R3M by 25 to 40%. See additional videos and materials at +https://robin-lab.cs.utexas.edu/lang4sim2real/. + +
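One way the description-distance pretraining could be instantiated is sketched below (an assumption-laden illustration, not the authors' code; the encoders, distance, and loss choices are placeholders):

    import torch
    import torch.nn.functional as F

    def lang_distance_pretrain_loss(img_encoder, lang_encoder,
                                    img_a, img_b, desc_a, desc_b):
        """Sketch of language-guided pretraining: train the image encoder so
        that the distance between two image embeddings (e.g. one sim, one real)
        matches the distance between the embeddings of their language
        descriptions."""
        za, zb = img_encoder(img_a), img_encoder(img_b)
        with torch.no_grad():                    # frozen language model assumed
            ta, tb = lang_encoder(desc_a), lang_encoder(desc_b)
        img_dist = 1 - F.cosine_similarity(za, zb, dim=-1)
        lang_dist = 1 - F.cosine_similarity(ta, tb, dim=-1)
        return F.mse_loss(img_dist, lang_dist)

Because similarly described sim and real observations are pulled together, the resulting encoder gives a more domain-invariant backbone for the downstream imitation-learning policy.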
+
+ comment: To appear in RSS 2024. Project website at + https://robin-lab.cs.utexas.edu/lang4sim2real/ +
+
+
+
+
+ + ♻ ☆ Estimating Noisy Class Posterior with Part-level Labels for Noisy Label + Learning CVPR 2024 + + +
+ In noisy label learning, estimating noisy class posteriors plays a +fundamental role for developing consistent classifiers, as it forms the basis +for estimating clean class posteriors and the transition matrix. Existing +methods typically learn noisy class posteriors by training a classification +model with noisy labels. However, when labels are incorrect, these models may +be misled to overemphasize the feature parts that do not reflect the instance +characteristics, resulting in significant errors in estimating noisy class +posteriors. To address this issue, this paper proposes to augment the +supervised information with part-level labels, encouraging the model to focus +on and integrate richer information from various parts. Specifically, our +method first partitions features into distinct parts by cropping instances, +yielding part-level labels associated with these various parts. Subsequently, +we introduce a novel single-to-multiple transition matrix to model the +relationship between the noisy and part-level labels, which incorporates +part-level labels into a classifier-consistent framework. Utilizing this +framework with part-level labels, we can learn the noisy class posteriors more +precisely by guiding the model to integrate information from various parts, +ultimately improving the classification performance. Our method is +theoretically sound, while experiments show that it is empirically effective in +synthetic and real-world noisy benchmarks. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ VDD: Varied Drone Dataset for Semantic Segmentation + + +
+ Semantic segmentation of drone images is critical for various aerial vision tasks as it provides essential semantic details to understand scenes on the ground. Ensuring high accuracy of semantic segmentation models for drones requires access to diverse, large-scale, and high-resolution datasets, which are often scarce in the field of aerial image processing. While existing datasets typically focus on urban scenes and are relatively small, our Varied Drone Dataset (VDD) addresses these limitations by offering a large-scale, densely labeled collection of 400 high-resolution images spanning 7 classes. This dataset features various scenes in urban, industrial, rural, and natural areas, captured from different camera angles and under diverse lighting conditions. We also make new annotations to UDD and UAVid, integrating them under VDD annotation standards, to create the Integrated Drone Dataset (IDD). We train seven state-of-the-art models on drone datasets as baselines. It's expected that our dataset will generate considerable interest in drone image segmentation and serve as a foundation for other drone vision tasks. Datasets are publicly available at \href{https://github.com/RussRobin/VDD}{our website}. +
+
+
+
+
+ + ♻ ☆ Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained + Models for Medical Images + + +
+ Foundation models pre-trained on large-scale data have been widely shown to achieve success in various natural imaging downstream tasks. Parameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models to new domains by updating only a small portion of parameters in order to reduce computational overhead. However, the effectiveness of these PEFT methods, especially in cross-domain few-shot scenarios, e.g., medical image analysis, has not been fully explored. In this work, we facilitate the study of the performance of PEFT when adapting foundation models to medical image classification tasks. Furthermore, to alleviate the limitations of mainstream prompt tuning methods in how prompts are introduced and in their approximation capabilities on Transformer architectures, we propose the Embedded Prompt Tuning (EPT) method, which embeds prompt tokens into the expanded channels. We also find that there are anomalies in the feature space distribution of foundation models during the pre-training process, and that prompt tuning can help mitigate this negative impact. To explain this phenomenon, we introduce a novel perspective to understand prompt tuning: prompt tuning is a distribution calibrator. We support this view by analyzing the patch-wise scaling and feature separation operations contained in EPT. Our experiments show that EPT outperforms several state-of-the-art fine-tuning methods by a significant margin on few-shot medical image classification tasks, and completes the fine-tuning process within a highly competitive time, indicating that EPT is an effective PEFT method. The source code is available at github.com/zuwenqiang/EPT. +
+
+ comment: 16 pages, 7 figures. arXiv admin note: text overlap with + arXiv:2306.09579, arXiv:2203.12119 by other authors +
+
+
+
+
+ + ♻ ☆ C3L: Content Correlated Vision-Language Instruction Tuning Data + Generation via Contrastive Learning IJCAI-24 + + +
+ Vision-Language Instruction Tuning (VLIT) is a critical training phase for +Large Vision-Language Models (LVLMs). With the improving capabilities of +open-source LVLMs, researchers have increasingly turned to generate VLIT data +by using open-source LVLMs and achieved significant progress. However, such +data generation approaches are bottlenecked by the following challenges: 1) +Since multi-modal models tend to be influenced by prior language knowledge, +directly using LVLMs to generate VLIT data would inevitably lead to low content +relevance between generated data and images. 2) To improve the ability of the +models to generate VLIT data, previous methods have incorporated an additional +training phase to boost the generative capacity. This process hurts the +generalization of the models to unseen inputs (i.e., "exposure bias" problem). +In this paper, we propose a new Content Correlated VLIT data generation via +Contrastive Learning (C3L). Specifically, we design a new content relevance +module which enhances the content relevance between VLIT data and images by +computing Image Instruction Correspondence Scores S(I2C). Moreover, a +contrastive learning module is introduced to further boost the VLIT data +generation capability of the LVLMs. A large number of automatic measures on +four benchmarks show the effectiveness of our method. + +
+
+ comment: Accepted by IJCAI-24 +
+
+
+
+
+ + ♻ ☆ Coding for Intelligence from the Perspective of Category + + +
+ Coding, which targets compressing and reconstructing data, and intelligence, often regarded at an abstract computational level as being centered around model learning and prediction, have recently interwoven, giving rise to a series of significant advances. Recent trends demonstrate the potential homogeneity of these two fields, especially when deep-learning models aid both for better probability modeling. To better understand and describe them from a unified perspective, inspired by basic, generally recognized principles in cognitive psychology, we formulate a novel problem of Coding for Intelligence (CfI) from the category theory view. Based on three axioms (the existence of ideal coding, the existence of practical coding, and compactness promoting generalization), we derive a general framework to understand existing methodologies, namely that coding captures the intrinsic relationships of objects as much as possible while ignoring information irrelevant to downstream tasks. This framework helps identify the challenges and essential elements in solving the specific derived Minimal Description Length (MDL) optimization problem from a broader range, providing opportunities to build a more intelligent system for handling multiple tasks/applications with coding ideas/tools. Centering on those elements, we systematically review recent progress towards optimizing the MDL problem in more comprehensive ways from data, model, and task perspectives, and reveal their impacts on the potential CfI technical routes. After that, we also present new technical paths to fulfill CfI and provide potential solutions with preliminary experimental evidence. Finally, further directions and remaining issues are discussed as well. The discussion shows that our theory can reveal many phenomena and insights about large foundation models, which mutually corroborate recent practices in feature learning. +
+
+
+
+
+ + ♻ ☆ Fine-grained Prompt Tuning: A Parameter and Memory Efficient Transfer + Learning Method for High-resolution Medical Image Classification MICCAI 2024 + + +
+ Parameter-efficient transfer learning (PETL) is proposed as a cost-effective +way to transfer pre-trained models to downstream tasks, avoiding the high cost +of updating entire large-scale pre-trained models (LPMs). In this work, we +present Fine-grained Prompt Tuning (FPT), a novel PETL method for medical image +classification. FPT significantly reduces memory consumption compared to other +PETL methods, especially in high-resolution input contexts. To achieve this, we +first freeze the weights of the LPM and construct a learnable lightweight side +network. The frozen LPM takes high-resolution images as input to extract +fine-grained features, while the side network is fed low-resolution images to +reduce memory usage. To allow the side network to access pre-trained knowledge, +we introduce fine-grained prompts that summarize information from the LPM +through a fusion module. Important tokens selection and preloading techniques +are employed to further reduce training cost and memory requirements. We +evaluate FPT on four medical datasets with varying sizes, modalities, and +complexities. Experimental results demonstrate that FPT achieves comparable +performance to fine-tuning the entire LPM while using only 1.8% of the +learnable parameters and 13% of the memory costs of an encoder ViT-B model with +a 512 x 512 input resolution. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Rectified Iterative Disparity for Stereo Matching + + +
+ Both uncertainty-assisted and iteration-based methods have achieved great success in stereo matching. However, existing uncertainty estimation methods take a single image and the corresponding disparity as input, which imposes higher demands on the estimation network. In this paper, we propose Cost volume-based disparity Uncertainty Estimation (UEC). Based on the rich similarity information in the cost volume coming from the image pairs, the proposed UEC can achieve competitive performance with low computational cost. We then propose two methods of uncertainty-assisted disparity estimation, Uncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity update Conditioning (UDC). These two methods optimise the disparity update process of the iteration-based approach without adding extra parameters. In addition, we propose a Disparity Rectification loss that significantly improves the accuracy of small disparity updates. We present a high-performance stereo architecture, DR-Stereo, which is a combination of the proposed methods. Experimental results on SceneFlow, KITTI, Middlebury 2014, and ETH3D show that DR-Stereo achieves very competitive disparity estimation performance. +
+
+
+
+
+ + ♻ ☆ Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural + Network Performance + + +
+ We propose the Swish-T family, an enhancement of the existing non-monotonic +activation function Swish. Swish-T is defined by adding a Tanh bias to the +original Swish function. This modification creates a family of Swish-T +variants, each designed to excel in different tasks, showcasing specific +advantages depending on the application context. The Tanh bias allows for +broader acceptance of negative values during initial training stages, offering +a smoother non-monotonic curve than the original Swish. We ultimately propose +the Swish-T$_{\textbf{C}}$ function, while Swish-T and Swish-T$_{\textbf{B}}$, +byproducts of Swish-T$_{\textbf{C}}$, also demonstrate satisfactory +performance. Furthermore, our ablation study shows that using +Swish-T$_{\textbf{C}}$ as a non-parametric function can still achieve high +performance. The superiority of the Swish-T family has been empirically +demonstrated across various models and benchmark datasets, including MNIST, +Fashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at +"https://github.com/ictseoyoungmin/Swish-T-pytorch". + +
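Based only on the description above, a hedged sketch of "Swish plus a Tanh bias" is shown below; the exact parameterization of the Swish-T, Swish-T_B, and Swish-T_C variants is not specified here and is an assumption:

    import torch

    def swish(x, beta=1.0):
        """Standard Swish: x * sigmoid(beta * x)."""
        return x * torch.sigmoid(beta * x)

    def swish_t(x, alpha=0.5, beta=1.0):
        """Sketch of Swish with an added Tanh bias. The alpha/beta
        parameterization is an assumption, not the paper's definition."""
        return swish(x, beta) + alpha * torch.tanh(x)

The tanh term shifts the curve so that moderately negative inputs keep a small non-zero response, which matches the stated motivation of accepting more negative values early in training.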
+
comment: 11 pages, 6 figures. Revised the derivative of the sigmoid function from 1-sigmoid to sigmoid(1-sigmoid) for correctness. Updated the related equations in Section 3.2. Changed "Conclusions" to "Conclusion" in Section 6 +
+
+
+
+
+ + ♻ ☆ Augmenting Efficient Real-time Surgical Instrument Segmentation in Video + with Point Tracking and Segment Anything + + +
+ The Segment Anything Model (SAM) is a powerful vision foundation model that is revolutionizing the traditional paradigm of segmentation. Despite this, a reliance on prompting each frame and a large computational cost limit its usage in robotically assisted surgery. Applications such as augmented reality guidance require little user intervention along with efficient inference to be usable clinically. In this study, we address these limitations by adopting lightweight SAM variants to meet the efficiency requirement and employing fine-tuning techniques to enhance their generalization in surgical scenes. Recent advancements in Tracking Any Point (TAP) have shown promising results in both accuracy and efficiency, particularly when points are occluded or leave the field of view. Inspired by this progress, we present a novel framework that combines an online point tracker with a lightweight SAM model that is fine-tuned for surgical instrument segmentation. Sparse points within the region of interest are tracked and used to prompt SAM throughout the video sequence, providing temporal consistency. The quantitative results surpass the state-of-the-art semi-supervised video object segmentation method XMem on the EndoVis 2015 dataset with 84.8 IoU and 91.0 Dice. Our method achieves promising performance that is comparable to XMem and transformer-based fully supervised segmentation methods on the ex vivo UCL dVRK and in vivo CholecSeg8k datasets. In addition, the proposed method shows promising zero-shot generalization ability on the label-free STIR dataset. In terms of efficiency, we tested our method on single GeForce RTX 4060 and RTX 4090 GPUs, achieving inference speeds of over 25 and 90 FPS, respectively. Code is available at: https://github.com/wuzijian1997/SIS-PT-SAM +
+
+ comment: 6 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Metric-guided Image Reconstruction Bounds via Conformal Prediction + + +
+ Recent advancements in machine learning have led to the development of novel medical imaging systems and algorithms that address ill-posed problems. Assessing their trustworthiness and understanding how to deploy them safely at test time remains an important and open problem. In this work, we propose using conformal prediction to compute valid and distribution-free bounds on downstream metrics given reconstructions generated by one algorithm, and to retrieve upper/lower bounds and inlier/outlier reconstructions according to the adjusted bounds. Our work offers 1) test-time image reconstruction evaluation without ground truth, 2) downstream performance guarantees, 3) meaningful upper/lower bound reconstructions, and 4) meaningful statistical inlier/outlier reconstructions. We demonstrate our method on post-mastectomy radiotherapy planning using 3D breast CT reconstructions, and show 1) that metric-guided bounds have valid coverage for downstream metrics while conventional pixel-wise bounds do not and 2) anatomical differences between the upper/lower bounds of metric-guided and pixel-wise methods. Our work paves the way for more meaningful and trustworthy test-time evaluation of medical image reconstructions. Code available at https://github.com/matthewyccheung/conformal-metric +
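The calibration step behind such bounds can be sketched as standard split conformal prediction (a generic illustration; the paper's exact non-conformity score, downstream metrics, and bound adjustment are assumptions here):

    import numpy as np

    def conformal_metric_bound(cal_scores, alpha=0.1):
        """Split-conformal quantile used to bound a downstream metric.

        cal_scores: non-conformity scores on a held-out calibration set, e.g.
                    |metric(reconstruction) - metric(ground truth)| per case.
        Returns the finite-sample-corrected (1 - alpha) quantile q_hat; at test
        time, metric(reconstruction) +/- q_hat gives distribution-free bounds
        that hold with probability at least 1 - alpha.
        """
        n = len(cal_scores)
        level = np.ceil((n + 1) * (1 - alpha)) / n
        return np.quantile(cal_scores, min(level, 1.0), method="higher")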
+
+
+
+
+ + ♻ ☆ Min-Max-Jump distance and its applications + + +
+ We explore three applications of Min-Max-Jump distance (MMJ distance). MMJ-based K-means revises K-means with MMJ distance. MMJ-based Silhouette coefficient revises the Silhouette coefficient with MMJ distance. We also tested the Clustering with Neural Network and Index (CNNI) model with the MMJ-based Silhouette coefficient. In the last application, we tested using Min-Max-Jump distance to predict the labels of new points after a clustering analysis of the data. Results show that Min-Max-Jump distance achieves good performance in all three proposed applications. In addition, we devise several algorithms for calculating or estimating the distance. +
+
+
+
+
+ + ♻ ☆ Slim-neck by GSConv: A lightweight-design for real-time detector + architectures + + +
+ Real-time object detection is important for industrial and research applications. On edge devices, a giant model can hardly meet the real-time detection requirement, while a lightweight model built from a large number of depth-wise separable convolutions cannot achieve sufficient accuracy. We introduce a new lightweight convolutional technique, GSConv, to lighten the model while maintaining accuracy. GSConv accomplishes an excellent trade-off between accuracy and speed. Furthermore, we provide a design suggestion based on GSConv, Slim-Neck (SNs), to achieve higher computational cost-effectiveness for real-time detectors. The effectiveness of the SNs was robustly demonstrated in over twenty sets of comparative experiments. In particular, real-time detectors improved by the SNs achieve state-of-the-art results (70.9% AP50 on SODA10M at a speed of ~100 FPS on a Tesla T4) compared with the baselines. Code is available at https://github.com/alanli1997/slim-neck-by-gsconv +
+
+
+
+
+ + ♻ ☆ Evidential Uncertainty Sets in Deep Classifiers Using Conformal + Prediction + + +
+ In this paper, we propose the Evidential Conformal Prediction (ECP) method for image classifiers to generate conformal prediction sets. Our method is designed based on a non-conformity score function that has its roots in Evidential Deep Learning (EDL) as a method of quantifying model (epistemic) uncertainty in DNN classifiers. We use evidence derived from the logit values of target labels to compute the components of our non-conformity score function: the heuristic notion of uncertainty in CP, uncertainty surprisal, and expected utility. Our extensive experimental evaluation demonstrates that ECP outperforms three state-of-the-art methods for generating CP sets, in terms of set size and adaptivity, while maintaining the coverage of true labels. +
+
+ comment: Accepted in 13th Symposium on Conformal and Probabilistic Prediction + with Applications (COPA2024). To be published in the Proceedings of Machine + Learning Research (PMLR), vol. 230, 2024 (24 Pages) +
+
+
+
+
+ + ♻ ☆ Revisiting Backdoor Attacks against Large Vision-Language Models + + +
+ Instruction tuning enhances large vision-language models (LVLMs) but raises +security risks through potential backdoor attacks due to their openness. +Previous backdoor studies focus on enclosed scenarios with consistent training +and testing instructions, neglecting the practical domain gaps that could +affect attack effectiveness. This paper empirically examines the +generalizability of backdoor attacks during the instruction tuning of LVLMs for +the first time, revealing certain limitations of most backdoor strategies in +practical scenarios. We quantitatively evaluate the generalizability of six +typical backdoor attacks on image caption benchmarks across multiple LVLMs, +considering both visual and textual domain offsets. Our findings indicate that +attack generalizability is positively correlated with the backdoor trigger's +irrelevance to specific images/models and the preferential correlation of the +trigger pattern. Additionally, we modify existing backdoor attacks based on the +above key observations, demonstrating significant improvements in cross-domain +scenario generalizability (+86% attack success rate). Notably, even without +access to the instruction datasets, a multimodal instruction set can be +successfully poisoned with a very low poisoning rate (0.2%), achieving an +attack success rate of over 97%. This paper underscores that even simple +traditional backdoor strategies pose a serious threat to LVLMs, necessitating +more attention and in-depth research. + +
+
+ comment: 24 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Uformer-ICS: A U-Shaped Transformer for Image Compressive Sensing + Service + + +
+ Many service computing applications require real-time dataset collection from
+multiple devices, necessitating efficient sampling techniques to reduce bandwidth
+and storage pressure. Compressive sensing (CS) has found wide-ranging applications
+in image acquisition and reconstruction. Recently, numerous deep-learning methods
+have been introduced for CS tasks. However, the accurate reconstruction of images
+from measurements remains a significant challenge, especially at low sampling rates.
+In this paper, we propose Uformer-ICS, a novel U-shaped transformer for image CS
+tasks that introduces inner characteristics of CS into the transformer architecture.
+To utilize the uneven sparsity distribution of image blocks, we design an adaptive
+sampling architecture that allocates measurement resources based on the estimated
+block sparsity, allowing the compressed results to retain maximum information from
+the original image. Additionally, we introduce a multi-channel projection (MCP)
+module inspired by traditional CS optimization methods. By integrating the MCP
+module into the transformer blocks, we construct projection-based transformer
+blocks, and then form a symmetrical reconstruction model using these blocks and
+residual convolutional blocks. Therefore, our reconstruction model can
+simultaneously utilize the local features and long-range dependencies of images, as
+well as the prior projection knowledge of CS theory.
+ Experimental results demonstrate its significantly better reconstruction
+performance than state-of-the-art deep learning-based CS methods.
+
+
+
+
+
+
+ + ♻ ☆ PYRA: Parallel Yielding Re-Activation for Training-Inference Efficient + Task Adaptation ECCV 2024 + + +
+ Recently, the scale of transformers has grown rapidly, which introduces +considerable challenges in terms of training overhead and inference efficiency +in the scope of task adaptation. Existing works, namely Parameter-Efficient +Fine-Tuning (PEFT) and model compression, have separately investigated the +challenges. However, PEFT cannot guarantee the inference efficiency of the +original backbone, especially for large-scale models. Model compression +requires significant training costs for structure searching and re-training. +Consequently, a simple combination of them cannot guarantee accomplishing both +training efficiency and inference efficiency with minimal costs. In this paper, +we propose a novel Parallel Yielding Re-Activation (PYRA) method for such a +challenge of training-inference efficient task adaptation. PYRA first utilizes +parallel yielding adaptive weights to comprehensively perceive the data +distribution in downstream tasks. A re-activation strategy for token modulation +is then applied for tokens to be merged, leading to calibrated token features. +Extensive experiments demonstrate that PYRA outperforms all competing methods +under both low compression rate and high compression rate, demonstrating its +effectiveness and superiority in maintaining both training efficiency and +inference efficiency for large-scale foundation models. Our code will be +released to the public. + +
+
+ comment: 15 pages, 5 figures, Accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ MuirBench: A Comprehensive Benchmark for Robust Multi-image + Understanding + + +
+ We introduce MuirBench, a comprehensive benchmark that focuses on robust
+multi-image understanding capabilities of multimodal LLMs. MuirBench consists of 12
+diverse multi-image tasks (e.g., scene understanding, ordering) that involve 10
+categories of multi-image relations (e.g., multiview, temporal relations).
+Comprising 11,264 images and 2,600 multiple-choice questions, MuirBench is created
+in a pairwise manner, where each standard instance is paired with an unanswerable
+variant that has minimal semantic differences, to enable reliable assessment.
+Evaluated on 20 recent multimodal LLMs, our results reveal that even the
+best-performing models like GPT-4o and Gemini Pro find it challenging to solve
+MuirBench, achieving 68.0% and 49.3% in accuracy. Open-source multimodal LLMs
+trained on single images can hardly generalize to multi-image questions, hovering
+below 33.3% in accuracy. These results highlight the importance of MuirBench in
+encouraging the community to develop multimodal LLMs that can look beyond a single
+image, suggesting potential pathways for future improvements.
+
+
+
+ comment: typos corrected, references added, Project Page: + https://muirbench.github.io/ +
+
+
+
+
+ + ♻ ☆ AutoRT: Embodied Foundation Models for Large Scale Orchestration of + Robotic Agents ICRA 2024 + + +
+ Foundation models that incorporate language, vision, and more recently +actions have revolutionized the ability to harness internet scale data to +reason about useful tasks. However, one of the key challenges of training +embodied foundation models is the lack of data grounded in the physical world. +In this paper, we propose AutoRT, a system that leverages existing foundation +models to scale up the deployment of operational robots in completely unseen +scenarios with minimal human supervision. AutoRT leverages vision-language +models (VLMs) for scene understanding and grounding, and further uses large +language models (LLMs) for proposing diverse and novel instructions to be +performed by a fleet of robots. Guiding data collection by tapping into the +knowledge of foundation models enables AutoRT to effectively reason about +autonomy tradeoffs and safety while significantly scaling up data collection +for robot learning. We demonstrate AutoRT proposing instructions to over 20 +robots across multiple buildings and collecting 77k real robot episodes via +both teleoperation and autonomous robot policies. We experimentally show that +such "in-the-wild" data collected by AutoRT is significantly more diverse, and +that AutoRT's use of LLMs allows for instruction following data collection +robots that can align to human preferences. + +
+
+ comment: 26 pages, 9 figures, ICRA 2024 VLMNM Workshop +
+
+
+
+
+
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in + LLMs + + +
+ Large language models (LLMs) typically utilize the top-k contexts from a +retriever in retrieval-augmented generation (RAG). In this work, we propose a +novel instruction fine-tuning framework RankRAG, which instruction-tunes a +single LLM for the dual purpose of context ranking and answer generation in +RAG. In particular, the instruction-tuned LLMs work surprisingly well by adding +a small fraction of ranking data into the training blend, and outperform +existing expert ranking models, including the same LLM exclusively fine-tuned +on a large amount of ranking data. For generation, we compare our model with +many strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and +ChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG +benchmarks. Specifically, our Llama3-RankRAG significantly outperforms +Llama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In +addition, it also performs comparably to GPT-4 on five RAG benchmarks in the +biomedical domain without instruction fine-tuning on biomedical data, +demonstrating its superb capability for generalization to new domains. + +
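+ A hedged sketch of the rank-then-generate flow that a single ranking-capable LLM
+enables; `score_context` and `generate_answer` are hypothetical callables standing
+in for prompting the instruction-tuned model in its ranking and generation modes.
+```python
+from typing import Callable, List
+
+def rank_then_generate(question: str,
+                       retrieved: List[str],
+                       score_context: Callable[[str, str], float],
+                       generate_answer: Callable[[str, List[str]], str],
+                       keep: int = 5) -> str:
+    """Rerank the retriever's top-k contexts with the LLM's own relevance scores, then answer."""
+    ranked = sorted(retrieved, key=lambda c: score_context(question, c), reverse=True)
+    return generate_answer(question, ranked[:keep])
+
+# Toy usage with dummy callables in place of the instruction-tuned LLM.
+contexts = ["Paris is the capital of France.", "Bananas are yellow.", "France borders Spain."]
+answer = rank_then_generate("What is the capital of France?", contexts,
+                            lambda q, c: float("France" in c),
+                            lambda q, cs: f"Answer drawn from {len(cs)} contexts.",
+                            keep=2)
+```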
+
+
+
+
+ + ☆ Reliable Confidence Intervals for Information Retrieval Evaluation Using + Generative A.I KDD '24 + + +
+ The traditional evaluation of information retrieval (IR) systems is generally +very costly as it requires manual relevance annotation from human experts. +Recent advancements in generative artificial intelligence -- specifically large +language models (LLMs) -- can generate relevance annotations at an enormous +scale with relatively small computational costs. Potentially, this could +alleviate the costs traditionally associated with IR evaluation and make it +applicable to numerous low-resource applications. However, generated relevance +annotations are not immune to (systematic) errors, and as a result, directly +using them for evaluation produces unreliable results. + In this work, we propose two methods based on prediction-powered inference +and conformal risk control that utilize computer-generated relevance +annotations to place reliable confidence intervals (CIs) around IR evaluation +metrics. Our proposed methods require a small number of reliable annotations +from which the methods can statistically analyze the errors in the generated +annotations. Using this information, we can place CIs around evaluation metrics +with strong theoretical guarantees. Unlike existing approaches, our conformal +risk control method is specifically designed for ranking metrics and can vary +its CIs per query and document. Our experimental results show that our CIs +accurately capture both the variance and bias in evaluation based on LLM +annotations, better than the typical empirical bootstrapping estimates. We hope +our contributions bring reliable evaluation to the many IR applications where +this was traditionally infeasible. + +
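+ The sketch below shows the generic prediction-powered inference estimator for a
+mean metric (e.g., average per-query relevance), using a small human-labelled set to
+debias many LLM annotations; the paper's conformal risk control method for ranking
+metrics is more involved and is not reproduced here.
+```python
+import numpy as np
+from statistics import NormalDist
+
+def ppi_mean_ci(llm_unlabeled, llm_labeled, human_labeled, alpha=0.05):
+    """Confidence interval for a mean metric from cheap LLM judgments plus a small
+    set of human annotations that estimates (and corrects) the LLM's systematic error."""
+    llm_unlabeled = np.asarray(llm_unlabeled, dtype=float)
+    rectifier = np.asarray(human_labeled, dtype=float) - np.asarray(llm_labeled, dtype=float)
+    estimate = llm_unlabeled.mean() + rectifier.mean()
+    se = np.sqrt(llm_unlabeled.var(ddof=1) / len(llm_unlabeled)
+                 + rectifier.var(ddof=1) / len(rectifier))
+    z = NormalDist().inv_cdf(1 - alpha / 2)
+    return estimate - z * se, estimate + z * se
+
+# Toy usage: 10,000 LLM-annotated (query, document) pairs, 200 with human labels.
+rng = np.random.default_rng(0)
+truth = rng.binomial(1, 0.3, size=10_200).astype(float)
+llm = np.clip(truth + rng.normal(0.05, 0.2, size=truth.shape), 0, 1)  # biased, noisy judge
+lo, hi = ppi_mean_ci(llm[200:], llm[:200], truth[:200])
+```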
+
+ comment: KDD '24 +
+
+
+
+
+ + ☆ Towards Training Music Taggers on Synthetic Data + + +
+ Most contemporary music tagging systems rely on large volumes of annotated data. As
+an alternative, we investigate the extent to which synthetically generated music
+excerpts can improve tagging systems when only small annotated collections are
+available. To this end, we release GTZAN-synth, a synthetic dataset that follows the
+taxonomy of the well-known GTZAN dataset while being ten times larger in data
+volume. We first observe that simply adding this synthetic dataset to the training
+split of GTZAN does not result in performance improvements. We then investigate
+domain adaptation, transfer learning and fine-tuning strategies for the task at hand
+and conclude that the last two options yield an increase in accuracy. Overall, the
+proposed approach can be considered as a first guide in a promising field for future
+research.
+
+
+
+ comment: 6 pages, 3 figures, accepted to 21st International Conference on + Content-based Multimedia Indexing (CBMI) 2024, code available + https://github.com/NadineKroher/music-tagging-synthetic-data-cbmi-2024 +
+
+
+
+
+ + ☆ Joint-Dataset Learning and Cross-Consistent Regularization for + Text-to-Motion Retrieval + + +
+ Pose-estimation methods enable extracting human motion from common videos in +the structured form of 3D skeleton sequences. Despite great application +opportunities, effective content-based access to such spatio-temporal motion +data is a challenging problem. In this paper, we focus on the recently +introduced text-motion retrieval tasks, which aim to search for database +motions that are the most relevant to a specified natural-language textual +description (text-to-motion) and vice-versa (motion-to-text). Despite recent +efforts to explore these promising avenues, a primary challenge remains the +insufficient data available to train robust text-motion models effectively. To +address this issue, we propose to investigate joint-dataset learning - where we +train on multiple text-motion datasets simultaneously - together with the +introduction of a Cross-Consistent Contrastive Loss function (CCCL), which +regularizes the learned text-motion common space by imposing uni-modal +constraints that augment the representation ability of the trained network. To +learn a proper motion representation, we also introduce a transformer-based +motion encoder, called MoT++, which employs spatio-temporal attention to +process sequences of skeleton data. We demonstrate the benefits of the proposed +approaches on the widely-used KIT Motion-Language and HumanML3D datasets. We +perform detailed experimentation on joint-dataset learning and cross-dataset +scenarios, showing the effectiveness of each introduced module in a carefully +conducted ablation study and, in turn, pointing out the limitations of +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Why does in-context learning fail sometimes? Evaluating in-context + learning on open and closed questions + + +
+ We measure the performance of in-context learning as a function of task novelty and
+difficulty for open and closed questions. For that purpose, we created a novel
+benchmark consisting of hard scientific questions, each paired with a context of
+varying relevance. We show that, counter-intuitively, a context that is more aligned
+with the topic does not always help more than a less relevant context. This effect
+is especially visible for open questions and questions of high difficulty or
+novelty. This result reveals a fundamental difference between the treatment of
+closed-form and open-form questions by large language models and shows a need for a
+more robust evaluation of in-context learning on the variety of different types of
+questions. It also poses a new question of how to optimally select a context for
+large language models, especially in the context of Retrieval Augmented Generation
+(RAG) systems. Our results suggest that the answer to this question can be highly
+application-dependent and might be contingent on factors including the format of the
+question, the perceived difficulty level of the questions, and the novelty or
+popularity of the information we seek.
+
+
+
+ comment: 8 pages plus references, 4 main figures, 6 pages of supplementary + material +
+
+
+
+
+ + ☆ Simple Augmentations of Logical Rules for Neuro-Symbolic Knowledge Graph + Completion ACL 2023 + + +
+ High-quality and high-coverage rule sets are imperative to the success of
+Neuro-Symbolic Knowledge Graph Completion (NS-KGC) models, because they form the
+basis of all symbolic inferences. Recent literature builds neural models for
+generating rule sets; however, preliminary experiments show that they struggle to
+maintain high coverage. In this work, we suggest three simple augmentations to
+existing rule sets: (1) transforming rules to their abductive forms, (2) generating
+equivalent rules that use inverse forms of constituent relations and (3) random
+walks that propose new rules. Finally, we prune potentially low-quality rules.
+Experiments over four datasets and five ruleset-baseline settings suggest that these
+simple augmentations consistently improve results, and obtain up to 7.1 pt MRR and
+8.5 pt Hits@1 gains over using rules without augmentations.
+
+
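+ A small sketch of the first two augmentations on a toy rule representation; the
+relation names and the (head, body) tuple format are illustrative assumptions, not
+the paper's actual data structures.
+```python
+# A rule is (head, [body atoms]); each atom is (relation, subject_var, object_var).
+rule = (("nationality", "X", "Y"),
+        [("born_in", "X", "Z"), ("located_in", "Z", "Y")])
+
+def inverse_relation_form(rule):
+    """Equivalent rule whose body atoms use inverse relations with swapped arguments."""
+    head, body = rule
+    return head, [(f"inv_{rel}", obj, subj) for rel, subj, obj in body]
+
+def abductive_forms(rule):
+    """For each body atom, emit a rule that infers it from the head plus the remaining atoms."""
+    head, body = rule
+    return [(atom, [head] + body[:i] + body[i + 1:]) for i, atom in enumerate(body)]
+
+print(inverse_relation_form(rule))
+for augmented in abductive_forms(rule):
+    print(augmented)
+```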
+
+ comment: 12 pages, 15 tables Published in ACL 2023 +
+
+
+
+
+ + ☆ MeMemo: On-device Retrieval Augmentation for Private and Personalized + Text Generation SIGIR 2024 + + +
+ Retrieval-augmented text generation (RAG) addresses the common limitations of +large language models (LLMs), such as hallucination, by retrieving information +from an updatable external knowledge base. However, existing approaches often +require dedicated backend servers for data storage and retrieval, thereby +limiting their applicability in use cases that require strict data privacy, +such as personal finance, education, and medicine. To address the pressing need +for client-side dense retrieval, we introduce MeMemo, the first open-source +JavaScript toolkit that adapts the state-of-the-art approximate nearest +neighbor search technique HNSW to browser environments. Developed with modern +and native Web technologies, such as IndexedDB and Web Workers, our toolkit +leverages client-side hardware capabilities to enable researchers and +developers to efficiently search through millions of high-dimensional vectors +in the browser. MeMemo enables exciting new design and research opportunities, +such as private and personalized content creation and interactive prototyping, +as demonstrated in our example application RAG Playground. Reflecting on our +work, we discuss the opportunities and challenges for on-device dense +retrieval. MeMemo is available at https://github.com/poloclub/mememo. + +
+
+ comment: Accepted to SIGIR 2024. 6 pages, 2 figures. For a live demo, visit + https://poloclub.github.io/mememo/. Code is open-source at + https://github.com/poloclub/mememo +
+
+
+
+
+ + ☆ AdaCQR: Enhancing Query Reformulation for Conversational Search via + Sparse and Dense Retrieval Alignment + + +
+ Conversational Query Reformulation (CQR) has significantly advanced in addressing
+the challenges of conversational search, particularly those stemming from the latent
+user intent and the need for historical context. Recent works aimed to boost the
+performance of CQR through alignment. However, they are designed for one specific
+retrieval system, which potentially results in poor generalization. To overcome this
+limitation, we present a novel framework, AdaCQR. By aligning reformulation models
+with both term-based and semantic-based retrieval systems, AdaCQR enhances the
+generalizability of information-seeking queries across diverse retrieval
+environments through a dual-phase training strategy. We also developed two effective
+approaches for acquiring superior labels and diverse input candidates, boosting the
+efficiency and robustness of the framework. Experimental evaluations on the TopiOCQA
+and QReCC datasets demonstrate that AdaCQR significantly outperforms existing
+methods, offering both quantitative and qualitative improvements in conversational
+query reformulation.
+
+
+
+
+
+
+ + ☆ LogEval: A Comprehensive Benchmark Suite for Large Language Models In + Log Analysis + + +
+ Log analysis is crucial for ensuring the orderly and stable operation of
+information systems, particularly in the field of Artificial Intelligence for IT
+Operations (AIOps). Large Language Models (LLMs) have demonstrated significant
+potential in natural language processing tasks. In the AIOps domain, they excel in
+tasks such as anomaly detection, root cause analysis of faults, operations and
+maintenance script generation, and alert information summarization. However, the
+performance of current LLMs in log analysis tasks remains inadequately validated. To
+address this gap, we introduce LogEval, a comprehensive benchmark suite designed to
+evaluate the capabilities of LLMs in various log analysis tasks for the first time.
+This benchmark covers tasks such as log parsing, log anomaly detection, log fault
+diagnosis, and log summarization. LogEval evaluates each task using 4,000 publicly
+available log data entries and employs 15 different prompts for each task to ensure
+a thorough and fair assessment. By rigorously evaluating leading LLMs, we
+demonstrate the impact of various LLM technologies on log analysis performance,
+focusing on aspects such as self-consistency and few-shot contextual learning. We
+also discuss findings related to model quantization, Chinese-English
+question-answering evaluation, and prompt engineering. These findings provide
+insights into the strengths and weaknesses of LLMs in multilingual environments and
+the effectiveness of different prompt strategies. Various evaluation methods are
+employed for different tasks to accurately measure the performance of LLMs in log
+analysis, ensuring a comprehensive assessment. The insights gained from LogEval's
+evaluation reveal the strengths and limitations of LLMs in log analysis tasks,
+providing valuable guidance for researchers and practitioners.
+
+
+
+
+
+
+ + ☆ ECAT: A Entire space Continual and Adaptive Transfer Learning Framework + for Cross-Domain Recommendation + + +
+ Industrial recommendation systems contain several mini-apps designed to meet the
+diverse interests and needs of users. Their sample space is merely a small subset of
+the entire space, making it challenging to train an efficient model. In recent
+years, there have been many excellent studies on cross-domain recommendation aimed
+at mitigating the problem of data sparsity. However, few of them have simultaneously
+considered how to adapt both sample-level and representation-level continual
+transfer to the target task. To overcome this issue, we propose an Entire space
+Continual and Adaptive Transfer learning framework called ECAT, which includes two
+core components. First, for sample transfer, we propose a two-stage method that
+realizes a coarse-to-fine process: we perform an initial selection through a
+graph-guided method, followed by a fine-grained selection using a domain adaptation
+method. Second, we propose an adaptive knowledge distillation method for continually
+transferring the representations from a model that is well-trained on the entire
+space dataset. ECAT enables full utilization of the entire space samples and
+representations under the supervision of the target task, while avoiding negative
+transfer. Comprehensive experiments on real-world industrial datasets from Taobao
+show that ECAT advances state-of-the-art performance on offline metrics, and brings
++13.6% CVR and +8.6% orders for Baiyibutie, a popular mini-app of Taobao.
+
+
+
+
+
+
+ + ♻ ☆ Reproducibility in Machine Learning-based Research: Overview, Barriers + and Drivers + + +
+ Research in various fields is currently experiencing challenges regarding the
+reproducibility of results. This problem is also prevalent in machine learning (ML)
+research. The issue arises, for example, due to unpublished data and/or source code
+and the sensitivity of ML training conditions. Although different solutions have
+been proposed to address this issue, such as using ML platforms, the level of
+reproducibility in ML-driven research remains unsatisfactory. Therefore, in this
+article, we discuss the reproducibility of ML-driven research with three main aims:
+(i) identifying the barriers to reproducibility when applying ML in research and
+categorizing them into different types of reproducibility (description, code, data,
+and experiment reproducibility); (ii) discussing potential drivers such as tools,
+practices, and interventions that support ML reproducibility, and distinguishing
+between technology-driven drivers, procedural drivers, and drivers related to
+awareness and education; and (iii) mapping the drivers to the barriers. With this
+work, we hope to provide insights and to contribute to the decision-making process
+regarding the adoption of different solutions to support ML reproducibility.
+
+
+
+ comment: Pre-print of submission for the AI Magazine - comments to this + pre-print are very welcome +
+
+
+
+
+ + ♻ ☆ Dynamic Q&A of Clinical Documents with Large Language Models + + +
+ Electronic health records (EHRs) house crucial patient data in clinical +notes. As these notes grow in volume and complexity, manual extraction becomes +challenging. This work introduces a natural language interface using large +language models (LLMs) for dynamic question-answering on clinical notes. Our +chatbot, powered by Langchain and transformer-based LLMs, allows users to query +in natural language, receiving relevant answers from clinical notes. +Experiments, utilizing various embedding models and advanced LLMs, show Wizard +Vicuna's superior accuracy, albeit with high compute demands. Model +optimization, including weight quantization, improves latency by approximately +48 times. Promising results indicate potential, yet challenges such as model +hallucinations and limited diverse medical case evaluations remain. Addressing +these gaps is crucial for unlocking the value in clinical notes and advancing +AI-driven clinical decision-making. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ ColPali: Efficient Document Retrieval with Vision Language Models + + +
+ Documents are visually rich structures that convey information through text, +as well as tables, figures, page layouts, or fonts. While modern document +retrieval systems exhibit strong performance on query-to-text matching, they +struggle to exploit visual cues efficiently, hindering their performance on +practical document retrieval applications such as Retrieval Augmented +Generation. To benchmark current systems on visually rich document retrieval, +we introduce the Visual Document Retrieval Benchmark ViDoRe, composed of +various page-level retrieving tasks spanning multiple domains, languages, and +settings. The inherent shortcomings of modern systems motivate the introduction +of a new retrieval model architecture, ColPali, which leverages the document +understanding capabilities of recent Vision Language Models to produce +high-quality contextualized embeddings solely from images of document pages. +Combined with a late interaction matching mechanism, ColPali largely +outperforms modern document retrieval pipelines while being drastically faster +and end-to-end trainable. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ A Survey on Popularity Bias in Recommender Systems + + +
+ Recommender systems help people find relevant content in a personalized way. One
+main promise of such systems is that they are able to increase the visibility of
+items in the long tail, i.e., the lesser-known items in a catalogue. Existing
+research, however, suggests that in many situations today's recommendation
+algorithms instead exhibit a popularity bias, meaning that they often focus on
+rather popular items in their recommendations. Such a bias may not only limit the
+value of the recommendations for consumers and providers in the short run, but it
+may also cause undesired reinforcement effects over time. In this paper, we discuss
+the potential reasons for popularity bias and review existing approaches to detect,
+quantify and mitigate popularity bias in recommender systems. Our survey, therefore,
+includes both an overview of the computational metrics used in the literature as
+well as a review of the main technical approaches to reduce the bias. Furthermore,
+we critically discuss today's literature, where we observe that the research is
+almost entirely based on computational experiments and on certain assumptions
+regarding the practical effects of including long-tail items in the recommendations.
+
+
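+ As a concrete example of the computational metrics such surveys cover, the sketch
+below computes average recommendation popularity (ARP), one common way to quantify
+how much recommendation lists lean toward popular items; the toy data is
+illustrative only.
+```python
+from collections import Counter
+from statistics import mean
+
+def average_recommendation_popularity(train_interactions, rec_lists):
+    """Mean training-set popularity of recommended items, averaged over users."""
+    popularity = Counter(item for _, item in train_interactions)
+    per_user = [mean(popularity.get(item, 0) for item in items)
+                for items in rec_lists.values() if items]
+    return mean(per_user)
+
+# Toy usage: item "a" is popular in training, "b" and "c" sit in the long tail.
+train = [("u1", "a"), ("u2", "a"), ("u3", "a"), ("u1", "b"), ("u2", "c")]
+recs = {"u1": ["a", "b"], "u2": ["a", "c"]}
+print(average_recommendation_popularity(train, recs))  # higher = more popularity-biased
+```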
+
+
+
+
+ + ♻ ☆ Cocktail: A Comprehensive Information Retrieval Benchmark with + LLM-Generated Documents Integration ACL 2024 + + +
+ The proliferation of Large Language Models (LLMs) has led to an influx of +AI-generated content (AIGC) on the internet, transforming the corpus of +Information Retrieval (IR) systems from solely human-written to a coexistence +with LLM-generated content. The impact of this surge in AIGC on IR systems +remains an open question, with the primary challenge being the lack of a +dedicated benchmark for researchers. In this paper, we introduce Cocktail, a +comprehensive benchmark tailored for evaluating IR models in this mixed-sourced +data landscape of the LLM era. Cocktail consists of 16 diverse datasets with +mixed human-written and LLM-generated corpora across various text retrieval +tasks and domains. Additionally, to avoid the potential bias from previously +included dataset information in LLMs, we also introduce an up-to-date dataset, +named NQ-UTD, with queries derived from recent events. Through conducting over +1,000 experiments to assess state-of-the-art retrieval models against the +benchmarked datasets in Cocktail, we uncover a clear trade-off between ranking +performance and source bias in neural retrieval models, highlighting the +necessity for a balanced approach in designing future IR systems. We hope +Cocktail can serve as a foundational resource for IR research in the LLM era, +with all data and code publicly available at +\url{https://github.com/KID-22/Cocktail}. + +
+
+ comment: Accepted by Findings of ACL 2024; Datasets Link: + https://huggingface.co/IR-Cocktail +
+
+
+
+
+ + ♻ ☆ An Interpretable Alternative to Neural Representation Learning for + Rating Prediction -- Transparent Latent Class Modeling of User Reviews + + +
+ Nowadays, neural network (NN) and deep learning (DL) techniques are widely +adopted in many applications, including recommender systems. Given the sparse +and stochastic nature of collaborative filtering (CF) data, recent works have +critically analyzed the effective improvement of neural-based approaches +compared to simpler and often transparent algorithms for recommendation. +Previous results showed that NN and DL models can be outperformed by +traditional algorithms in many tasks. Moreover, given the largely black-box +nature of neural-based methods, interpretable results are not naturally +obtained. Following on this debate, we first present a transparent +probabilistic model that topologically organizes user and product latent +classes based on the review information. In contrast to popular neural +techniques for representation learning, we readily obtain a statistical, +visualization-friendly tool that can be easily inspected to understand user and +product characteristics from a textual-based perspective. Then, given the +limitations of common embedding techniques, we investigate the possibility of +using the estimated interpretable quantities as model input for a rating +prediction task. To contribute to the recent debates, we evaluate our results +in terms of both capacity for interpretability and predictive performances in +comparison with popular text-based neural approaches. The results demonstrate +that the proposed latent class representations can yield competitive predictive +performances, compared to popular, but difficult-to-interpret approaches. + +
+
+
+
+
+ + ♻ ☆ Light-weight End-to-End Graph Interest Network for CTR Prediction in + E-commerce Search + + +
+ Click-through rate (CTR) prediction has an essential impact on improving user
+experience and revenue in e-commerce search. With the development of deep learning,
+graph-based methods are widely exploited to utilize graph structure extracted from
+user behaviors and other information to aid embedding learning. However, most of the
+previous graph-based methods mainly focus on recommendation scenarios, and therefore
+their graph structures depend heavily on items' sequential information from user
+behaviors, ignoring the query's sequential signal and query-item correlations. In
+this paper, we propose a new approach named Light-weight End-to-End Graph Interest
+Network (EGIN) to effectively mine users' search interests and tackle the previous
+challenges. (i) EGIN utilizes query-item correlations and sequential information
+from the search system to build a heterogeneous graph for better CTR prediction in
+e-commerce search. (ii) EGIN's graph embedding learning shares the same training
+input and is jointly trained with CTR prediction, making the end-to-end framework
+effortless to deploy in large-scale search systems. The proposed EGIN is composed of
+three parts: a query-item heterogeneous graph, light-weight graph sampling, and a
+multi-interest network. The query-item heterogeneous graph captures correlation and
+sequential information of query and item efficiently via the proposed light-weight
+graph sampling. The multi-interest network is designed to utilize the graph
+embeddings to capture various similarity relationships between query and item and
+enhance the final CTR prediction. We conduct extensive experiments on both public
+and industrial datasets to demonstrate the effectiveness of the proposed EGIN. At
+the same time, the training cost of graph learning is relatively low compared with
+the main CTR prediction task, ensuring efficiency in practical applications.
+
+
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Differentially Private Graph Diffusion with Applications in Personalized + PageRanks + + +
+ Graph diffusion, which iteratively propagates real-valued substances across the
+graph, is used in numerous graph/network-involved applications. However, releasing
+diffusion vectors may reveal sensitive linking information in the data, such as
+transaction information in financial network data. Protecting the privacy of graph
+data is nevertheless challenging due to its interconnected nature. This work
+proposes a novel graph diffusion framework with edge-level differential privacy
+guarantees by using noisy diffusion iterates. The algorithm injects Laplace noise
+per diffusion iteration and adopts a degree-based thresholding function to mitigate
+the high sensitivity induced by low-degree nodes. Our privacy loss analysis is based
+on Privacy Amplification by Iteration (PABI), which, to the best of our knowledge,
+is the first effort that analyzes PABI with Laplace noise and provides relevant
+applications. We also introduce a novel Infinity-Wasserstein distance tracking
+method, which tightens the analysis of privacy leakage and makes PABI more
+applicable in practice. We evaluate this framework by applying it to Personalized
+PageRank computation for ranking tasks. Experiments on real-world network data
+demonstrate the superiority of our method under stringent privacy conditions.
+
+
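+ A simplified sketch of the noisy-iterate idea for Personalized PageRank: Laplace
+noise is injected at every diffusion step and contributions from low-degree nodes
+are masked out. The specific thresholding rule, noise scale, and clipping below are
+assumptions for illustration, not the paper's calibrated mechanism.
+```python
+import numpy as np
+
+def noisy_personalized_pagerank(adj, seed, alpha=0.15, iters=20,
+                                noise_scale=1e-3, min_degree=5, rng=None):
+    """Power-iteration PPR with per-iteration Laplace noise and a degree-based mask."""
+    rng = rng or np.random.default_rng(0)
+    deg = adj.sum(axis=1).astype(float)
+    trans = np.divide(adj, deg[:, None], out=np.zeros(adj.shape), where=deg[:, None] > 0)
+    mask = (deg >= min_degree).astype(float)   # suppress high-sensitivity low-degree nodes
+    e = np.zeros(adj.shape[0])
+    e[seed] = 1.0
+    x = e.copy()
+    for _ in range(iters):
+        x = alpha * e + (1 - alpha) * (trans.T @ (x * mask))
+        x += rng.laplace(scale=noise_scale, size=x.shape)   # noisy diffusion iterate
+        x = np.clip(x, 0.0, None)
+    return x / max(x.sum(), 1e-12)
+
+adj = (np.random.default_rng(1).random((50, 50)) < 0.1).astype(float)
+scores = noisy_personalized_pagerank(adj, seed=0)
+```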
+
+
+
+
+
+
+
+ + Machine Learning 147 + +
+
+
+ + ☆ MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via + Dynamic Sparse Attention + + +
+ The computational challenges of Large Language Model (LLM) inference remain a
+significant barrier to their widespread deployment, especially as prompt lengths
+continue to increase. Due to the quadratic complexity of the attention computation,
+it takes 30 minutes for an 8B LLM to process a prompt of 1M tokens (i.e., the
+pre-filling stage) on a single A100 GPU. Existing methods for speeding up
+pre-filling often fail to maintain acceptable accuracy or efficiency when applied to
+long-context LLMs. To address this gap, we introduce MInference (Milliontokens
+Inference), a sparse calculation method designed to accelerate pre-filling of
+long-sequence processing. Specifically, we identify three unique patterns in
+long-context attention matrices (the A-shape, Vertical-Slash, and Block-Sparse) that
+can be leveraged for efficient sparse computation on GPUs. We determine the optimal
+pattern for each attention head offline and dynamically build sparse indices based
+on the assigned pattern during inference. With the pattern and sparse indices, we
+perform efficient sparse attention calculations via our optimized GPU kernels to
+significantly reduce the latency in the pre-filling stage of long-context LLMs. Our
+proposed technique can be directly applied to existing LLMs without any
+modifications to the pre-training setup or additional fine-tuning. By evaluating on
+a wide range of downstream tasks, including InfiniteBench, RULER, PG-19, and Needle
+In A Haystack, and models including LLaMA-3-1M, GLM4-1M, Yi-200K, Phi-3-128K, and
+Qwen2-128K, we demonstrate that MInference effectively reduces inference latency by
+up to 10x for pre-filling on an A100, while maintaining accuracy. Our code is
+available at https://aka.ms/MInference.
+
+
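+ To make one of the named patterns concrete, the sketch below builds an A-shape mask
+(attention sinks plus a local causal window) and applies it in a dense reference
+computation; the real speedups come from skipping the masked blocks in custom GPU
+kernels, which this illustration does not attempt.
+```python
+import numpy as np
+
+def a_shape_mask(seq_len, n_sink=4, window=64):
+    """Causal mask keeping only the first few 'sink' columns and a sliding local window."""
+    i = np.arange(seq_len)[:, None]
+    j = np.arange(seq_len)[None, :]
+    return (j <= i) & ((j < n_sink) | (i - j < window))
+
+def masked_attention(q, k, v, mask):
+    """Dense reference attention that zeroes out everything outside the sparse pattern."""
+    scores = q @ k.T / np.sqrt(q.shape[-1])
+    scores = np.where(mask, scores, -np.inf)
+    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    weights /= weights.sum(axis=-1, keepdims=True)
+    return weights @ v
+
+rng = np.random.default_rng(0)
+q, k, v = (rng.normal(size=(256, 32)) for _ in range(3))
+out = masked_attention(q, k, v, a_shape_mask(256))
+```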
+
+
+
+
+ + ☆ Magic Insert: Style-Aware Drag-and-Drop + + +
+ We present Magic Insert, a method for dragging-and-dropping subjects from a +user-provided image into a target image of a different style in a physically +plausible manner while matching the style of the target image. This work +formalizes the problem of style-aware drag-and-drop and presents a method for +tackling it by addressing two sub-problems: style-aware personalization and +realistic object insertion in stylized images. For style-aware personalization, +our method first fine-tunes a pretrained text-to-image diffusion model using +LoRA and learned text tokens on the subject image, and then infuses it with a +CLIP representation of the target style. For object insertion, we use +Bootstrapped Domain Adaption to adapt a domain-specific photorealistic object +insertion model to the domain of diverse artistic styles. Overall, the method +significantly outperforms traditional approaches such as inpainting. Finally, +we present a dataset, SubjectPlop, to facilitate evaluation and future progress +in this area. Project page: https://magicinsert.github.io/ + +
+
+ comment: Project page: https://magicinsert.github.io/ +
+
+
+
+
+ + ☆ Neurocache: Efficient Vector Retrieval for Long-range Language Modeling NAACL'24 + + +
+ This paper introduces Neurocache, an approach to extend the effective context +size of large language models (LLMs) using an external vector cache to store +its past states. Like recent vector retrieval approaches, Neurocache uses an +efficient k-nearest-neighbor (kNN) algorithm to retrieve relevant past states +and incorporate them into the attention process. Neurocache improves upon +previous methods by (1) storing compressed states, which reduces cache size; +(2) performing a single retrieval operation per token which increases inference +speed; and (3) extending the retrieval window to neighboring states, which +improves both language modeling and downstream task accuracy. Our experiments +show the effectiveness of Neurocache both for models trained from scratch and +for pre-trained models such as Llama2-7B and Mistral-7B when enhanced with the +cache mechanism. We also compare Neurocache with text retrieval methods and +show improvements in single-document question-answering and few-shot learning +tasks. We made the source code available under: +https://github.com/alisafaya/neurocache + +
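+ A minimal sketch of the external-cache idea: past hidden states are stored in
+compressed form, retrieved by kNN for each new token, and the retrieval window is
+widened to include neighboring cache entries. The random-projection compression and
+the cache layout below are illustrative assumptions, not Neurocache's
+implementation.
+```python
+import numpy as np
+
+class StateCache:
+    """Toy external cache of compressed past states with neighbor-extended kNN retrieval."""
+
+    def __init__(self, dim, proj_dim=64, seed=0):
+        rng = np.random.default_rng(seed)
+        self.proj = rng.normal(size=(dim, proj_dim)) / np.sqrt(dim)  # stand-in compression
+        self.keys, self.values = [], []
+
+    def add(self, hidden_state):
+        self.keys.append(hidden_state @ self.proj)
+        self.values.append(hidden_state)
+
+    def retrieve(self, query, k=4, window=1):
+        dists = np.linalg.norm(np.stack(self.keys) - query @ self.proj, axis=1)
+        top = np.argsort(dists)[:k]
+        idx = sorted({j for i in top
+                      for j in range(max(0, i - window), min(len(self.keys), i + window + 1))})
+        return np.stack([self.values[j] for j in idx])
+
+cache = StateCache(dim=512)
+rng = np.random.default_rng(1)
+for _ in range(100):
+    cache.add(rng.normal(size=512))
+retrieved = cache.retrieve(rng.normal(size=512), k=4, window=1)
+```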
+
+ comment: Long paper, published at the main conference NAACL'24 +
+
+
+
+
+ + ☆ RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in + LLMs + + +
+ Large language models (LLMs) typically utilize the top-k contexts from a +retriever in retrieval-augmented generation (RAG). In this work, we propose a +novel instruction fine-tuning framework RankRAG, which instruction-tunes a +single LLM for the dual purpose of context ranking and answer generation in +RAG. In particular, the instruction-tuned LLMs work surprisingly well by adding +a small fraction of ranking data into the training blend, and outperform +existing expert ranking models, including the same LLM exclusively fine-tuned +on a large amount of ranking data. For generation, we compare our model with +many strong baselines, including GPT-4-0613, GPT-4-turbo-2024-0409, and +ChatQA-1.5, an open-sourced model with the state-of-the-art performance on RAG +benchmarks. Specifically, our Llama3-RankRAG significantly outperforms +Llama3-ChatQA-1.5 and GPT-4 models on nine knowledge-intensive benchmarks. In +addition, it also performs comparably to GPT-4 on five RAG benchmarks in the +biomedical domain without instruction fine-tuning on biomedical data, +demonstrating its superb capability for generalization to new domains. + +
+
+
+
+
+ + ☆ Scalable Multi-Output Gaussian Processes with Stochastic Variational + Inference + + +
+ The Multi-Output Gaussian Process (MOGP) is a popular tool for modelling data from
+multiple sources. A typical choice for building a covariance function for an MOGP is
+the Linear Model of Coregionalization (LMC), which parametrically models the
+covariance between outputs. The Latent Variable MOGP (LV-MOGP) generalises this idea
+by modelling the covariance between outputs using a kernel applied to latent
+variables, one per output, leading to a flexible MOGP model that allows efficient
+generalization to new outputs with few data points. Computational complexity in
+LV-MOGP grows linearly with the number of outputs, which makes it unsuitable for
+problems with a large number of outputs. In this paper, we propose a stochastic
+variational inference approach for the LV-MOGP that allows mini-batches for both
+inputs and outputs, making computational complexity per training iteration
+independent of the number of outputs.
+
+
+
+ comment: none +
+
+
+
+
+ + ☆ PWM: Policy Learning with Large World Models + + +
+ Reinforcement Learning (RL) has achieved impressive results on complex tasks +but struggles in multi-task settings with different embodiments. World models +offer scalability by learning a simulation of the environment, yet they often +rely on inefficient gradient-free optimization methods. We introduce Policy +learning with large World Models (PWM), a novel model-based RL algorithm that +learns continuous control policies from large multi-task world models. By +pre-training the world model on offline data and using it for first-order +gradient policy learning, PWM effectively solves tasks with up to 152 action +dimensions and outperforms methods using ground-truth dynamics. Additionally, +PWM scales to an 80-task setting, achieving up to 27% higher rewards than +existing baselines without the need for expensive online planning. +Visualizations and code available at https://policy-world-model.github.io + +
+
+ comment: Visualizations and code available at + https://policy-world-model.github.io +
+
+
+
+
+ + ☆ Decentralized Intelligence Network (DIN) + + +
+ Decentralized Intelligence Network (DIN) addresses the significant challenges +of data sovereignty and AI utilization caused by the fragmentation and siloing +of data across providers and institutions. This comprehensive framework +overcomes access barriers to scalable data sources previously hindered by silos +by leveraging: 1) personal data stores as a prerequisite for data sovereignty; +2) a scalable federated learning protocol implemented on a public blockchain +for decentralized AI training, where data remains with participants and only +model parameter updates are shared; and 3) a scalable, trustless rewards +mechanism to incentivize participation and ensure fair reward distribution. +This framework ensures that no entity can prevent or control access to training +on data offered by participants or determine financial benefits, as these +processes operate on a public blockchain with an immutable record and without a +third party. It supports effective AI training, allowing participants to +maintain control over their data, benefit financially, and contribute to a +decentralized, scalable ecosystem that leverages collective AI to develop +beneficial algorithms. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ PLeaS -- Merging Models with Permutations and Least Squares + + +
+ The democratization of machine learning systems has made the process of fine-tuning
+accessible to a large number of practitioners, leading to a wide range of
+open-source models fine-tuned on specialized tasks and datasets. Recent work has
+proposed to merge such models to combine their functionalities. However, prior
+approaches are restricted to models that are fine-tuned from the same base model.
+Furthermore, the final merged model is typically restricted to be of the same size
+as the original models. In this work, we propose a new two-step algorithm to merge
+models, termed PLeaS, which relaxes these constraints. First, leveraging the
+Permutation symmetries inherent in the two models, PLeaS partially matches nodes in
+each layer by maximizing alignment. Next, PLeaS computes the weights of the merged
+model as a layer-wise Least Squares solution that minimizes the approximation error
+between the features of the merged model and the permuted features of the original
+models. This allows PLeaS to merge models into a single model of a desired size,
+even when the two original models are fine-tuned from different base models. We also
+present a variant of our method which can merge models without using data from the
+fine-tuning domains. We demonstrate our method to merge ResNet models trained with
+shared and different label spaces, and show that we can perform better than the
+state-of-the-art merging methods by 8 to 15 percentage points for the same target
+compute while merging models trained on DomainNet and on fine-grained classification
+tasks.
+
+
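+ To illustrate only the second step under stated assumptions, the sketch below
+solves a ridge-regularized least-squares problem for one layer's merged weights,
+given probe activations and the two models' (already permutation-aligned) outputs;
+the permutation matching step and the handling of different model sizes are omitted.
+```python
+import numpy as np
+
+def merge_layer_weights(probe_in, out_a, out_b, ridge=1e-3):
+    """Least-squares merged weights W so that probe_in @ W approximates the average of
+    the two aligned models' layer outputs on the same probe inputs."""
+    target = 0.5 * (out_a + out_b)
+    d = probe_in.shape[1]
+    gram = probe_in.T @ probe_in + ridge * np.eye(d)
+    return np.linalg.solve(gram, probe_in.T @ target)
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(512, 128))            # probe activations entering the layer
+Wa, Wb = rng.normal(size=(2, 128, 64))     # hypothetical permutation-aligned layer weights
+W_merged = merge_layer_weights(X, X @ Wa, X @ Wb)
+```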
+
+
+
+
+ + ☆ Parameter Matching Attack: Enhancing Practical Applicability of + Availability Attacks + + +
+ The widespread use of personal data for training machine learning models raises
+significant privacy concerns, as individuals have limited control over how their
+public data is subsequently utilized. Availability attacks have emerged as a means
+for data owners to safeguard their data by designing imperceptible perturbations
+that degrade model performance when incorporated into training datasets. However,
+existing availability attacks exhibit limitations in practical applicability,
+particularly when only a portion of the data can be perturbed. To address this
+challenge, we propose a novel availability attack approach termed Parameter Matching
+Attack (PMA). PMA is the first availability attack that works when only a portion of
+the data can be perturbed. PMA optimizes perturbations so that when the model is
+trained on a mixture of clean and perturbed data, the resulting model will approach
+a model designed to perform poorly. Experimental results across four datasets
+demonstrate that PMA outperforms existing methods, achieving significant model
+performance degradation when a part of the training data is perturbed. Our code is
+available in the supplementary.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Evaluating the Robustness of Adverse Drug Event Classification Models + Using Templates ACL + + +
+ An adverse drug effect (ADE) is any harmful event resulting from medical drug +treatment. Despite their importance, ADEs are often under-reported in official +channels. Some research has therefore turned to detecting discussions of ADEs +in social media. Impressive results have been achieved in various attempts to +detect ADEs. In a high-stakes domain such as medicine, however, an in-depth +evaluation of a model's abilities is crucial. We address the issue of thorough +performance evaluation in English-language ADE detection with hand-crafted +templates for four capabilities: Temporal order, negation, sentiment, and +beneficial effect. We find that models with similar performance on held-out +test sets have varying results on these capabilities. + +
+
+ comment: Accepted at BioNLP 2024 and Shared Tasks (ACL Workshop) +
+
+
+
+
+ + ☆ On the Robustness of Graph Reduction Against GNN Backdoor + + +
+ Graph Neural Networks (GNNs) are gaining popularity across various domains +due to their effectiveness in learning graph-structured data. Nevertheless, +they have been shown to be susceptible to backdoor poisoning attacks, which +pose serious threats to real-world applications. Meanwhile, graph reduction +techniques, including coarsening and sparsification, which have long been +employed to improve the scalability of large graph computational tasks, have +recently emerged as effective methods for accelerating GNN training on +large-scale graphs. However, the current development and deployment of graph +reduction techniques for large graphs overlook the potential risks of data +poisoning attacks against GNNs. It is not yet clear how graph reduction +interacts with existing backdoor attacks. This paper conducts a thorough +examination of the robustness of graph reduction methods in scalable GNN +training in the presence of state-of-the-art backdoor attacks. We performed a +comprehensive robustness analysis across six coarsening methods and six +sparsification methods for graph reduction, under three GNN backdoor attacks +against three GNN architectures. Our findings indicate that the effectiveness +of graph reduction methods in mitigating attack success rates varies +significantly, with some methods even exacerbating the attacks. Through +detailed analyses of triggers and poisoned nodes, we interpret our findings and +enhance our understanding of how graph reduction interacts with backdoor +attacks. These results highlight the critical need for incorporating robustness +considerations in graph reduction for GNN training, ensuring that enhancements +in computational efficiency do not compromise the security of GNN systems. + +
+
+
+
+
+ + ☆ Meta 3D TextureGen: Fast and Consistent Texture Generation for 3D + Objects + + +
+ The recent availability and adaptability of text-to-image models has sparked +a new era in many related domains that benefit from the learned text priors as +well as high-quality and fast generation capabilities, one of which is texture +generation for 3D objects. Although recent texture generation methods achieve +impressive results by using text-to-image networks, the combination of global +consistency, quality, and speed, which is crucial for advancing texture +generation to real-world applications, remains elusive. To that end, we +introduce Meta 3D TextureGen: a new feedforward method comprised of two +sequential networks aimed at generating high-quality and globally consistent +textures for arbitrary geometries of any complexity degree in less than 20 +seconds. Our method achieves state-of-the-art results in quality and speed by +conditioning a text-to-image model on 3D semantics in 2D space and fusing them +into a complete and high-resolution UV texture map, as demonstrated by +extensive qualitative and quantitative evaluations. In addition, we introduce a +texture enhancement network that is capable of up-scaling any texture by an +arbitrary ratio, producing 4k pixel resolution textures. + +
+
+
+
+
+ + ☆ A Pattern Language for Machine Learning Tasks + + +
+ Idealised as universal approximators, learners such as neural networks can be +viewed as "variable functions" that may become one of a range of concrete +functions after training. In the same way that equations constrain the possible +values of variables in algebra, we may view objective functions as constraints +on the behaviour of learners. We extract the equivalences perfectly optimised +objective functions impose, calling them "tasks". For these tasks, we develop a +formal graphical language that allows us to: (1) separate the core tasks of a +behaviour from its implementation details; (2) reason about and design +behaviours model-agnostically; and (3) simply describe and unify approaches in +machine learning across domains. + As proof-of-concept, we design a novel task that enables converting +classifiers into generative models we call "manipulators", which we implement +by directly translating task specifications into code. The resulting models +exhibit capabilities such as style transfer and interpretable latent-space +editing, without the need for custom architectures, adversarial training or +random sampling. We formally relate the behaviour of manipulators to GANs, and +empirically demonstrate their competitive performance with VAEs. We report on +experiments across vision and language domains aiming to characterise +manipulators as approximate Bayesian inversions of discriminative classifiers. + +
+
+
+
+
+ + ☆ On the Anatomy of Attention + + +
+ We introduce a category-theoretic diagrammatic formalism in order to +systematically relate and reason about machine learning models. Our diagrams +present architectures intuitively but without loss of essential detail, where +natural relationships between models are captured by graphical transformations, +and important differences and similarities can be identified at a glance. In +this paper, we focus on attention mechanisms: translating folklore into +mathematical derivations, and constructing a taxonomy of attention variants in +the literature. As a first example of an empirical investigation underpinned by +our formalism, we identify recurring anatomical components of attention, which +we exhaustively recombine to explore a space of variations on the attention +mechanism. + +
+
+
+
+
+ + ☆ Quantum Curriculum Learning + + +
+ Quantum machine learning (QML) requires significant quantum resources to +achieve quantum advantage. Research should prioritize both the efficient design +of quantum architectures and the development of learning strategies to optimize +resource usage. We propose a framework called quantum curriculum learning +(Q-CurL) for quantum data, where the curriculum introduces simpler tasks or +data to the learning model before progressing to more challenging ones. We +define the curriculum criteria based on the data density ratio between tasks to +determine the curriculum order. We also implement a dynamic learning schedule +to emphasize the significance of quantum data in optimizing the loss function. +Empirical evidence shows that Q-CurL enhances the training convergence and the +generalization for unitary learning tasks and improves the robustness of +quantum phase recognition tasks. Our framework provides a general learning +strategy, bringing QML closer to realizing practical advantages. + +
+
+ comment: main 5 pages, supplementary materials 6 pages +
+
+
+
+
+ + ☆ CEB: Compositional Evaluation Benchmark for Fairness in Large Language + Models + + +
+ As Large Language Models (LLMs) are increasingly deployed to handle various +natural language processing (NLP) tasks, concerns regarding the potential +negative societal impacts of LLM-generated content have also arisen. To +evaluate the biases exhibited by LLMs, researchers have recently proposed a +variety of datasets. However, existing bias evaluation efforts often focus on +only a particular type of bias and employ inconsistent evaluation metrics, +leading to difficulties in comparison across different datasets and LLMs. To +address these limitations, we collect a variety of datasets designed for the +bias evaluation of LLMs, and further propose CEB, a Compositional Evaluation +Benchmark that covers different types of bias across different social groups +and tasks. The curation of CEB is based on our newly proposed compositional +taxonomy, which characterizes each dataset from three dimensions: bias types, +social groups, and tasks. By combining the three dimensions, we develop a +comprehensive evaluation strategy for the bias in LLMs. Our experiments +demonstrate that the levels of bias vary across these dimensions, thereby +providing guidance for the development of specific bias mitigation methods. + +
+
+ comment: 37 pages, 32 figures +
+
+
+
+
+ + ☆ Tiny-PULP-Dronets: Squeezing Neural Networks for Faster and Lighter + Inference on Multi-Tasking Autonomous Nano-Drones + + +
+ Pocket-sized autonomous nano-drones can revolutionize many robotic use cases, +such as visual inspection in narrow, constrained spaces, and ensure safer +human-robot interaction due to their tiny form factor and weight -- i.e., tens +of grams. This compelling vision is challenged by the high level of +intelligence needed aboard, which clashes against the limited computational and +storage resources available on PULP (parallel-ultra-low-power) MCU class +navigation and mission controllers that can be hosted aboard. This work moves +from PULP-Dronet, a State-of-the-Art convolutional neural network for +autonomous navigation on nano-drones. We introduce Tiny-PULP-Dronet: a novel +methodology to squeeze by more than one order of magnitude model size (50x +fewer parameters), and number of operations (27x less multiply-and-accumulate) +required to run inference with similar flight performance as PULP-Dronet. This +massive reduction paves the way towards affordable multi-tasking on +nano-drones, a fundamental requirement for achieving high-level intelligence. + +
+
+ comment: 3 Figures, 1 table. Accepted for publication at IEEE Artificial + Intelligence Circuits and Systems (AICAS), 2022 +
+
+
+
+
+ + ☆ Uncertainty-Aware Decarbonization for Datacenters + + +
+ This paper represents the first effort to quantify uncertainty in carbon +intensity forecasting for datacenter decarbonization. We identify and analyze +two types of uncertainty -- temporal and spatial -- and discuss their system +implications. To address the temporal dynamics in quantifying uncertainty for +carbon intensity forecasting, we introduce a conformal prediction-based +framework. Evaluation results show that our technique robustly achieves target +coverages in uncertainty quantification across various significance levels. We +conduct two case studies using production power traces, focusing on temporal +and spatial load shifting respectively. The results show that incorporating +uncertainty into scheduling decisions can prevent a 5% and 14% increase in +carbon emissions, respectively. These percentages translate to an absolute +reduction of 2.1 and 10.4 tons of carbon emissions in a 20 MW datacenter +cluster. + +
+
+
+
+
+ + ☆ SafaRi:Adaptive Sequence Transformer for Weakly Supervised Referring + Expression Segmentation ECCV 2024 + + +
+ Referring Expression Segmentation (RES) aims to provide a segmentation mask +of the target object in an image referred to by the text (i.e., referring +expression). Existing methods require large-scale mask annotations. Moreover, +such approaches do not generalize well to unseen/zero-shot scenarios. To +address the aforementioned issues, we propose a weakly-supervised bootstrapping +architecture for RES with several new algorithmic innovations. To the best of +our knowledge, ours is the first approach that considers only a fraction of +both mask and box annotations (shown in Figure 1 and Table 1) for training. To +enable principled training of models in such low-annotation settings, improve +image-text region-level alignment, and further enhance spatial localization of +the target object in the image, we propose Cross-modal Fusion with Attention +Consistency module. For automatic pseudo-labeling of unlabeled samples, we +introduce a novel Mask Validity Filtering routine based on a spatially aware +zero-shot proposal scoring approach. Extensive experiments show that with just +30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to +58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR +respectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also +outperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a +fully-supervised setting and demonstrates strong generalization capabilities in +unseen/zero-shot tasks. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Two-Step Q-Learning + + +
+ Q-learning is a stochastic approximation version of classic value
+iteration. The literature has established that Q-learning suffers from both
+maximization bias and slow convergence. Recently, multi-step algorithms have
+shown practical advantages over existing methods. This paper proposes a novel
+off-policy two-step Q-learning algorithm that does not require importance
+sampling. Under suitable assumptions, we show that the iterates of the
+proposed two-step Q-learning are bounded and converge almost surely to the
+optimal Q-values. This study also addresses the convergence analysis of the
+smooth version of two-step Q-learning, obtained by replacing the max function
+with the log-sum-exp function. The proposed algorithms are robust and easy to
+implement. Finally, we test the proposed algorithms on benchmark problems such
+as the roulette problem, the maximization bias problem, and randomly generated
+Markov decision processes, and compare them with existing methods from the
+literature. Numerical experiments demonstrate the superior performance of both
+two-step Q-learning and its smooth variants.
+
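+ The "smooth" variant mentioned above replaces the max over next-state action
+values with a log-sum-exp softening. The sketch below only illustrates that
+substitution in a plain tabular update; the paper's actual two-step update
+rule differs, and the table layout, step sizes, and temperature are
+illustrative assumptions.
+
+# Tabular Q-learning update with the max -> log-sum-exp softening (illustrative only).
+import numpy as np
+
+def smooth_max(q_values, beta=10.0):
+    # log-sum-exp approaches max(q_values) as beta grows.
+    return np.log(np.sum(np.exp(beta * q_values))) / beta
+
+def q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99, beta=10.0):
+    # Q is a (num_states, num_actions) array; s, a, s_next are integer indices.
+    target = r + gamma * smooth_max(Q[s_next], beta)
+    Q[s, a] += alpha * (target - Q[s, a])
+    return Q
+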
+
+
+
+
+ + ☆ Fast, Scalable, Energy-Efficient Non-element-wise Matrix Multiplication + on FPGA + + +
+ Modern Neural Network (NN) architectures heavily rely on vast numbers of
+multiply-accumulate arithmetic operations, which constitute the predominant
+computational cost. Therefore, this paper proposes a high-throughput,
+scalable, and energy-efficient non-element-wise matrix multiplication unit on
+FPGAs as a basic component of NNs. We first streamline the inter-layer and
+intra-layer redundancies of the MADDNESS algorithm, a LUT-based approximate
+matrix multiplication, to design a fast, efficient, and scalable approximate
+matrix multiplication module termed the "Approximate Multiplication Unit
+(AMU)". The AMU further optimizes LUT-based matrix multiplications through
+dedicated memory management and access design, decoupling the computational
+overhead from the input resolution and significantly boosting the efficiency
+of FPGA-based NN accelerators. The experimental results show that our AMU
+achieves up to 9x higher throughput and 112x higher energy efficiency over
+state-of-the-art solutions for FPGA-based Quantised Neural Network (QNN)
+accelerators.
+
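+ The LUT-based idea behind MADDNESS-style approximate matrix multiplication
+can be sketched as follows: each input subspace is encoded to its nearest
+prototype, and dot products become precomputed table lookups. This heavily
+simplified NumPy sketch is not the AMU or MADDNESS implementation --
+prototypes are assumed given (MADDNESS learns them with hashing trees), and
+all FPGA-specific optimizations are omitted.
+
+# LUT-style approximate matmul sketch: encode, then look up and accumulate.
+import numpy as np
+
+def build_luts(W, prototypes):
+    # prototypes: (n_sub, n_protos, sub_dim); W: (D, N) with D = n_sub * sub_dim.
+    n_sub, n_protos, sub_dim = prototypes.shape
+    W_split = W.reshape(n_sub, sub_dim, -1)
+    # luts[c, k, n] = prototypes[c, k] . W_split[c, :, n]
+    return np.einsum('cks,csn->ckn', prototypes, W_split)
+
+def approx_matmul(X, prototypes, luts):
+    n_sub, n_protos, sub_dim = prototypes.shape
+    X_split = X.reshape(X.shape[0], n_sub, sub_dim)
+    # Encode: nearest prototype per subspace (squared Euclidean distance).
+    d = ((X_split[:, :, None, :] - prototypes[None]) ** 2).sum(-1)
+    codes = d.argmin(-1)                                  # (batch, n_sub)
+    # Replace multiplications by table lookups and accumulate over subspaces.
+    return sum(luts[c, codes[:, c]] for c in range(n_sub))
+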
+
+
+
+
+ + ☆ Enable the Right to be Forgotten with Federated Client Unlearning in + Medical Imaging + + +
+ The right to be forgotten, as stated in most data regulations, poses an +underexplored challenge in federated learning (FL), leading to the development +of federated unlearning (FU). However, current FU approaches often face +trade-offs between efficiency, model performance, forgetting efficacy, and +privacy preservation. In this paper, we delve into the paradigm of Federated +Client Unlearning (FCU) to guarantee a client the right to erase the +contribution or the influence, introducing the first FU framework in medical +imaging. In the unlearning process of a client, the proposed model-contrastive +unlearning marks a pioneering step towards feature-level unlearning, and +frequency-guided memory preservation ensures smooth forgetting of local +knowledge while maintaining the generalizability of the trained global model, +thus avoiding performance compromises and guaranteeing rapid post-training. We +evaluated our FCU framework on two public medical image datasets, including +Intracranial hemorrhage diagnosis and skin lesion diagnosis, demonstrating that +our framework outperformed other state-of-the-art FU frameworks, with an +expected speed-up of 10-15 times compared with retraining from scratch. The +code and the organized datasets can be found at: +https://github.com/dzp2095/FCU. + +
+
+
+
+
+ + ☆ Revisiting Cascaded Ensembles for Efficient Inference ICML 2024 + + +
+ A common approach to make machine learning inference more efficient is to use
+example-specific adaptive schemes, which route or select models for each
+example at inference time. In this work, we study a simple scheme for adaptive
+inference. We build a cascade of ensembles (CoE), beginning with
+resource-efficient models and growing to larger, more expressive models, where
+ensemble agreement serves as a data-dependent routing criterion. This scheme
+is easy to incorporate into existing inference pipelines, requires no
+additional training, and can be used to place models across multiple resource
+tiers -- for instance, serving efficient models at the edge and invoking
+larger models in the cloud only when necessary. In cases where parallel
+inference is feasible, we show that CoE can improve accuracy relative to the
+single best model while reducing the average cost of inference by up to 7x,
+and provides Pareto-dominant solutions in accuracy and efficiency relative to
+existing adaptive inference baselines. These savings translate to an over 3x
+reduction in total monetary cost when performing inference using a
+heterogeneous cluster of GPUs. Finally, for edge inference scenarios where
+portions of the cascade reside at the edge vs. in the cloud, CoE can provide a
+14x reduction in communication cost and inference latency without sacrificing
+accuracy.
+
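+ The agreement-based routing described above can be sketched in a few lines:
+run the cheapest ensemble first, return its answer if its members agree, and
+otherwise escalate to the next tier. The unanimity rule and the final fallback
+below are illustrative assumptions rather than the paper's exact criterion.
+
+# Hedged sketch of a cascade of ensembles with agreement-based routing.
+from collections import Counter
+
+def cascade_predict(x, tiers):
+    # tiers: list of ensembles (lists of callables), ordered cheap -> expensive.
+    for ensemble in tiers:
+        preds = [model(x) for model in ensemble]
+        label, votes = Counter(preds).most_common(1)[0]
+        if votes == len(preds):      # unanimous agreement -> stop early
+            return label
+    return label                     # fall back to the last tier's majority vote
+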
+
+ comment: ES-FOMO, ICML 2024 +
+
+
+
+
+ + ☆ CALICO: Confident Active Learning with Integrated Calibration ICANN2024 + + +
+ The growing use of deep learning in safety-critical applications, such as +medical imaging, has raised concerns about limited labeled data, where this +demand is amplified as model complexity increases, posing hurdles for domain +experts to annotate data. In response to this, active learning (AL) is used to +efficiently train models with limited annotation costs. In the context of deep +neural networks (DNNs), AL often uses confidence or probability outputs as a +score for selecting the most informative samples. However, modern DNNs exhibit +unreliable confidence outputs, making calibration essential. We propose an AL +framework that self-calibrates the confidence used for sample selection during +the training process, referred to as Confident Active Learning with Integrated +CalibratiOn (CALICO). CALICO incorporates the joint training of a classifier +and an energy-based model, instead of the standard softmax-based classifier. +This approach allows for simultaneous estimation of the input data distribution +and the class probabilities during training, improving calibration without +needing an additional labeled dataset. Experimental results showcase improved +classification performance compared to a softmax-based classifier with fewer +labeled samples. Furthermore, the calibration stability of the model is +observed to depend on the prior class distribution of the data. + +
+
+ comment: Accepted to ICANN2024 +
+
+
+
+
+ + ☆ QSync: Quantization-Minimized Synchronous Distributed Training Across + Hybrid Devices + + +
+ A number of production deep learning clusters have attempted to explore
+inference hardware for DNN training during off-peak serving hours, when many
+inference GPUs sit idle. Conducting DNN training with a combination of
+heterogeneous training and inference GPUs, known as hybrid device training,
+presents considerable challenges due to disparities in compute capability and
+significant differences in memory capacity. We propose QSync, a training
+system that enables efficient synchronous data-parallel DNN training over
+hybrid devices by strategically exploiting quantized operators. According to
+each device's available resource capacity, QSync selects a
+quantization-minimized setting for operators in the distributed DNN training
+graph, minimizing model accuracy degradation while keeping the training
+efficiency brought by quantization. We carefully design a predictor with a
+bi-directional mixed-precision indicator to reflect the sensitivity of DNN
+layers to fixed-point and floating-point low-precision operators, a replayer
+with a neighborhood-aware cost mapper to accurately estimate the latency of
+distributed hybrid mixed-precision training, and an allocator that efficiently
+synchronizes workers with minimized model accuracy degradation. QSync bridges
+the computational graph in PyTorch to an optimized backend for quantization
+kernel performance and flexible support for various GPU architectures.
+Extensive experiments show that QSync's predictor can accurately simulate
+distributed mixed-precision training with <5% error, with a consistent
+0.27-1.03% accuracy improvement over from-scratch training tasks compared to
+uniform precision.
+
+
+ comment: IPDPS 24 +
+
+
+
+
+ + ☆ Stochastic Differential Equations models for Least-Squares Stochastic + Gradient Descent + + +
+ We study the dynamics of a continuous-time model of the Stochastic Gradient +Descent (SGD) for the least-square problem. Indeed, pursuing the work of Li et +al. (2019), we analyze Stochastic Differential Equations (SDEs) that model SGD +either in the case of the training loss (finite samples) or the population one +(online setting). A key qualitative feature of the dynamics is the existence of +a perfect interpolator of the data, irrespective of the sample size. In both +scenarios, we provide precise, non-asymptotic rates of convergence to the +(possibly degenerate) stationary distribution. Additionally, we describe this +asymptotic distribution, offering estimates of its mean, deviations from it, +and a proof of the emergence of heavy-tails related to the step-size magnitude. +Numerical simulations supporting our findings are also presented. + +
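+ A minimal way to picture such an SDE surrogate for SGD is an Euler-Maruyama
+simulation of d(theta) = -grad L(theta) dt + sqrt(eta) * sigma dW for the
+least-squares loss. The constant isotropic diffusion term below is an
+illustrative assumption; the paper derives the appropriate diffusion
+coefficient from the minibatch noise covariance.
+
+# Euler-Maruyama sketch of a diffusion model of SGD on least squares (illustrative).
+import numpy as np
+
+def simulate_sgd_sde(A, b, theta0, eta=0.01, sigma=0.1, steps=10_000, dt=0.01, seed=0):
+    rng = np.random.default_rng(seed)
+    theta = np.asarray(theta0, dtype=float).copy()
+    for _ in range(steps):
+        grad = A.T @ (A @ theta - b) / len(b)   # least-squares gradient
+        noise = rng.standard_normal(theta.shape)
+        theta += -grad * dt + np.sqrt(eta * dt) * sigma * noise
+    return theta
+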
+
+
+
+
+ + ☆ Semantically Guided Representation Learning For Action Anticipation ECCV'24 + + +
+ Action anticipation is the task of forecasting future activity from a +partially observed sequence of events. However, this task is exposed to +intrinsic future uncertainty and the difficulty of reasoning upon +interconnected actions. Unlike previous works that focus on extrapolating +better visual and temporal information, we concentrate on learning action +representations that are aware of their semantic interconnectivity based on +prototypical action patterns and contextual co-occurrences. To this end, we +propose the novel Semantically Guided Representation Learning (S-GEAR) +framework. S-GEAR learns visual action prototypes and leverages language models +to structure their relationship, inducing semanticity. To gather insights on +S-GEAR's effectiveness, we test it on four action anticipation benchmarks, +obtaining improved results compared to previous works: +3.5, +2.7, and +3.5 +absolute points on Top-1 Accuracy on Epic-Kitchen 55, EGTEA Gaze+ and 50 +Salads, respectively, and +0.8 on Top-5 Recall on Epic-Kitchens 100. We further +observe that S-GEAR effectively transfers the geometric associations between +actions from language to visual prototypes. Finally, S-GEAR opens new research +frontiers in anticipation tasks by demonstrating the intricate impact of action +semantic interconnectivity. + +
+
+ comment: Accepted as a full paper at ECCV'24 with Paper ID #4140 +
+
+
+
+
+ + ☆ How to Boost Any Loss Function + + +
+ Boosting is a highly successful ML-born optimization setting in which one is
+required to computationally efficiently learn arbitrarily good models based on
+access to a weak learner oracle, providing classifiers that perform at least
+slightly differently from random guessing. A key difference with
+gradient-based optimization is that boosting's original model does not require
+access to first-order information about a loss, yet the decades-long history
+of boosting has quickly evolved it into a first-order optimization setting --
+sometimes even wrongfully \textit{defining} it as such. Owing to recent
+progress extending gradient-based optimization to use only a loss' zeroth
+($0^{th}$) order information to learn, this begs the question: what loss
+functions can be efficiently optimized with boosting, and what is the
+information really needed for boosting to meet the \textit{original} boosting
+blueprint's requirements?
+ We provide a constructive formal answer essentially showing that \textit{any}
+loss function can be optimized with boosting, and thus boosting can achieve a
+feat not yet known to be possible in the classical $0^{th}$ order setting,
+since loss functions are not required to be convex, differentiable, or
+Lipschitz -- and in fact not required to be continuous either. Some tools we
+use are rooted in quantum calculus, the mathematical field -- not to be
+confused with quantum computation -- that studies calculus without passing to
+the limit, and thus without using first-order information.
+
+
+
+
+
+ + ☆ Learning Paradigms and Modelling Methodologies for Digital Twins in + Process Industry + + +
+ Central to the digital transformation of the process industry are Digital
+Twins (DTs), virtual replicas of physical manufacturing systems that combine
+sensor data with sophisticated data-based or physics-based models, or a
+combination thereof, to tackle a variety of industry-relevant tasks like
+process monitoring, predictive control or decision support. The backbone of a
+DT, i.e. the concrete modelling methodologies and architectural frameworks
+supporting these models, are complex, diverse and evolve fast, necessitating a
+thorough understanding of the latest state-of-the-art methods and trends to
+stay on top of a highly competitive market. From a research perspective,
+despite the high research interest in reviewing various aspects of DTs,
+structured literature reports specifically focusing on unravelling the
+utilized learning paradigms (e.g. self-supervised learning) for DT-creation in
+the process industry are a novel contribution in this field. This study aims
+to address these gaps by (1) systematically analyzing the modelling
+methodologies (e.g. Convolutional Neural Network, Encoder-Decoder, Hidden
+Markov Model) and paradigms (e.g. data-driven, physics-based, hybrid) used for
+DT-creation; (2) assessing the utilized learning strategies (e.g. supervised,
+unsupervised, self-supervised); (3) analyzing the type of modelling task (e.g.
+regression, classification, clustering); and (4) identifying the challenges
+and research gaps, as well as discussing potential resolutions.
+
+
+
+
+
+ + ☆ Improving Explainability of Softmax Classifiers Using a Prototype-Based + Joint Embedding Method + + +
+ We propose a prototype-based approach for improving explainability of softmax +classifiers that provides an understandable prediction confidence, generated +through stochastic sampling of prototypes, and demonstrates potential for out +of distribution detection (OOD). By modifying the model architecture and +training to make predictions using similarities to any set of class examples +from the training dataset, we acquire the ability to sample for prototypical +examples that contributed to the prediction, which provide an instance-based +explanation for the model's decision. Furthermore, by learning relationships +between images from the training dataset through relative distances within the +model's latent space, we obtain a metric for uncertainty that is better able to +detect out of distribution data than softmax confidence. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ IFTT-PIN: A Self-Calibrating PIN-Entry Method + + +
+ Personalising an interface to the needs and preferences of a user often +incurs additional interaction steps. In this paper, we demonstrate a novel +method that enables the personalising of an interface without the need for +explicit calibration procedures, via a process we call self-calibration. A +second-order effect of self-calibration is that an outside observer cannot +easily infer what a user is trying to achieve because they cannot interpret the +user's actions. To explore this security angle, we developed IFTT-PIN (If This +Then PIN) as the first self-calibrating PIN-entry method. When using IFTT-PIN, +users are free to choose any button for any meaning without ever explicitly +communicating their choice to the machine. IFTT-PIN infers both the user's PIN +and their preferred button mapping at the same time. This paper presents the +concept, implementation, and interactive demonstrations of IFTT-PIN, as well as +an evaluation against shoulder surfing attacks. Our study (N=24) shows that by +adding self-calibration to an existing PIN entry method, IFTT-PIN statistically +significantly decreased PIN attack decoding rate by ca. 8.5 times (p=1.1e-9), +while only decreasing the PIN entry encoding rate by ca. 1.4 times (p=0.02), +leading to a positive security-usability trade-off. IFTT-PIN's entry rate +significantly improved 21 days after first exposure (p=3.6e-6) to the method, +suggesting self-calibrating interfaces are memorable despite using an initially +undefined user interface. Self-calibration methods might lead to novel +opportunities for interaction that are more inclusive and versatile, a +potentially interesting challenge for the community. A short introductory video +is available at https://youtu.be/pP5sfniNRns. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2205.09534 +
+
+
+
+
+ + ☆ DrugCLIP: Contrastive Drug-Disease Interaction For Drug Repurposing + + +
+ Bringing a novel drug from the original idea to market typically requires
+more than ten years and billions of dollars. To alleviate this heavy burden, a
+natural idea is to reuse approved drugs to treat new diseases. This process is
+also known as drug repurposing or drug repositioning. Machine learning methods
+have exhibited huge potential in automating drug repurposing. However, they
+still encounter some challenges, such as the lack of labels and multimodal
+feature representations. To address these issues, we design DrugCLIP, a
+cutting-edge contrastive learning method, to learn drug-disease interactions
+without negative labels. Additionally, we have curated a drug repurposing
+dataset based on real-world clinical trial records. Thorough empirical studies
+are conducted to validate the effectiveness of the proposed DrugCLIP method.
+
+
+
+
+
+ + ☆ FreeCG: Free the Design Space of Clebsch-Gordan Transform for machine + learning force field + + +
+ The Clebsch-Gordan Transform (CG transform) effectively encodes many-body +interactions. Many studies have proven its accuracy in depicting atomic +environments, although this comes with high computational needs. The +computational burden of this challenge is hard to reduce due to the need for +permutation equivariance, which limits the design space of the CG transform +layer. We show that, implementing the CG transform layer on +permutation-invariant inputs allows complete freedom in the design of this +layer without affecting symmetry. Developing further on this premise, our idea +is to create a CG transform layer that operates on permutation-invariant +abstract edges generated from real edge information. We bring in group CG +transform with sparse path, abstract edges shuffling, and attention enhancer to +form a powerful and efficient CG transform layer. Our method, known as FreeCG, +achieves State-of-The-Art (SoTA) results in force prediction for MD17, rMD17, +MD22, and property prediction in QM9 datasets with notable enhancement. It +introduces a novel paradigm for carrying out efficient and expressive CG +transform in future geometric neural network designs. + +
+
+
+
+
+ + ☆ SiamTST: A Novel Representation Learning Framework for Enhanced + Multivariate Time Series Forecasting applied to Telco Networks + + +
+ We introduce SiamTST, a novel representation learning framework for +multivariate time series. SiamTST integrates a Siamese network with attention, +channel-independent patching, and normalization techniques to achieve superior +performance. Evaluated on a real-world industrial telecommunication dataset, +SiamTST demonstrates significant improvements in forecasting accuracy over +existing methods. Notably, a simple linear network also shows competitive +performance, achieving the second-best results, just behind SiamTST. The code +is available at https://github.com/simenkristoff/SiamTST. + +
+
+ comment: 14 pages, 3 figures, public codebase +
+
+
+
+
+ + ☆ Parameter-Selective Continual Test-Time Adaptation + + +
+ Continual Test-Time Adaptation (CTTA) aims to adapt a pretrained model to
+ever-changing environments during test time under continuous domain shifts.
+Most existing CTTA approaches are based on the Mean Teacher (MT) structure,
+which contains a student and a teacher model, where the student is updated
+using the pseudo-labels from the teacher model, and the teacher is then
+updated via an exponential moving average strategy. However, these methods
+update the MT model indiscriminately on all parameters of the model. That is,
+some critical parameters carrying knowledge shared across different domains
+may be erased, intensifying error accumulation and catastrophic forgetting. In
+this paper, we introduce the Parameter-Selective Mean Teacher (PSMT) method,
+which is capable of effectively updating the critical parameters within the MT
+network under domain shifts. First, we introduce a selective distillation
+mechanism in the student model, which utilizes past knowledge to regularize
+novel knowledge, thereby mitigating the impact of error accumulation. Second,
+to avoid catastrophic forgetting, in the teacher model, we create a mask
+through Fisher information to selectively update parameters via exponential
+moving average, with preservation measures applied to crucial parameters.
+Extensive experimental results verify that PSMT outperforms state-of-the-art
+methods across multiple benchmark datasets. Our code is available at
+\url{https://github.com/JiaxuTian/PSMT}.
+
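+ The Fisher-masked teacher update can be pictured with the short PyTorch
+sketch below: parameters whose Fisher importance exceeds a threshold keep
+their previous value, while the rest follow the usual exponential moving
+average of the student. The mask construction, threshold, and momentum here
+are illustrative assumptions, not the authors' exact procedure.
+
+# Illustrative Fisher-masked EMA update of a teacher model (not the paper's code).
+import torch
+
+@torch.no_grad()
+def masked_ema_update(teacher, student, fisher, threshold, momentum=0.999):
+    # fisher: dict mapping parameter name -> tensor of Fisher importances (assumed given).
+    for (name, t_param), s_param in zip(teacher.named_parameters(), student.parameters()):
+        keep = (fisher[name] > threshold).float()   # 1 = crucial, preserve old value
+        ema = momentum * t_param + (1 - momentum) * s_param
+        t_param.copy_(keep * t_param + (1 - keep) * ema)
+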
+
+ comment: 17pages, 4 figures +
+
+
+
+
+ + ☆ MALT Powers Up Adversarial Attacks + + +
+ Current adversarial attacks for multi-class classifiers choose the target +class for a given input naively, based on the classifier's confidence levels +for various target classes. We present a novel adversarial targeting method, +\textit{MALT - Mesoscopic Almost Linearity Targeting}, based on medium-scale +almost linearity assumptions. Our attack wins over the current state of the art +AutoAttack on the standard benchmark datasets CIFAR-100 and ImageNet and for a +variety of robust models. In particular, our attack is \emph{five times faster} +than AutoAttack, while successfully matching all of AutoAttack's successes and +attacking additional samples that were previously out of reach. We then prove +formally and demonstrate empirically that our targeting method, although +inspired by linear predictors, also applies to standard non-linear models. + +
+
+
+
+
+ + ☆ MIREncoder: Multi-modal IR-based Pretrained Embeddings for Performance + Optimizations + + +
+ One of the primary areas of interest in High Performance Computing is the
+improvement of performance of parallel workloads. Nowadays, compilable source
+code-based optimization tasks that employ deep learning often exploit LLVM
+Intermediate Representations (IRs) for extracting features from source code.
+Most such works target specific tasks, or are designed with a pre-defined set
+of heuristics. So far, pre-trained models are rare in this domain, but the
+possibilities have been widely discussed. In particular, approaches mimicking
+large language models (LLMs) have been proposed, but these have prohibitively
+large training costs. In this paper, we propose MIREncoder, a Multi-modal
+IR-based Auto-Encoder that can be pre-trained to generate a learned embedding
+space to be used for downstream tasks by machine learning-based approaches. A
+multi-modal approach enables us to better extract features from compilable
+programs. It allows us to better model code syntax, semantics and structure.
+For code-based performance optimizations, these features are very important
+while making optimization decisions. A pre-trained model/embedding implicitly
+enables the usage of transfer learning, and helps move away from task-specific
+trained models. Additionally, a pre-trained model used for downstream
+performance optimization should itself have reduced overhead, and be easily
+usable. These considerations have led us to propose a modeling approach that
+i) understands code semantics and structure, ii) enables use of transfer
+learning, and iii) is small and simple enough to be easily re-purposed or
+reused even with low resource availability. Our evaluations show that our
+proposed approach can outperform the state of the art while reducing overhead.
+
+
+ comment: 12 pages, 6 figures, 9 tables, PACT '24 conference +
+
+
+
+
+ + ☆ Synthetic Multimodal Question Generation + + +
+ Multimodal Retrieval Augmented Generation (MMRAG) is a powerful approach to +question-answering over multimodal documents. A key challenge with evaluating +MMRAG is the paucity of high-quality datasets matching the question styles and +modalities of interest. In light of this, we propose SMMQG, a synthetic data +generation framework. SMMQG leverages interplay between a retriever, large +language model (LLM) and large multimodal model (LMM) to generate question and +answer pairs directly from multimodal documents, with the questions conforming +to specified styles and modalities. We use SMMQG to generate an MMRAG dataset +of 1024 questions over Wikipedia documents and evaluate state-of-the-art models +using it, revealing insights into model performance that are attainable only +through style- and modality-specific evaluation data. Next, we measure the +quality of data produced by SMMQG via a human study. We find that the quality +of our synthetic data is on par with the quality of the crowdsourced benchmark +MMQA and that downstream evaluation results using both datasets strongly +concur. + +
+
+ comment: Submitted to ARR June 2024 +
+
+
+
+
+ + ☆ Safety-Driven Deep Reinforcement Learning Framework for Cobots: A + Sim2Real Approach + + +
+ This study presents a novel methodology incorporating safety constraints into
+a robotic simulation during the training of deep reinforcement learning (DRL).
+The framework integrates specific parts of the safety requirements, such as
+velocity constraints, as specified by ISO 10218, directly within the DRL
+model, where they become a part of the robot's learning algorithm. The study
+then evaluated the efficiency of these safety constraints by subjecting the
+DRL model to various scenarios, including grasping tasks with and without
+obstacle avoidance. The validation process involved comprehensive
+simulation-based testing of the DRL model's responses to potential hazards and
+its compliance. The system's performance is also assessed against the
+functional safety standard IEC 61508 to determine the safety integrity level.
+The study indicated a significant improvement in the safety performance of the
+robotic system. The proposed DRL model anticipates and mitigates hazards while
+maintaining operational efficiency. This study was validated in a testbed with
+a collaborative robotic arm with safety sensors and assessed with metrics such
+as the average number of safety violations, obstacle avoidance, and the number
+of successful grasps. The proposed approach outperforms the conventional
+method by a 16.5% average success rate on the tested scenarios in the
+simulations and 2.5% in the testbed without safety violations. The project
+repository is available at
+https://github.com/ammar-n-abbas/sim2real-ur-gym-gazebo.
+
+
+ comment: This paper has been accepted for publication in the proceedings of + the IEEE/IFAC International Conference on Control, Decision, and Information + Technologies (CoDIT), 2024 +
+
+
+
+
+ + ☆ Physics-Informed Model and Hybrid Planning for Efficient Dyna-Style + Reinforcement Learning + + +
+ Applying reinforcement learning (RL) to real-world applications requires +addressing a trade-off between asymptotic performance, sample efficiency, and +inference time. In this work, we demonstrate how to address this triple +challenge by leveraging partial physical knowledge about the system dynamics. +Our approach involves learning a physics-informed model to boost sample +efficiency and generating imaginary trajectories from this model to learn a +model-free policy and Q-function. Furthermore, we propose a hybrid planning +strategy, combining the learned policy and Q-function with the learned model to +enhance time efficiency in planning. Through practical demonstrations, we +illustrate that our method improves the compromise between sample efficiency, +time efficiency, and performance over state-of-the-art methods. + +
+
+
+
+
+ + ☆ PromptIntern: Saving Inference Costs by Internalizing Recurrent Prompt + during Large Language Model Fine-tuning + + +
+ Large language models (LLMs) have played a fundamental role in various
+natural language processing tasks with powerful prompt techniques. However, in
+real-world applications, there are often similar prompt components for
+repeated queries, which causes significant computational burdens during
+inference. Existing prompt compression and direct fine-tuning methods aim to
+tackle these challenges, yet they frequently struggle to strike an optimal
+balance between cost-efficiency and performance effectiveness, especially in
+complex tasks such as NL2Code. In this paper, we propose a novel method,
+PromptIntern, that internalizes the prompt knowledge into model parameters via
+progressive fine-tuning. Our method enables LLMs to emulate the human learning
+process for a new task, where detailed templates and examples in a prompt are
+gradually internalized and phased out as the model grows accustomed to the
+task. Extensive experiments demonstrate that our method reduces inference
+tokens by over 90%, speeds up inference by 4.2 times, and saves 88.3% in
+monetary cost.
+
+
+
+
+
+ + ☆ Attack-Aware Noise Calibration for Differential Privacy + + +
+ Differential privacy (DP) is a widely used approach for mitigating privacy +risks when training machine learning models on sensitive data. DP mechanisms +add noise during training to limit the risk of information leakage. The scale +of the added noise is critical, as it determines the trade-off between privacy +and utility. The standard practice is to select the noise scale in terms of a +privacy budget parameter $\epsilon$. This parameter is in turn interpreted in +terms of operational attack risk, such as accuracy, or sensitivity and +specificity of inference attacks against the privacy of the data. We +demonstrate that this two-step procedure of first calibrating the noise scale +to a privacy budget $\epsilon$, and then translating $\epsilon$ to attack risk +leads to overly conservative risk assessments and unnecessarily low utility. We +propose methods to directly calibrate the noise scale to a desired attack risk +level, bypassing the intermediate step of choosing $\epsilon$. For a target +attack risk, our approach significantly decreases noise scale, leading to +increased utility at the same level of privacy. We empirically demonstrate that +calibrating noise to attack sensitivity/specificity, rather than $\epsilon$, +when training privacy-preserving ML models substantially improves model +accuracy for the same risk level. Our work provides a principled and practical +way to improve the utility of privacy-preserving ML without compromising on +privacy. + +
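+ For a single Gaussian mechanism (no composition over training steps), the
+idea of calibrating noise directly to attack risk can be sketched in closed
+form: with sensitivity delta, the optimal distinguishing attack's ROC curve is
+TPR = Phi(Phi^{-1}(FPR) + delta/sigma), so a target operating point determines
+sigma directly. This only illustrates the general idea; the paper's method,
+and any realistic DP-SGD setting, must additionally handle composition, and
+the target numbers below are made up.
+
+# Sketch: pick the Gaussian noise scale from a target attack operating point.
+from scipy.stats import norm
+
+def sigma_for_attack_risk(alpha, beta, delta=1.0):
+    # alpha: attack false-positive rate, beta: maximum tolerated true-positive rate.
+    mu = norm.ppf(beta) - norm.ppf(alpha)   # required distinguishability budget
+    return delta / mu
+
+# e.g. keep the attack's TPR below 20% at 5% FPR for a sensitivity-1 query:
+sigma = sigma_for_attack_risk(alpha=0.05, beta=0.20)
+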
+
+
+
+
+ + ☆ Structure-Aware Consensus Network on Graphs with Few Labeled Nodes + + +
+ Graph node classification with few labeled nodes presents significant
+challenges due to limited supervision. Conventional methods often exploit the
+graph in a transductive learning manner. They fail to effectively utilize the
+abundant unlabeled data and the structural information inherent in graphs. To
+address these issues, we introduce a Structure-Aware Consensus Network (SACN)
+from three perspectives. Firstly, SACN leverages a novel structure-aware
+consensus learning strategy between two strongly augmented views. The proposed
+strategy can fully exploit the potentially useful information of the unlabeled
+nodes and the structural information of the entire graph. Secondly, SACN
+uniquely integrates the graph's structural information to achieve
+strong-to-strong consensus learning, improving the utilization of unlabeled
+data while maintaining multiview learning. Thirdly, unlike two-branch graph
+neural network-based methods, SACN is designed for multiview feature learning
+within a single-branch architecture. Furthermore, a class-aware pseudolabel
+selection strategy helps address class imbalance and achieve effective
+weak-to-strong supervision. Extensive experiments on three benchmark datasets
+demonstrate SACN's superior performance in node classification tasks,
+particularly at very low label rates, outperforming state-of-the-art methods
+while maintaining computational simplicity. The source code is available at
+https://github.com/kunzhan/SACN
+
+
+ comment: under review +
+
+
+
+
+ + ☆ Towards Training Music Taggers on Synthetic Data + + +
+ Most contemporary music tagging systems rely on large volumes of annotated
+data. As an alternative, we investigate the extent to which synthetically
+generated music excerpts can improve tagging systems when only small annotated
+collections are available. To this end, we release GTZAN-synth, a synthetic
+dataset that follows the taxonomy of the well-known GTZAN dataset while being
+ten times larger in data volume. We first observe that simply adding this
+synthetic dataset to the training split of GTZAN does not result in
+performance improvements. We then proceed to investigate domain adaptation,
+transfer learning and fine-tuning strategies for the task at hand and draw the
+conclusion that the last two options yield an increase in accuracy. Overall,
+the proposed approach can be considered a first guide in a promising field for
+future research.
+
+
+ comment: 6 pages, 3 figures, accepted to 21st International Conference on + Content-based Multimedia Indexing (CBMI) 2024, code available + https://github.com/NadineKroher/music-tagging-synthetic-data-cbmi-2024 +
+
+
+
+
+ + ☆ Equidistribution-based training of Free Knot Splines and ReLU Neural + Networks + + +
+ We consider the problem of one-dimensional function approximation using
+shallow neural networks (NN) with a rectified linear unit (ReLU) activation
+function and compare their training with traditional methods such as
+univariate Free Knot Splines (FKS). ReLU NNs and FKS span the same function
+space, and thus have the same theoretical expressivity. In the case of ReLU
+NNs, we show that their conditioning degrades rapidly as the width of the
+network increases. This often leads to significantly poorer approximation, in
+contrast to the FKS representation, which remains well-conditioned as the
+number of knots increases. We leverage the theory of optimal piecewise linear
+interpolants to improve the training procedure for a ReLU NN. Using the
+equidistribution principle, we propose a two-level procedure for training the
+FKS by first solving the nonlinear problem of finding the optimal knot
+locations of the interpolating FKS. Determining the optimal knots then acts as
+a good starting point for training the weights of the FKS. The training of the
+FKS gives insights into how we can train a ReLU NN effectively to give an
+equally accurate approximation. More precisely, combining the training of the
+ReLU NN with an equidistribution-based loss to find the breakpoints of the
+ReLU functions, together with preconditioning the ReLU NN approximation (to
+take an FKS form) to find the scalings of the ReLU functions, leads to a
+well-conditioned and reliable method for finding an accurate ReLU NN
+approximation to a target function. We test this method on a series of
+regular, singular, and rapidly varying target functions and obtain good
+results, realising the expressivity of the network in this case.
+
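+ The equidistribution principle used above for knot placement can be sketched
+as follows: choose knots so that each sub-interval carries an equal share of a
+monitor function, here |f''|^{1/2}, a common choice for piecewise-linear
+interpolation. The monitor, grid resolution, and example function are
+illustrative assumptions rather than the paper's exact training procedure.
+
+# Equidistributed knot placement sketch via the cumulative monitor function.
+import numpy as np
+
+def equidistributed_knots(f2, a, b, n_knots, n_grid=10_000):
+    x = np.linspace(a, b, n_grid)
+    monitor = np.abs(f2(x)) ** 0.5 + 1e-12          # avoid a degenerate density
+    cdf = np.cumsum(monitor)
+    cdf = (cdf - cdf[0]) / (cdf[-1] - cdf[0])        # normalise to [0, 1]
+    return np.interp(np.linspace(0, 1, n_knots), cdf, x)
+
+# Example: knots for f(x) = tanh(50x) on [-1, 1]; f''(x) = -5000 tanh(50x) sech(50x)^2,
+# so the knots concentrate near the sharp transition at x = 0.
+knots = equidistributed_knots(
+    lambda x: -5000 * np.tanh(50 * x) / np.cosh(50 * x) ** 2, -1.0, 1.0, 21)
+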
+
+
+
+
+ + ☆ Counterfactual Data Augmentation with Denoising Diffusion for Graph + Anomaly Detection + + +
+ A critical aspect of Graph Neural Networks (GNNs) is to enhance the node +representations by aggregating node neighborhood information. However, when +detecting anomalies, the representations of abnormal nodes are prone to be +averaged by normal neighbors, making the learned anomaly representations less +distinguishable. To tackle this issue, we propose CAGAD -- an unsupervised +Counterfactual data Augmentation method for Graph Anomaly Detection -- which +introduces a graph pointer neural network as the heterophilic node detector to +identify potential anomalies whose neighborhoods are normal-node-dominant. For +each identified potential anomaly, we design a graph-specific diffusion model +to translate a part of its neighbors, which are probably normal, into anomalous +ones. At last, we involve these translated neighbors in GNN neighborhood +aggregation to produce counterfactual representations of anomalies. Through +aggregating the translated anomalous neighbors, counterfactual representations +become more distinguishable and further advocate detection performance. The +experimental results on four datasets demonstrate that CAGAD significantly +outperforms strong baselines, with an average improvement of 2.35% on F1, 2.53% +on AUC-ROC, and 2.79% on AUC-PR. + +
+
+ comment: Accepted by IEEE Transactions on Computational Social Systems(TCSS). + DOI: https://doi.org/10.1109/TCSS.2024.3403503 +
+
+
+
+
+ + ☆ Efficient Nearest Neighbor based Uncertainty Estimation for Natural + Language Processing Tasks + + +
+ Trustworthy prediction in Deep Neural Networks (DNNs), including Pre-trained
+Language Models (PLMs), is important for safety-critical applications in the
+real world. However, DNNs often suffer from poor uncertainty estimation, such
+as miscalibration. Approaches that require multiple stochastic inferences can
+mitigate this problem, but their expensive inference cost makes them
+impractical. In this study, we propose $k$-Nearest Neighbor Uncertainty
+Estimation ($k$NN-UE), an uncertainty estimation method that uses the
+distances to the neighbors and the label-existence ratio of the neighbors.
+Experiments on sentiment analysis, natural language inference, and named
+entity recognition show that our proposed method outperforms the baselines and
+recent density-based methods in confidence calibration, selective prediction,
+and out-of-distribution detection. Moreover, our analyses indicate that
+introducing dimension reduction or approximate nearest neighbor search,
+inspired by recent $k$NN-LM studies, reduces the inference overhead without
+significantly degrading estimation performance when they are combined
+appropriately.
+
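+ The two signals mentioned above -- distance to the nearest training neighbors
+and the label-existence ratio among them -- can be combined into a simple
+confidence score, as in the hedged sketch below. The exponential distance
+weighting and the product combination are illustrative assumptions; the
+paper's exact scoring differs.
+
+# kNN-based confidence sketch over embedding space (illustrative only).
+import numpy as np
+
+def knn_confidence(query_emb, train_embs, train_labels, predicted_label, k=10, tau=1.0):
+    d = np.linalg.norm(train_embs - query_emb, axis=1)
+    idx = np.argsort(d)[:k]
+    distance_term = np.exp(-d[idx] / tau).mean()                   # closeness to training data
+    label_ratio = np.mean(train_labels[idx] == predicted_label)    # label-existence ratio
+    return distance_term * label_ratio
+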
+
+
+
+
+ + ☆ Distributional Regression U-Nets for the Postprocessing of Precipitation + Ensemble Forecasts + + +
+ Accurate precipitation forecasts have a high socio-economic value due to +their role in decision-making in various fields such as transport networks and +farming. We propose a global statistical postprocessing method for grid-based +precipitation ensemble forecasts. This U-Net-based distributional regression +method predicts marginal distributions in the form of parametric distributions +inferred by scoring rule minimization. Distributional regression U-Nets are +compared to state-of-the-art postprocessing methods for daily 21-h forecasts of +3-h accumulated precipitation over the South of France. Training data comes +from the M\'et\'eo-France weather model AROME-EPS and spans 3 years. A +practical challenge appears when consistent data or reforecasts are not +available. + Distributional regression U-Nets compete favorably with the raw ensemble. In +terms of continuous ranked probability score, they reach a performance +comparable to quantile regression forests (QRF). However, they are unable to +provide calibrated forecasts in areas associated with high climatological +precipitation. In terms of predictive power for heavy precipitation events, +they outperform both QRF and semi-parametric QRF with tail extensions. + +
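+ Scoring-rule minimization of a parametric marginal can be illustrated with
+the closed-form CRPS of a Gaussian used as a training loss. Precipitation
+calls for skewed or censored families, which is what the paper actually fits,
+so the Gaussian choice and all names below are illustrative assumptions only.
+
+# CRPS of a Gaussian, usable as a differentiable loss for predicted (mu, sigma).
+import math
+import torch
+
+def crps_gaussian(mu, sigma, y):
+    z = (y - mu) / sigma
+    std_normal = torch.distributions.Normal(0.0, 1.0)
+    pdf = torch.exp(std_normal.log_prob(z))
+    cdf = std_normal.cdf(z)
+    return sigma * (z * (2 * cdf - 1) + 2 * pdf - 1 / math.sqrt(math.pi))
+
+# loss = crps_gaussian(pred_mu, pred_sigma, observations).mean()  # backpropagate as usual
+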
+
+ comment: for associated code, see https://github.com/pic-romain/unet-pp +
+
+
+
+
+ + ☆ Cost-Effective Proxy Reward Model Construction with On-Policy and Active + Learning + + +
+ Reinforcement learning with human feedback (RLHF), as a widely adopted
+approach in current large language model pipelines, is \textit{bottlenecked by
+the size of human preference data}. While traditional methods rely on offline
+preference dataset construction, recent approaches have shifted towards online
+settings, where a learner uses a small amount of labeled seed data and a large
+pool of unlabeled prompts to iteratively construct new preference data through
+self-generated responses and high-quality reward/preference feedback. However,
+most current online algorithms still focus on preference labeling during
+policy model updating with given feedback oracles, which incurs significant
+expert query costs. \textit{We are the first to explore cost-effective proxy
+reward oracle construction strategies for further labeling preferences or
+rewards with extremely limited labeled data and expert query budgets}. Our
+approach introduces two key innovations: (1) on-policy querying to avoid OOD
+and imbalance issues in seed data, and (2) active learning to select the most
+informative data for preference queries. Using these methods, we train an
+evaluation model with minimal expert-labeled data, which then effectively
+labels nine times more preference pairs for further RLHF training. For
+instance, our model using Direct Preference Optimization (DPO) gains around 1%
+average improvement on AlpacaEval2, MMLU-5shot and MMLU-0shot, with only 1.7K
+query cost. Our methodology is orthogonal to other direct expert query-based
+strategies and therefore might be integrated with them to further reduce query
+costs.
+
+
+
+
+
+ + ☆ A Data-Centric Perspective on Evaluating Machine Learning Models for + Tabular Data + + +
+ Tabular data is prevalent in real-world machine learning applications, and +new models for supervised learning of tabular data are frequently proposed. +Comparative studies assessing the performance of models typically consist of +model-centric evaluation setups with overly standardized data preprocessing. +This paper demonstrates that such model-centric evaluations are biased, as +real-world modeling pipelines often require dataset-specific preprocessing and +feature engineering. Therefore, we propose a data-centric evaluation framework. +We select 10 relevant datasets from Kaggle competitions and implement +expert-level preprocessing pipelines for each dataset. We conduct experiments +with different preprocessing pipelines and hyperparameter optimization (HPO) +regimes to quantify the impact of model selection, HPO, feature engineering, +and test-time adaptation. Our main findings are: 1. After dataset-specific +feature engineering, model rankings change considerably, performance +differences decrease, and the importance of model selection reduces. 2. Recent +models, despite their measurable progress, still significantly benefit from +manual feature engineering. This holds true for both tree-based models and +neural networks. 3. While tabular data is typically considered static, samples +are often collected over time, and adapting to distribution shifts can be +important even in supposedly static data. These insights suggest that research +efforts should be directed toward a data-centric perspective, acknowledging +that tabular data requires feature engineering and often exhibits temporal +characteristics. + +
+
+
+
+
+ + ☆ Automated Knowledge Graph Learning in Industrial Processes + + +
+ Industrial processes generate vast amounts of time series data, yet +extracting meaningful relationships and insights remains challenging. This +paper introduces a framework for automated knowledge graph learning from time +series data, specifically tailored for industrial applications. Our framework +addresses the complexities inherent in industrial datasets, transforming them +into knowledge graphs that improve decision-making, process optimization, and +knowledge discovery. Additionally, it employs Granger causality to identify key +attributes that can inform the design of predictive models. To illustrate the +practical utility of our approach, we also present a motivating use case +demonstrating the benefits of our framework in a real-world industrial +scenario. Further, we demonstrate how the automated conversion of time series +data into knowledge graphs can identify causal influences or dependencies +between important process parameters. + +
+
+
+
+
+ + ☆ Efficient Bit Labeling in Factorization Machines with Annealing for + Traveling Salesman Problem + + +
+ To efficiently find an optimal parameter combination in a large-scale
+problem, it is key to convert the parameters into variables that are available
+on actual machines. Specifically, quadratic unconstrained binary optimization
+problems are solved with the help of machine learning, e.g., factorization
+machines with annealing, which convert a raw parameter into binary variables.
+This work investigates how the convergence speed and the accuracy depend on
+the binary labeling method, which can influence the shape of the cost function
+and thus the probability of being trapped in a local minimum. Taking the
+traveling salesman problem as an example, we propose and evaluate Gray
+labeling, which correlates the Hamming distance between binary labels with the
+traveling distance. Through numerical simulations of the traveling salesman
+problem with up to 15 cities at a limited number of iterations, Gray labeling
+shows lower local-minimum percentages and shorter traveling distances compared
+with natural labeling.
+
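+ Gray labeling relies on the standard reflected binary (Gray) code, in which
+consecutive integers differ in exactly one bit, so small Hamming moves on the
+binary variables correspond to small changes in the encoded value. A minimal
+sketch of the encoding and decoding (how the labels are wired into the
+factorization machine is, of course, the paper's contribution, not shown
+here):
+
+# Reflected binary (Gray) code: encode and decode integers.
+def to_gray(n: int) -> int:
+    return n ^ (n >> 1)
+
+def from_gray(g: int) -> int:
+    n = 0
+    while g:
+        n ^= g
+        g >>= 1
+    return n
+
+# Consecutive values differ in exactly one bit, and decoding inverts encoding.
+assert [to_gray(i) for i in range(4)] == [0b00, 0b01, 0b11, 0b10]
+assert all(from_gray(to_gray(i)) == i for i in range(64))
+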
+
+
+
+
+ + ☆ GPTCast: a weather language model for precipitation nowcasting + + +
+ This work introduces GPTCast, a generative deep-learning method for ensemble +nowcast of radar-based precipitation, inspired by advancements in large +language models (LLMs). We employ a GPT model as a forecaster to learn +spatiotemporal precipitation dynamics using tokenized radar images. The +tokenizer is based on a Quantized Variational Autoencoder featuring a novel +reconstruction loss tailored for the skewed distribution of precipitation that +promotes faithful reconstruction of high rainfall rates. The approach produces +realistic ensemble forecasts and provides probabilistic outputs with accurate +uncertainty estimation. The model is trained without resorting to randomness, +all variability is learned solely from the data and exposed by model at +inference for ensemble generation. We train and test GPTCast using a 6-year +radar dataset over the Emilia-Romagna region in Northern Italy, showing +superior results compared to state-of-the-art ensemble extrapolation methods. + +
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ Contribution Evaluation of Heterogeneous Participants in Federated + Learning via Prototypical Representations + + +
+ Contribution evaluation in federated learning (FL) has become a pivotal +research area due to its applicability across various domains, such as +detecting low-quality datasets, enhancing model robustness, and designing +incentive mechanisms. Existing contribution evaluation methods, which primarily +rely on data volume, model similarity, and auxiliary test datasets, have shown +success in diverse scenarios. However, their effectiveness often diminishes due +to the heterogeneity of data distributions, presenting a significant challenge +to their applicability. In response, this paper explores contribution +evaluation in FL from an entirely new perspective of representation. In this +work, we propose a new method for the contribution evaluation of heterogeneous +participants in federated learning (FLCE), which introduces a novel indicator +\emph{class contribution momentum} to conduct refined contribution evaluation. +Our core idea is the construction and application of the class contribution +momentum indicator from individual, relative, and holistic perspectives, +thereby achieving an effective and efficient contribution evaluation of +heterogeneous participants without relying on an auxiliary test dataset. +Extensive experimental results demonstrate the superiority of our method in +terms of fidelity, effectiveness, efficiency, and heterogeneity across various +scenarios. + +
+
+
+
+
+ + ☆ Latent Diffusion Model for Generating Ensembles of Climate Simulations ICML 2024 + + +
+ Obtaining accurate estimates of uncertainty in climate scenarios often +requires generating large ensembles of high-resolution climate simulations, a +computationally expensive and memory intensive process. To address this +challenge, we train a novel generative deep learning approach on extensive sets +of climate simulations. The model consists of two components: a variational +autoencoder for dimensionality reduction and a denoising diffusion +probabilistic model that generates multiple ensemble members. We validate our +model on the Max Planck Institute Grand Ensemble and show that it achieves good +agreement with the original ensemble in terms of variability. By leveraging the +latent space representation, our model can rapidly generate large ensembles +on-the-fly with minimal memory requirements, which can significantly improve +the efficiency of uncertainty quantification in climate simulations. + +
+
+ comment: 8 pages, 7 figures, Accepted at the ICML 2024 Machine Learning for + Earth System Modeling workshop +
+
+
+
+
+ + ☆ Are Data Augmentation Methods in Named Entity Recognition Applicable for + Uncertainty Estimation? + + +
+ This work investigates the impact of data augmentation on confidence
+calibration and uncertainty estimation in Named Entity Recognition (NER)
+tasks. For the future advance of NER in safety-critical fields like healthcare
+and finance, it is essential to achieve accurate predictions with calibrated
+confidence when applying Deep Neural Networks (DNNs), including Pre-trained
+Language Models (PLMs), in real-world applications. However, DNNs are prone to
+miscalibration, which limits their applicability. Moreover, existing methods
+for calibration and uncertainty estimation are computationally expensive. Our
+investigation in NER found that data augmentation improves calibration and
+uncertainty in cross-genre and cross-lingual settings, especially the
+in-domain setting. Furthermore, we show that the calibration for NER tends to
+be more effective when the perplexity of the sentences generated by data
+augmentation is lower, and that increasing the size of the augmentation
+further improves calibration and uncertainty.
+
+
+
+
+
+ + ☆ Terminating Differentiable Tree Experts + + +
+ We advance the recently proposed neuro-symbolic Differentiable Tree Machine, +which learns tree operations using a combination of transformers and Tensor +Product Representations. We investigate the architecture and propose two key +components. We first remove a series of different transformer layers that are +used in every step by introducing a mixture of experts. This results in a +Differentiable Tree Experts model with a constant number of parameters for any +arbitrary number of steps in the computation, compared to the previous method +in the Differentiable Tree Machine with a linear growth. Given this flexibility +in the number of steps, we additionally propose a new termination algorithm to +provide the model the power to choose how many steps to make automatically. The +resulting Terminating Differentiable Tree Experts model sluggishly learns to +predict the number of steps without an oracle. It can do so while maintaining +the learning capabilities of the model, converging to the optimal amount of +steps. + +
+
+ comment: Accepted at the 18th International Conference on Neural-Symbolic + Learning and Reasoning (NeSy) 2024 +
+
+
+
+
+ + ☆ HC-GLAD: Dual Hyperbolic Contrastive Learning for Unsupervised + Graph-Level Anomaly Detection + + +
+ Unsupervised graph-level anomaly detection (UGAD) has garnered increasing
+attention in recent years due to its significance. However, most existing
+methods only rely on traditional graph neural networks to explore pairwise
+relationships, and such pairwise edges are not enough to describe the
+multifaceted relationships involving anomalies. There is an urgent need to
+exploit node group information, which plays a crucial role in UGAD. In
+addition, most previous works ignore the global underlying properties (e.g.,
+hierarchy and power-law structure) that are common in real-world graph
+datasets and therefore are indispensable factors for the UGAD task. In this
+paper, we propose a novel Dual Hyperbolic Contrastive Learning for
+Unsupervised Graph-Level Anomaly Detection (HC-GLAD in short). To exploit node
+group connections, we construct hypergraphs based on gold motifs and
+subsequently perform hypergraph convolution. Furthermore, to preserve the
+hierarchy of real-world graphs, we introduce hyperbolic geometry into this
+field and conduct both graph and hypergraph embedding learning in hyperbolic
+space with the hyperboloid model. To the best of our knowledge, this is the
+first work to simultaneously apply hypergraphs with node group connections and
+hyperbolic geometry to this field. Extensive experiments on several real-world
+datasets from different fields demonstrate the superiority of HC-GLAD on the
+UGAD task. The code is available at https://github.com/Yali-F/HC-GLAD.
+
+
+
+
+
+ + ☆ SwiftDiffusion: Efficient Diffusion Model Serving with Add-on Modules + + +
+ This paper documents our characterization study and practices for serving +text-to-image requests with stable diffusion models in production. We first +comprehensively analyze inference request traces for commercial text-to-image +applications. It commences with our observation that add-on modules, i.e., +ControlNets and LoRAs, that augment the base stable diffusion models, are +ubiquitous in generating images for commercial applications. Despite their +efficacy, these add-on modules incur high loading overhead, prolong the serving +latency, and swallow up expensive GPU resources. Driven by our characterization +study, we present SwiftDiffusion, a system that efficiently generates +high-quality images using stable diffusion models and add-on modules. To +achieve this, SwiftDiffusion reconstructs the existing text-to-image serving +workflow by identifying the opportunities for parallel computation and +distributing ControlNet computations across multiple GPUs. Further, +SwiftDiffusion thoroughly analyzes the dynamics of image generation and +develops techniques to eliminate the overhead associated with LoRA loading and +patching while preserving the image quality. Last, SwiftDiffusion proposes +specialized optimizations in the backbone architecture of the stable diffusion +models, which are also compatible with the efficient serving of add-on modules. +Compared to state-of-the-art text-to-image serving systems, SwiftDiffusion +reduces serving latency by up to 5x and improves serving throughput by up to 2x +without compromising image quality. + +
+
+
+
+
+ + ☆ Why does in-context learning fail sometimes? Evaluating in-context + learning on open and closed questions + + +
+ We measure the performance of in-context learning as a function of task
+novelty and difficulty for open and closed questions. For that purpose, we
+created a novel benchmark consisting of hard scientific questions, each paired
+with contexts of varying relevance. We show that, counter-intuitively, a
+context that is more aligned with the topic does not always help more than a
+less relevant context. This effect is especially visible for open questions
+and questions of high difficulty or novelty. This result reveals a fundamental
+difference between the treatment of closed-form and open-form questions by
+large language models and shows a need for a more robust evaluation of
+in-context learning on a variety of different types of questions. It also
+poses a new question of how to optimally select a context for large language
+models, especially in the context of Retrieval Augmented Generation (RAG)
+systems. Our results suggest that the answer to this question can be highly
+application-dependent and might be contingent on factors including the format
+of the question, the perceived difficulty level of the questions, and the
+novelty or popularity of the information we seek.
+
+
+ comment: 8 pages plus references, 4 main figures, 6 pages of supplementary + material +
+
+
+
+
+ + ☆ On the Expressive Power of Sparse Geometric MPNNs + + +
+ Motivated by applications in chemistry and other sciences, we study the
+expressive power of message-passing neural networks for geometric graphs, whose
+node features correspond to 3-dimensional positions. Recent work has shown that
+such models can separate generic pairs of non-equivalent geometric graphs,
+though they may fail to separate some rare and complicated instances. However,
+these results assume a fully connected graph, where each node possesses
+complete knowledge of all other nodes. In contrast, in applications each node
+often only has knowledge of a small number of nearest neighbors. This paper
+shows that generic pairs of non-equivalent geometric graphs can be separated by
+message-passing networks with rotation-equivariant features as long as the
+underlying graph is connected. When only invariant intermediate features are
+allowed, generic separation is guaranteed for generically globally rigid graphs.
+We introduce a simple architecture, EGENNET, which achieves our theoretical
+guarantees and compares favorably with alternative architectures on synthetic
+and chemical benchmarks.
+
+
+
+
+
+
+ + ☆ DiGRAF: Diffeomorphic Graph-Adaptive Activation Function + + +
+ In this paper, we propose a novel activation function tailored specifically
+for graph data in Graph Neural Networks (GNNs). Motivated by the need for
+graph-adaptive and flexible activation functions, we introduce DiGRAF,
+leveraging Continuous Piecewise-Affine Based (CPAB) transformations, which we
+augment with an additional GNN to learn a graph-adaptive diffeomorphic
+activation function in an end-to-end manner. In addition to its
+graph-adaptivity and flexibility, DiGRAF also possesses properties that are
+widely recognized as desirable for activation functions, such as
+differentiability, boundedness within the domain, and computational efficiency.
+We conduct an extensive set of experiments across diverse datasets and tasks,
+demonstrating consistent and superior performance of DiGRAF compared to
+traditional and graph-specific activation functions, highlighting its
+effectiveness as an activation function for GNNs.
+
+
+
+
+
+
+ + ☆ Feynman-Kac Operator Expectation Estimator + + +
+ The Feynman-Kac Operator Expectation Estimator (FKEE) is an innovative method
+for estimating the target mathematical expectation $\mathbb{E}_{X\sim P}[f(X)]$
+without relying on a large number of samples, in contrast to the commonly used
+Markov Chain Monte Carlo (MCMC) expectation estimator. FKEE comprises diffusion
+bridge models and an approximation of the Feynman-Kac operator. The key idea is
+to use the solution of the Feynman-Kac equation at the initial time,
+$u(x_0,0)=\mathbb{E}[f(X_T)|X_0=x_0]$. We use Physics-Informed Neural Networks
+(PINNs) to approximate the Feynman-Kac operator, which enables the
+incorporation of diffusion bridge models into the expectation estimator and
+significantly improves the efficiency of using data while substantially
+reducing the variance. The diffusion bridge model is a more general MCMC
+method. In order to incorporate a wide range of MCMC algorithms, we propose a
+new diffusion bridge model based on the minimum Wasserstein distance. This
+diffusion bridge model is universal and reduces the training time of the PINN.
+FKEE also reduces the adverse impact of the curse of dimensionality and weakens
+the assumptions on the distribution of $X$ and the performance function $f$
+required by the general MCMC expectation estimator. The theoretical properties
+of this universal diffusion bridge model are also shown. Finally, we
+demonstrate the advantages and potential applications of this method through
+various concrete experiments, including the challenging task of approximating
+the partition function in random graph models such as the Ising model.
+
+
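+ As background for the identity quoted above (standard Feynman-Kac theory, not
+a result specific to FKEE): if the underlying diffusion is
+$dX_t=b(X_t,t)\,dt+\sigma(X_t,t)\,dW_t$, then $u(x,t)=\mathbb{E}[f(X_T)\mid X_t=x]$
+solves the backward Kolmogorov equation
+
+ \[ \partial_t u(x,t)+b(x,t)\cdot\nabla_x u(x,t)+\tfrac{1}{2}\operatorname{Tr}\big(\sigma(x,t)\sigma(x,t)^{\top}\nabla_x^{2}u(x,t)\big)=0,\qquad u(x,T)=f(x), \]
+
+ so the target expectation is recovered as $u(x_0,0)$. The residual of this PDE
+is the kind of constraint a physics-informed network can be trained to satisfy.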
+
+
+
+
+ + ☆ Generation of Geodesics with Actor-Critic Reinforcement Learning to + Predict Midpoints + + +
+ To find the shortest paths for all pairs on continuous manifolds with
+infinitesimally defined metrics, we propose to generate them by recursively
+predicting midpoints, together with an actor-critic method that learns the
+midpoint prediction. We prove the soundness of our approach and show
+experimentally that the proposed method outperforms existing methods on both
+local and global path planning tasks.
+
+
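+ The recursive-bisection idea can be made concrete with a short sketch. The
+midpoint predictor below is a plain Euclidean average standing in for the
+learned, actor-critic-trained model described in the abstract; everything else
+is illustrative scaffolding rather than the authors' implementation.
+
+ import numpy as np
+
+ def straight_line_midpoint(a, b):
+     # Placeholder midpoint predictor; in the paper's setting this would be a
+     # learned model that respects the manifold's metric.
+     return 0.5 * (np.asarray(a) + np.asarray(b))
+
+ def generate_path(a, b, midpoint_fn, depth):
+     """Recursively bisect (a, b) with a midpoint predictor.
+
+     Returns a polyline with 2**depth + 1 points approximating the geodesic.
+     """
+     if depth == 0:
+         return [np.asarray(a), np.asarray(b)]
+     m = midpoint_fn(a, b)
+     left = generate_path(a, m, midpoint_fn, depth - 1)
+     right = generate_path(m, b, midpoint_fn, depth - 1)
+     return left[:-1] + right  # drop the duplicated midpoint
+
+ path = generate_path([0.0, 0.0], [1.0, 1.0], straight_line_midpoint, depth=3)
+ print(len(path))  # 9 points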
+
+ comment: 15 pages with 6 pages of appendices and references, 8 figures +
+
+
+
+
+ + ☆ The Epistemic Uncertainty Hole: an issue of Bayesian Neural Networks + + +
+ Bayesian Deep Learning (BDL) gives access not only to aleatoric uncertainty,
+as standard neural networks already do, but also to epistemic uncertainty, a
+measure of the confidence a model has in its own predictions. In this article,
+we show through experiments that the evolution of epistemic uncertainty metrics
+with respect to model size and training-set size goes against theoretical
+expectations. More precisely, we observe that epistemic uncertainty literally
+collapses in the presence of large models and sometimes also with little
+training data, whereas we would expect the exact opposite behaviour. This
+phenomenon, which we call the "epistemic uncertainty hole", is all the more
+problematic as it undermines the entire applicative potential of BDL, which
+rests precisely on the use of epistemic uncertainty. As an example, we evaluate
+the practical consequences of this uncertainty hole on one of the main
+applications of BDL, namely the detection of out-of-distribution samples.
+
+
+
+
+
+
+ + ☆ Unveiling Global Interactive Patterns across Graphs: Towards + Interpretable Graph Neural Networks KDD2024 + + +
+ Graph Neural Networks (GNNs) have emerged as a prominent framework for graph
+mining, leading to significant advances across various domains. Stemming from
+the node-wise representations of GNNs, existing explanation studies have
+embraced the subgraph-specific viewpoint that attributes the decision results
+to the salient features and local structures of nodes. However, graph-level
+tasks necessitate long-range dependencies and global interactions for advanced
+GNNs, deviating significantly from subgraph-specific explanations. To bridge
+this gap, this paper proposes a novel intrinsically interpretable scheme for
+graph classification, termed Global Interactive Pattern (GIP) learning, which
+introduces learnable global interactive patterns to explicitly interpret
+decisions. GIP first tackles the complexity of interpretation by clustering
+numerous nodes using a constrained graph clustering module. Then, it matches
+the coarsened global interactive instance with a batch of self-interpretable
+graph prototypes, thereby facilitating a transparent graph-level reasoning
+process. Extensive experiments conducted on both synthetic and real-world
+benchmarks demonstrate that the proposed GIP yields significantly superior
+interpretability and competitive performance compared to the state-of-the-art
+counterparts. Our code will be made publicly available.
+
+
+
+ comment: Accepted in KDD2024 +
+
+
+
+
+ + ☆ MeMemo: On-device Retrieval Augmentation for Private and Personalized + Text Generation SIGIR 2024 + + +
+ Retrieval-augmented text generation (RAG) addresses the common limitations of +large language models (LLMs), such as hallucination, by retrieving information +from an updatable external knowledge base. However, existing approaches often +require dedicated backend servers for data storage and retrieval, thereby +limiting their applicability in use cases that require strict data privacy, +such as personal finance, education, and medicine. To address the pressing need +for client-side dense retrieval, we introduce MeMemo, the first open-source +JavaScript toolkit that adapts the state-of-the-art approximate nearest +neighbor search technique HNSW to browser environments. Developed with modern +and native Web technologies, such as IndexedDB and Web Workers, our toolkit +leverages client-side hardware capabilities to enable researchers and +developers to efficiently search through millions of high-dimensional vectors +in the browser. MeMemo enables exciting new design and research opportunities, +such as private and personalized content creation and interactive prototyping, +as demonstrated in our example application RAG Playground. Reflecting on our +work, we discuss the opportunities and challenges for on-device dense +retrieval. MeMemo is available at https://github.com/poloclub/mememo. + +
+
+ comment: Accepted to SIGIR 2024. 6 pages, 2 figures. For a live demo, visit + https://poloclub.github.io/mememo/. Code is open-source at + https://github.com/poloclub/mememo +
+
+
+
+
+ + ☆ Zero-shot Video Restoration and Enhancement Using Pre-Trained Image + Diffusion Model + + +
+ Diffusion-based zero-shot image restoration and enhancement models have +achieved great success in various image restoration and enhancement tasks +without training. However, directly applying them to video restoration and +enhancement results in severe temporal flickering artifacts. In this paper, we +propose the first framework for zero-shot video restoration and enhancement +based on a pre-trained image diffusion model. By replacing the self-attention +layer with the proposed cross-previous-frame attention layer, the pre-trained +image diffusion model can take advantage of the temporal correlation between +neighboring frames. We further propose temporal consistency guidance, +spatial-temporal noise sharing, and an early stopping sampling strategy for +better temporally consistent sampling. Our method is a plug-and-play module +that can be inserted into any diffusion-based zero-shot image restoration or +enhancement methods to further improve their performance. Experimental results +demonstrate the superiority of our proposed method in producing temporally +consistent videos with better fidelity. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ CatMemo at the FinLLM Challenge Task: Fine-Tuning Large Language Models + using Data Fusion in Financial Applications + + +
+ The integration of Large Language Models (LLMs) into financial analysis has +garnered significant attention in the NLP community. This paper presents our +solution to IJCAI-2024 FinLLM challenge, investigating the capabilities of LLMs +within three critical areas of financial tasks: financial classification, +financial text summarization, and single stock trading. We adopted Llama3-8B +and Mistral-7B as base models, fine-tuning them through Parameter Efficient +Fine-Tuning (PEFT) and Low-Rank Adaptation (LoRA) approaches. To enhance model +performance, we combine datasets from task 1 and task 2 for data fusion. Our +approach aims to tackle these diverse tasks in a comprehensive and integrated +manner, showcasing LLMs' capacity to address diverse and complex financial +tasks with improved accuracy and decision-making capabilities. + +
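+ For orientation, a minimal LoRA fine-tuning setup with Hugging Face PEFT looks
+roughly like the sketch below. The checkpoint name, rank, and target modules
+are illustrative assumptions, not the CatMemo team's exact configuration.
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import LoraConfig, get_peft_model
+
+ base = "meta-llama/Meta-Llama-3-8B"          # assumed base checkpoint
+ tokenizer = AutoTokenizer.from_pretrained(base)
+ model = AutoModelForCausalLM.from_pretrained(base)
+
+ lora_cfg = LoraConfig(
+     r=16, lora_alpha=32, lora_dropout=0.05,
+     target_modules=["q_proj", "v_proj"],     # assumed projection names
+     task_type="CAUSAL_LM",
+ )
+ model = get_peft_model(model, lora_cfg)
+ model.print_trainable_parameters()           # only the low-rank adapters train
+
+ Fused task-1/task-2 data would then be tokenized and passed to a standard
+supervised fine-tuning loop on top of this adapter-equipped model.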
+
+
+
+
+ + ☆ Extracting and Encoding: Leveraging Large Language Models and Medical + Knowledge to Enhance Radiological Text Representation ACL 2024 + + +
+ Advancing representation learning in specialized fields like medicine remains +challenging due to the scarcity of expert annotations for text and images. To +tackle this issue, we present a novel two-stage framework designed to extract +high-quality factual statements from free-text radiology reports in order to +improve the representations of text encoders and, consequently, their +performance on various downstream tasks. In the first stage, we propose a +\textit{Fact Extractor} that leverages large language models (LLMs) to identify +factual statements from well-curated domain-specific datasets. In the second +stage, we introduce a \textit{Fact Encoder} (CXRFE) based on a BERT model +fine-tuned with objective functions designed to improve its representations +using the extracted factual data. Our framework also includes a new +embedding-based metric (CXRFEScore) for evaluating chest X-ray text generation +systems, leveraging both stages of our approach. Extensive evaluations show +that our fact extractor and encoder outperform current state-of-the-art methods +in tasks such as sentence ranking, natural language inference, and label +extraction from radiology reports. Additionally, our metric proves to be more +robust and effective than existing metrics commonly used in the radiology +report generation literature. The code of this project is available at +\url{https://github.com/PabloMessina/CXR-Fact-Encoder}. + +
+
+ comment: Accepted to ACL 2024 (Findings) +
+
+
+
+
+ + ☆ To Forget or Not? Towards Practical Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) trained on extensive corpora inevitably retain +sensitive data, such as personal privacy information and copyrighted material. +Recent advancements in knowledge unlearning involve updating LLM parameters to +erase specific knowledge. However, current unlearning paradigms are mired in +vague forgetting boundaries, often erasing knowledge indiscriminately. In this +work, we introduce KnowUnDo, a benchmark containing copyrighted content and +user privacy domains to evaluate if the unlearning process inadvertently erases +essential knowledge. Our findings indicate that existing unlearning methods +often suffer from excessive unlearning. To address this, we propose a simple +yet effective method, MemFlex, which utilizes gradient information to precisely +target and unlearn sensitive parameters. Experimental results show that MemFlex +is superior to existing methods in both precise knowledge unlearning and +general knowledge retaining of LLMs. Code and dataset will be released at +https://github.com/zjunlp/KnowUnDo. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ MG-Verilog: Multi-grained Dataset Towards Enhanced LLM-assisted Verilog + Generation + + +
+ Large Language Models (LLMs) have recently shown promise in streamlining +hardware design processes by encapsulating vast amounts of domain-specific +data. In addition, they allow users to interact with the design processes +through natural language instructions, thus making hardware design more +accessible to developers. However, effectively leveraging LLMs in hardware +design necessitates providing domain-specific data during inference (e.g., +through in-context learning), fine-tuning, or pre-training. Unfortunately, +existing publicly available hardware datasets are often limited in size, +complexity, or detail, which hinders the effectiveness of LLMs in hardware +design tasks. To address this issue, we first propose a set of criteria for +creating high-quality hardware datasets that can effectively enhance +LLM-assisted hardware design. Based on these criteria, we propose a +Multi-Grained-Verilog (MG-Verilog) dataset, which encompasses descriptions at +various levels of detail and corresponding code samples. To benefit the broader +hardware design community, we have developed an open-source infrastructure that +facilitates easy access, integration, and extension of the dataset to meet +specific project needs. Furthermore, to fully exploit the potential of the +MG-Verilog dataset, which varies in complexity and detail, we introduce a +balanced fine-tuning scheme. This scheme serves as a unique use case to +leverage the diverse levels of detail provided by the dataset. Extensive +experiments demonstrate that the proposed dataset and fine-tuning scheme +consistently improve the performance of LLMs in hardware design tasks. + +
+
+ comment: Accepted in ISLAD 2024 +
+
+
+
+
+ + ☆ The Solution for the ICCV 2023 Perception Test Challenge 2023 -- Task 6 + -- Grounded videoQA + + +
+ In this paper, we introduce a grounded video question-answering solution. Our
+research reveals that the fixed official baseline method for video question
+answering involves two main steps: visual grounding and object tracking.
+However, a significant challenge emerges during the initial step, where
+selected frames may lack clearly identifiable target objects. Furthermore,
+single images cannot address questions like "Track the container from which the
+person pours the first time." To tackle this issue, we propose an alternative
+two-stage approach: (1) we first leverage the VALOR model to answer questions
+based on video information; (2) we then concatenate the questions with their
+respective answers. Finally, we employ TubeDETR to generate bounding boxes for
+the targets.
+
+
+
+
+
+
+ + ☆ Let the Expert Stick to His Last: Expert-Specialized Fine-Tuning for + Sparse Architectural Large Language Models + + +
+ Parameter-efficient fine-tuning (PEFT) is crucial for customizing Large +Language Models (LLMs) with constrained resources. Although there have been +various PEFT methods for dense-architecture LLMs, PEFT for sparse-architecture +LLMs is still underexplored. In this work, we study the PEFT method for LLMs +with the Mixture-of-Experts (MoE) architecture and the contents of this work +are mainly threefold: (1) We investigate the dispersion degree of the activated +experts in customized tasks, and found that the routing distribution for a +specific task tends to be highly concentrated, while the distribution of +activated experts varies significantly across different tasks. (2) We propose +Expert-Specialized Fine-Tuning, or ESFT, which tunes the experts most relevant +to downstream tasks while freezing the other experts and modules; experimental +results demonstrate that our method not only improves the tuning efficiency, +but also matches or even surpasses the performance of full-parameter +fine-tuning. (3) We further analyze the impact of the MoE architecture on +expert-specialized fine-tuning. We find that MoE models with finer-grained +experts are more advantageous in selecting the combination of experts that are +most relevant to downstream tasks, thereby enhancing both the training +efficiency and effectiveness. + +
+
+
+
+
+ + ☆ Text-Aware Diffusion for Policy Learning + + +
+ Training an agent to achieve particular goals or perform desired behaviors is +often accomplished through reinforcement learning, especially in the absence of +expert demonstrations. However, supporting novel goals or behaviors through +reinforcement learning requires the ad-hoc design of appropriate reward +functions, which quickly becomes intractable. To address this challenge, we +propose Text-Aware Diffusion for Policy Learning (TADPoLe), which uses a +pretrained, frozen text-conditioned diffusion model to compute dense zero-shot +reward signals for text-aligned policy learning. We hypothesize that +large-scale pretrained generative models encode rich priors that can supervise +a policy to behave not only in a text-aligned manner, but also in alignment +with a notion of naturalness summarized from internet-scale training data. In +our experiments, we demonstrate that TADPoLe is able to learn policies for +novel goal-achievement and continuous locomotion behaviors specified by natural +language, in both Humanoid and Dog environments. The behaviors are learned +zero-shot without ground-truth rewards or expert demonstrations, and are +qualitatively more natural according to human evaluation. We further show that +TADPoLe performs competitively when applied to robotic manipulation tasks in +the Meta-World environment. + +
+
+
+
+
+ + ☆ Beyond Numeric Awards: In-Context Dueling Bandits with LLM Agents + + +
+ In-context decision-making is an important capability of artificial general +intelligence, which Large Language Models (LLMs) have effectively demonstrated +in various scenarios. However, LLMs often face challenges when dealing with +numerical contexts, and limited attention has been paid to evaluating their +performance through preference feedback generated by the environment. This +paper investigates the performance of LLMs as decision-makers in the context of +Dueling Bandits (DB). We first evaluate the performance of LLMs by comparing +GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo against established DB algorithms. Our +results reveal that LLMs, particularly GPT-4 Turbo, quickly identify the +Condorcet winner, thus outperforming existing state-of-the-art algorithms in +terms of weak regret. Nevertheless, LLMs struggle to converge even when +explicitly prompted to do so, and are sensitive to prompt variations. To +overcome these issues, we introduce an LLM-augmented algorithm, IF-Enhanced +LLM, which takes advantage of both in-context decision-making capabilities of +LLMs and theoretical guarantees inherited from classic DB algorithms. The +design of such an algorithm sheds light on how to enhance trustworthiness for +LLMs used in decision-making tasks where performance robustness matters. We +show that IF-Enhanced LLM has theoretical guarantees on both weak and strong +regret. Our experimental results validate that IF-Enhanced LLM is robust even +with noisy and adversarial prompts. + +
+
+
+
+
+ + ☆ Core Knowledge Learning Framework for Graph Adaptation and Scalability + Learning + + +
+ Graph classification is a pivotal challenge in machine learning, especially
+within the realm of graph-based data, given its importance in numerous
+real-world applications such as social network analysis, recommendation
+systems, and bioinformatics. Despite its significance, graph classification
+faces several hurdles, including adapting to diverse prediction tasks, training
+across multiple target domains, and handling small-sample prediction scenarios.
+Current methods often tackle these challenges individually, leading to
+fragmented solutions that lack a holistic approach to the overarching problem.
+In this paper, we propose an algorithm aimed at addressing the aforementioned
+challenges. By incorporating insights from various types of tasks, our method
+aims to enhance adaptability, scalability, and generalizability in graph
+classification. Motivated by the recognition that the underlying subgraph plays
+a crucial role in GNN prediction, while the remainder is task-irrelevant, we
+introduce the Core Knowledge Learning framework for graph adaptation and
+scalability learning. The framework comprises several key modules, including
+the core subgraph knowledge submodule, graph domain adaptation module, and
+few-shot learning module for downstream tasks. Each module is tailored to
+tackle specific challenges in graph classification, such as domain shift, label
+inconsistencies, and data scarcity. By learning the core subgraph of the entire
+graph, we focus on the most pertinent features for task relevance.
+Consequently, our method offers benefits such as improved model performance,
+increased domain adaptability, and enhanced robustness to domain variations.
+Experimental results demonstrate significant performance enhancements achieved
+by our method compared to state-of-the-art approaches.
+
+
+
+
+
+
+ + ☆ Automated Text Scoring in the Age of Generative AI for the GPU-poor + + +
+ Current research on generative language models (GLMs) for automated text +scoring (ATS) has focused almost exclusively on querying proprietary models via +Application Programming Interfaces (APIs). Yet such practices raise issues +around transparency and security, and these methods offer little in the way of +efficiency or customizability. With the recent proliferation of smaller, +open-source models, there is the option to explore GLMs with computers equipped +with modest, consumer-grade hardware, that is, for the "GPU poor." In this +study, we analyze the performance and efficiency of open-source, small-scale +GLMs for ATS. Results show that GLMs can be fine-tuned to achieve adequate, +though not state-of-the-art, performance. In addition to ATS, we take small +steps towards analyzing models' capacity for generating feedback by prompting +GLMs to explain their scores. Model-generated feedback shows promise, but +requires more rigorous evaluation focused on targeted use cases. + +
+
+ comment: 21 pages, 1 figure +
+
+
+
+
+ + ☆ Let it shine: Autofluorescence of Papanicolaou-stain improves AI-based + cytological oral cancer detection + + +
+ Oral cancer is a global health challenge. It is treatable if detected early, +but it is often fatal in late stages. There is a shift from the invasive and +time-consuming tissue sampling and histological examination, toward +non-invasive brush biopsies and cytological examination. Reliable +computer-assisted methods are essential for cost-effective and accurate +cytological analysis, but the lack of detailed cell-level annotations impairs +model effectiveness. This study aims to improve AI-based oral cancer detection +using multimodal imaging and deep fusion. We combine brightfield and +fluorescence whole slide microscopy imaging to analyze Papanicolaou-stained +liquid-based cytology slides of brush biopsies collected from both healthy and +cancer patients. Due to limited cytological annotations, we utilize a weakly +supervised deep learning approach using only patient-level labels. We evaluate +various multimodal fusion strategies, including early, late, and three recent +intermediate fusion methods. Our results show: (i) fluorescence imaging of +Papanicolaou-stained samples provides substantial diagnostic information; (ii) +multimodal fusion enhances classification and cancer detection accuracy over +single-modality methods. Intermediate fusion is the leading method among the +studied approaches. Specifically, the Co-Attention Fusion Network (CAFNet) +model excels with an F1 score of 83.34% and accuracy of 91.79%, surpassing +human performance on the task. Additional tests highlight the need for precise +image registration to optimize multimodal analysis benefits. This study +advances cytopathology by combining deep learning and multimodal imaging to +enhance early, non-invasive detection of oral cancer, improving diagnostic +accuracy and streamlining clinical workflows. The developed pipeline is also +applicable in other cytological settings. Our codes and dataset are available +online for further research. + +
+
+ comment: 16 pages, 12 figures, 11 tables +
+
+
+
+
+ + ☆ Research on target detection method of distracted driving behavior based + on improved YOLOv8 + + +
+ With the development of deep learning technology, the detection and
+classification of distracted driving behaviour requires higher accuracy.
+Existing deep learning-based methods are computationally intensive and suffer
+from parameter redundancy, limiting their efficiency and accuracy in practical
+applications. To solve this problem, this study proposes an improved YOLOv8
+detection method based on the original YOLOv8 model by integrating the BoTNet
+module, GAM attention mechanism and EIoU loss function. By optimising the
+feature extraction and multi-scale feature fusion strategies, the training and
+inference processes are simplified, and the detection accuracy and efficiency
+are significantly improved. Experimental results show that the improved model
+performs well in both detection speed and accuracy, reaching an accuracy of
+99.4%. The model is also smaller and easy to deploy, able to identify and
+classify distracted driving behaviours in real time, provide timely warnings,
+and enhance driving safety.
+
+
+
+
+
+
+ + ♻ ☆ ImageFlowNet: Forecasting Multiscale Trajectories of Disease Progression + with Irregularly-Sampled Longitudinal Medical Images + + +
+ The forecasting of disease progression from images is a holy grail for +clinical decision making. However, this task is complicated by the inherent +high dimensionality, temporal sparsity and sampling irregularity in +longitudinal image acquisitions. Existing methods often rely on extracting +hand-crafted features and performing time-series analysis in this vector space, +leading to a loss of rich spatial information within the images. To overcome +these challenges, we introduce ImageFlowNet, a novel framework that learns +latent-space flow fields that evolve multiscale representations in joint +embedding spaces using neural ODEs and SDEs to model disease progression in the +image domain. Notably, ImageFlowNet learns multiscale joint representation +spaces by combining cohorts of patients together so that information can be +transferred between the patient samples. The dynamics then provide plausible +trajectories of progression, with the SDE providing alternative trajectories +from the same starting point. We provide theoretical insights that support our +formulation of ODEs, and motivate our regularizations involving high-level +visual features, latent space organization, and trajectory smoothness. We then +demonstrate ImageFlowNet's effectiveness through empirical evaluations on three +longitudinal medical image datasets depicting progression in retinal geographic +atrophy, multiple sclerosis, and glioblastoma. + +
+
+ comment: Included reference to codebase. Added acknowledgements +
+
+
+
+
+ + ♻ ☆ LtU-ILI: An All-in-One Framework for Implicit Inference in Astrophysics + and Cosmology + + +
+ This paper presents the Learning the Universe Implicit Likelihood Inference +(LtU-ILI) pipeline, a codebase for rapid, user-friendly, and cutting-edge +machine learning (ML) inference in astrophysics and cosmology. The pipeline +includes software for implementing various neural architectures, training +schemata, priors, and density estimators in a manner easily adaptable to any +research workflow. It includes comprehensive validation metrics to assess +posterior estimate coverage, enhancing the reliability of inferred results. +Additionally, the pipeline is easily parallelizable and is designed for +efficient exploration of modeling hyperparameters. To demonstrate its +capabilities, we present real applications across a range of astrophysics and +cosmology problems, such as: estimating galaxy cluster masses from X-ray +photometry; inferring cosmology from matter power spectra and halo point +clouds; characterizing progenitors in gravitational wave signals; capturing +physical dust parameters from galaxy colors and luminosities; and establishing +properties of semi-analytic models of galaxy formation. We also include +exhaustive benchmarking and comparisons of all implemented methods as well as +discussions about the challenges and pitfalls of ML inference in astronomical +sciences. All code and examples are made publicly available at +https://github.com/maho3/ltu-ili. + +
+
+ comment: 22 pages, 10 figures, accepted in the Open Journal of Astrophysics. + Code available at https://github.com/maho3/ltu-ili +
+
+
+
+
+ + ♻ ☆ Sparse Variational Contaminated Noise Gaussian Process Regression with + Applications in Geomagnetic Perturbations Forecasting + + +
+ Gaussian Processes (GP) have become popular machine-learning methods for +kernel-based learning on datasets with complicated covariance structures. In +this paper, we present a novel extension to the GP framework using a +contaminated normal likelihood function to better account for heteroscedastic +variance and outlier noise. We propose a scalable inference algorithm based on +the Sparse Variational Gaussian Process (SVGP) method for fitting sparse +Gaussian process regression models with contaminated normal noise on large +datasets. We examine an application to geomagnetic ground perturbations, where +the state-of-the-art prediction model is based on neural networks. We show that +our approach yields shorter prediction intervals for similar coverage and +accuracy when compared to an artificial dense neural network baseline. + +
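+ The contaminated normal likelihood named above is a two-component Gaussian
+mixture in which a small fraction of observations is allowed a much wider
+variance. The sketch below shows that generic density; the mixing weight and
+inflation factor are illustrative defaults, not the authors' fitted values.
+
+ import numpy as np
+ from scipy.special import logsumexp
+
+ def contaminated_normal_logpdf(r, sigma, eps=0.05, kappa=10.0):
+     """Log-density of (1 - eps) * N(0, sigma^2) + eps * N(0, (kappa * sigma)^2).
+
+     Generic sketch of the likelihood family, not the paper's exact parameterization.
+     """
+     def norm_logpdf(x, s):
+         return -0.5 * np.log(2 * np.pi * s**2) - 0.5 * (x / s) ** 2
+     comps = np.stack([
+         np.log(1 - eps) + norm_logpdf(r, sigma),
+         np.log(eps) + norm_logpdf(r, kappa * sigma),
+     ])
+     return logsumexp(comps, axis=0)
+
+ residuals = np.array([0.1, -0.3, 4.0])  # the large residual is absorbed by the wide component
+ print(contaminated_normal_logpdf(residuals, sigma=0.5))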
+
+
+
+
+ + ♻ ☆ Analytics of Longitudinal System Monitoring Data for Performance + Prediction + + +
+ In recent years, several HPC facilities have started continuous monitoring of +their systems and jobs to collect performance-related data for understanding +performance and operational efficiency. Such data can be used to optimize the +performance of individual jobs and the overall system by creating data-driven +models that can predict the performance of jobs waiting in the scheduler queue. +In this paper, we model the performance of representative control jobs using +longitudinal system-wide monitoring data and machine learning to explore the +causes of performance variability. We analyze these prediction models in great +detail to identify the features that are dominant predictors of performance. We +demonstrate that such models can be application-agnostic and can be used for +predicting performance of applications that are not included in training. + +
+
+
+
+
+ + ♻ ☆ Naturalistic Music Decoding from EEG Data via Latent Diffusion Models + + +
+ In this article, we explore the potential of using latent diffusion models, a +family of powerful generative models, for the task of reconstructing +naturalistic music from electroencephalogram (EEG) recordings. Unlike simpler +music with limited timbres, such as MIDI-generated tunes or monophonic pieces, +the focus here is on intricate music featuring a diverse array of instruments, +voices, and effects, rich in harmonics and timbre. This study represents an +initial foray into achieving general music reconstruction of high-quality using +non-invasive EEG data, employing an end-to-end training approach directly on +raw data without the need for manual pre-processing and channel selection. We +train our models on the public NMED-T dataset and perform quantitative +evaluation proposing neural embedding-based metrics. We additionally perform +song classification based on the generated tracks. Our work contributes to the +ongoing research in neural decoding and brain-computer interfaces, offering +insights into the feasibility of using EEG data for complex auditory +information reconstruction. + +
+
+
+
+
+ + ♻ ☆ Enhancing Deep Neural Network Training Efficiency and Performance + through Linear Prediction + + +
+ Deep neural networks (DNNs) have achieved remarkable success in various
+fields, including computer vision and natural language processing. However,
+training an effective DNN model still poses challenges. This paper proposes a
+method to optimize the training effectiveness of DNNs, with the goal of
+improving model performance. First, based on the observation that DNN
+parameters follow certain laws during the training process, we identify the
+potential of parameter prediction for improving training efficiency and
+performance. Second, considering the magnitude of DNN model parameters,
+hardware limitations, and the noise tolerance of Stochastic Gradient Descent
+(SGD), we exploit a Parameter Linear Prediction (PLP) method to perform DNN
+parameter prediction. Finally, validations are carried out on representative
+backbones. Experimental results show that, compared to normal training under
+the same training conditions and epochs, the proposed PLP method allows the
+optimal model to obtain on average about 1% higher accuracy and a 0.01
+reduction in top-1/top-5 error for VGG16, ResNet18 and GoogLeNet on the
+CIFAR-100 dataset, which shows the effectiveness of the proposed method on
+different DNN structures and validates its capacity to enhance DNN training
+efficiency and performance.
+
+
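+ The core mechanic, linearly extrapolating parameters from recent checkpoints,
+can be sketched as follows. This is a generic reading of "parameter linear
+prediction"; where and how often the paper applies the jump, and the scaling it
+uses, may differ.
+
+ import copy
+ import torch
+
+ @torch.no_grad()
+ def predict_parameters(model, prev_state, step_gap, horizon):
+     """theta_pred = theta_now + (horizon / step_gap) * (theta_now - theta_prev)."""
+     predicted = copy.deepcopy(model.state_dict())
+     scale = horizon / step_gap
+     for name, now in model.state_dict().items():
+         if now.is_floating_point():
+             # extrapolate along the recent direction of parameter change
+             predicted[name] = now + scale * (now - prev_state[name])
+     return predicted
+
+ net = torch.nn.Linear(10, 2)
+ prev = copy.deepcopy(net.state_dict())     # snapshot taken step_gap steps earlier
+ # ... step_gap optimizer steps would normally happen here ...
+ net.load_state_dict(predict_parameters(net, prev, step_gap=100, horizon=100))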
+
+
+
+
+ + ♻ ☆ Zero-Shot ECG Classification with Multimodal Learning and Test-time + Clinical Knowledge Enhancement ICML2024 + + +
+ Electrocardiograms (ECGs) are non-invasive diagnostic tools crucial for
+detecting cardiac arrhythmic diseases in clinical practice. While ECG
+Self-supervised Learning (eSSL) methods show promise in representation learning
+from unannotated ECG data, they often overlook the clinical knowledge that can
+be found in reports. This oversight and the requirement for annotated samples
+for downstream tasks limit eSSL's versatility. In this work, we address these
+issues with the Multimodal ECG Representation Learning (MERL) framework.
+Through multimodal learning on ECG records and associated reports, MERL is
+capable of performing zero-shot ECG classification with text prompts,
+eliminating the need for training data in downstream tasks. At test time, we
+propose the Clinical Knowledge Enhanced Prompt Engineering (CKEPE) approach,
+which uses Large Language Models (LLMs) to exploit external expert-verified
+clinical knowledge databases, generating more descriptive prompts and reducing
+hallucinations in LLM-generated content to boost zero-shot classification.
+Based on MERL, we perform the first benchmark across six public ECG datasets,
+showing the superior performance of MERL compared against eSSL methods.
+Notably, MERL achieves an average AUC score of 75.2% in zero-shot
+classification (without training data), 3.2% higher than linear probed eSSL
+methods with 10% annotated training data, averaged across all six datasets.
+Code and models are available at https://github.com/cheliu-computation/MERL
+
+
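+ The zero-shot recipe implied here, comparing an ECG embedding against
+text-encoded class prompts, can be sketched generically. The encoders below are
+random stand-ins; the real MERL encoders and CKEPE prompt construction are not
+reproduced.
+
+ import numpy as np
+
+ def zero_shot_classify(ecg_embedding, prompt_embeddings, class_names):
+     """Pick the class whose prompt embedding is most similar to the ECG embedding."""
+     def normalize(v):
+         return v / (np.linalg.norm(v, axis=-1, keepdims=True) + 1e-8)
+     sims = normalize(prompt_embeddings) @ normalize(ecg_embedding)
+     return class_names[int(np.argmax(sims))], sims
+
+ rng = np.random.default_rng(0)
+ classes = ["normal sinus rhythm", "atrial fibrillation", "left bundle branch block"]
+ ecg_emb = rng.normal(size=256)                      # stand-in for the ECG encoder output
+ prompt_embs = rng.normal(size=(len(classes), 256))  # stand-ins for text-encoded prompts
+ print(zero_shot_classify(ecg_emb, prompt_embs, classes)[0])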
+
+ comment: Accepted by ICML2024 +
+
+
+
+
+ + ♻ ☆ On the consistency of hyper-parameter selection in value-based deep + reinforcement learning + + +
+ Deep reinforcement learning (deep RL) has achieved tremendous success on +various domains through a combination of algorithmic design and careful +selection of hyper-parameters. Algorithmic improvements are often the result of +iterative enhancements built upon prior approaches, while hyper-parameter +choices are typically inherited from previous methods or fine-tuned +specifically for the proposed technique. Despite their crucial impact on +performance, hyper-parameter choices are frequently overshadowed by algorithmic +advancements. This paper conducts an extensive empirical study focusing on the +reliability of hyper-parameter selection for value-based deep reinforcement +learning agents, including the introduction of a new score to quantify the +consistency and reliability of various hyper-parameters. Our findings not only +help establish which hyper-parameters are most critical to tune, but also help +clarify which tunings remain consistent across different training regimes. + +
+
+
+
+
+ + ♻ ☆ Efficient Evolutionary Search Over Chemical Space with Large Language + Models + + +
+ Molecular discovery, when formulated as an optimization problem, presents +significant computational challenges because optimization objectives can be +non-differentiable. Evolutionary Algorithms (EAs), often used to optimize +black-box objectives in molecular discovery, traverse chemical space by +performing random mutations and crossovers, leading to a large number of +expensive objective evaluations. In this work, we ameliorate this shortcoming +by incorporating chemistry-aware Large Language Models (LLMs) into EAs. Namely, +we redesign crossover and mutation operations in EAs using LLMs trained on +large corpora of chemical information. We perform extensive empirical studies +on both commercial and open-source models on multiple tasks involving property +optimization, molecular rediscovery, and structure-based drug design, +demonstrating that the joint usage of LLMs with EAs yields superior performance +over all baseline models across single- and multi-objective settings. We +demonstrate that our algorithm improves both the quality of the final solution +and convergence speed, thereby reducing the number of required objective +evaluations. Our code is available at http://github.com/zoom-wang112358/MOLLEO + +
+
+
+
+
+ + ♻ ☆ SINCERE: Supervised Information Noise-Contrastive Estimation REvisited + + +
+ The information noise-contrastive estimation (InfoNCE) loss function provides
+the basis of many self-supervised deep learning methods due to its strong
+empirical results and theoretical motivation. Previous work suggests a
+supervised contrastive (SupCon) loss to extend InfoNCE to learn from available
+class labels. This SupCon loss has been widely used due to reports of good
+empirical performance. However, in this work we find that the prior SupCon loss
+formulation has questionable justification because it can encourage some images
+from the same class to repel one another in the learned embedding space. This
+problematic intra-class repulsion gets worse as the number of images sharing
+one class label increases. We propose the Supervised InfoNCE REvisited
+(SINCERE) loss as a theoretically justified supervised extension of InfoNCE
+that eliminates intra-class repulsion. Experiments show that SINCERE leads to
+better separation of embeddings from different classes and improves transfer
+learning classification accuracy. We additionally utilize probabilistic
+modeling to derive an information-theoretic bound that relates the SINCERE loss
+to the symmetrized KL divergence between data-generating distributions for a
+target class and all other classes.
+
+
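+ One plausible reading of "eliminating intra-class repulsion" is to keep only
+the positive pair and other-class samples in the softmax denominator, so
+same-class examples never push each other apart. The sketch below implements
+that reading; it is not claimed to be the verbatim SINCERE loss.
+
+ import torch
+ import torch.nn.functional as F
+
+ def supervised_nce_without_intraclass_repulsion(z, y, tau=0.1):
+     """Supervised InfoNCE-style loss whose denominator excludes other same-class samples."""
+     z = F.normalize(z, dim=1)
+     sim = z @ z.t() / tau                       # (N, N) scaled cosine similarities
+     n = z.size(0)
+     same = y.unsqueeze(0) == y.unsqueeze(1)     # same-class mask
+     eye = torch.eye(n, dtype=torch.bool, device=z.device)
+     pos = same & ~eye                           # positives: same class, not self
+     neg = ~same                                 # negatives: different class only
+
+     losses = []
+     for i in range(n):
+         neg_exp = torch.exp(sim[i][neg[i]]).sum()
+         for j in torch.nonzero(pos[i]).flatten():
+             p = torch.exp(sim[i, j])
+             losses.append(-torch.log(p / (p + neg_exp)))
+     return torch.stack(losses).mean()
+
+ z = torch.randn(8, 16, requires_grad=True)
+ y = torch.tensor([0, 0, 1, 1, 2, 2, 0, 1])
+ print(supervised_nce_without_intraclass_repulsion(z, y))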
+
+
+
+
+ + ♻ ☆ Homomorphism Autoencoder -- Learning Group Structured Representations + from Observed Transitions ICML2023 + + +
+ How agents can learn internal models that veridically represent interactions
+with the real world is a largely open question. As machine learning is moving
+towards representations containing not just observational but also
+interventional knowledge, we study this problem using tools from representation
+learning and group theory. We propose methods enabling an agent acting upon the
+world to learn internal representations of sensory information that are
+consistent with actions that modify it. We use an autoencoder equipped with a
+group representation acting on its latent space, trained using an
+equivariance-derived loss in order to enforce a suitable homomorphism property
+on the group representation. In contrast to existing work, our approach does
+not require prior knowledge of the group and does not restrict the set of
+actions the agent can perform. We motivate our method theoretically, and show
+empirically that it can learn a group representation of the actions, thereby
+capturing the structure of the set of transformations applied to the
+environment. We further show that this allows agents to predict the effect of
+sequences of future actions with improved accuracy.
+
+
+
+ comment: Accepted at ICML2023, Presented at the Symmetry and Geometry in + Neural Representations Workshop (NeurReps) @ NeurIPS2022, 26 pages, 17 + figures +
+
+
+
+
+ + ♻ ☆ Diffusion Forcing: Next-token Prediction Meets Full-Sequence Diffusion + + +
+ This paper presents Diffusion Forcing, a new training paradigm where a +diffusion model is trained to denoise a set of tokens with independent +per-token noise levels. We apply Diffusion Forcing to sequence generative +modeling by training a causal next-token prediction model to generate one or +several future tokens without fully diffusing past ones. Our approach is shown +to combine the strengths of next-token prediction models, such as +variable-length generation, with the strengths of full-sequence diffusion +models, such as the ability to guide sampling to desirable trajectories. Our +method offers a range of additional capabilities, such as (1) rolling-out +sequences of continuous tokens, such as video, with lengths past the training +horizon, where baselines diverge and (2) new sampling and guiding schemes that +uniquely profit from Diffusion Forcing's variable-horizon and causal +architecture, and which lead to marked performance gains in decision-making and +planning tasks. In addition to its empirical success, our method is proven to +optimize a variational lower bound on the likelihoods of all subsequences of +tokens drawn from the true joint distribution. Project website: +https://boyuan.space/diffusion-forcing/ + +
+
+ comment: Project website: https://boyuan.space/diffusion-forcing/ +
+
+
+
+
+ + ♻ ☆ Reproducibility in Machine Learning-based Research: Overview, Barriers + and Drivers + + +
+ Research in various fields is currently experiencing challenges regarding the
+reproducibility of results. This problem is also prevalent in machine learning
+(ML) research. The issue arises, for example, due to unpublished data and/or
+source code and the sensitivity of ML training conditions. Although different
+solutions have been proposed to address this issue, such as using ML platforms,
+the level of reproducibility in ML-driven research remains unsatisfactory.
+Therefore, in this article, we discuss the reproducibility of ML-driven
+research with three main aims: (i) identifying the barriers to reproducibility
+when applying ML in research and categorizing the barriers into different
+types of reproducibility (description, code, data, and experiment
+reproducibility), (ii) discussing potential drivers such as tools, practices,
+and interventions that support ML reproducibility, and distinguishing between
+technology-driven drivers, procedural drivers, and drivers related to awareness
+and education, and (iii) mapping the drivers to the barriers. With this work,
+we hope to provide insights and to contribute to the decision-making process
+regarding the adoption of different solutions to support ML reproducibility.
+
+
+
+ comment: Pre-print of submission for the AI Magazine - comments to this + pre-print are very welcome +
+
+
+
+
+ + ♻ ☆ Contractual Reinforcement Learning: Pulling Arms with Invisible Hands + + +
+ The agency problem emerges in today's large scale machine learning tasks, +where the learners are unable to direct content creation or enforce data +collection. In this work, we propose a theoretical framework for aligning +economic interests of different stakeholders in the online learning problems +through contract design. The problem, termed \emph{contractual reinforcement +learning}, naturally arises from the classic model of Markov decision +processes, where a learning principal seeks to optimally influence the agent's +action policy for their common interests through a set of payment rules +contingent on the realization of next state. For the planning problem, we +design an efficient dynamic programming algorithm to determine the optimal +contracts against the far-sighted agent. For the learning problem, we introduce +a generic design of no-regret learning algorithms to untangle the challenges +from robust design of contracts to the balance of exploration and exploitation, +reducing the complexity analysis to the construction of efficient search +algorithms. For several natural classes of problems, we design tailored search +algorithms that provably achieve $\tilde{O}(\sqrt{T})$ regret. We also present +an algorithm with $\tilde{O}(T^{2/3})$ for the general problem that improves +the existing analysis in online contract design with mild technical +assumptions. + +
+
+
+
+
+ + ♻ ☆ Observational Scaling Laws and the Predictability of Language Model + Performance + + +
+ Understanding how language model performance varies with scale is critical to
+benchmark and algorithm development. Scaling laws are one approach to building
+this understanding, but the requirement of training models across many
+different scales has limited their use. We propose an alternative,
+observational approach that bypasses model training and instead builds scaling
+laws from ~80 publicly available models. Building a single scaling law from
+multiple model families is challenging due to large variations in their
+training compute efficiencies and capabilities. However, we show that these
+variations are consistent with a simple, generalized scaling law where language
+model performance is a function of a low-dimensional capability space, and
+model families only vary in their efficiency in converting training compute to
+capabilities. Using this approach, we show the surprising predictability of
+complex scaling phenomena: we show that several emergent phenomena follow a
+smooth, sigmoidal behavior and are predictable from small models; we show that
+the agent performance of models such as GPT-4 can be precisely predicted from
+simpler non-agentic benchmarks; and we show how to predict the impact of
+post-training interventions like Chain-of-Thought and Self-Consistency as
+language model capabilities continue to improve.
+
+
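+ A toy version of the observational recipe, compressing many benchmark scores
+into one capability axis and fitting a sigmoid from that axis to a downstream
+metric, is sketched below on synthetic data. The shapes and numbers are
+invented for illustration; the paper uses ~80 public models and real benchmarks.
+
+ import numpy as np
+ from sklearn.decomposition import PCA
+ from scipy.optimize import curve_fit
+
+ rng = np.random.default_rng(0)
+ n_models, n_benchmarks = 40, 6
+ capability = rng.normal(size=n_models)                       # latent capability
+ scores = capability[:, None] + 0.2 * rng.normal(size=(n_models, n_benchmarks))
+
+ pc1 = PCA(n_components=1).fit_transform(scores).ravel()      # observed capability axis
+
+ def sigmoid(x, lo, hi, slope, mid):
+     return lo + (hi - lo) / (1.0 + np.exp(-slope * (x - mid)))
+
+ downstream = sigmoid(capability, 0.1, 0.9, 2.0, 0.0) + 0.02 * rng.normal(size=n_models)
+ params, _ = curve_fit(sigmoid, pc1, downstream, p0=[0.0, 1.0, 1.0, 0.0], maxfev=10000)
+ print("fitted sigmoid parameters:", params)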
+
+
+
+
+ + ♻ ☆ Characteristic Learning for Provable One Step Generation + + +
+ We propose the characteristic generator, a novel one-step generative model
+that combines the efficiency of sampling in Generative Adversarial Networks
+(GANs) with the stable performance of flow-based models. Our model is driven by
+characteristics, along which the probability density transport can be described
+by ordinary differential equations (ODEs). Specifically, we estimate the
+velocity field through nonparametric regression and utilize the Euler method to
+solve the probability flow ODE, generating a series of discrete approximations
+to the characteristics. We then use a deep neural network to fit these
+characteristics, ensuring a one-step mapping that effectively pushes the prior
+distribution towards the target distribution. On the theoretical side, we
+analyze the errors in velocity matching, Euler discretization, and
+characteristic fitting to establish a non-asymptotic convergence rate for the
+characteristic generator in 2-Wasserstein distance. To the best of our
+knowledge, this is the first thorough analysis for simulation-free one-step
+generative models. Additionally, our analysis refines the error analysis of
+flow-based generative models in prior works. We apply our method to both
+synthetic and real datasets, and the results demonstrate that the
+characteristic generator achieves high generation quality with just a single
+evaluation of the neural network.
+
+
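+ The Euler-integrated characteristics can be illustrated with a toy velocity
+field. The field below is invented for illustration; the pairs it produces are
+the kind of (start, end) data a one-step generator would then be regressed
+onto, a fitting step omitted here.
+
+ import numpy as np
+
+ def euler_characteristics(x0, velocity, n_steps=100, t0=0.0, t1=1.0):
+     """Integrate dx/dt = velocity(x, t) with the Euler method and return the endpoint."""
+     x = np.array(x0, dtype=float)
+     dt = (t1 - t0) / n_steps
+     for k in range(n_steps):
+         t = t0 + k * dt
+         x = x + dt * velocity(x, t)
+     return x
+
+ def toward_target(x, t, target=np.array([2.0, 2.0])):
+     # toy velocity field whose flow contracts samples toward the point (2, 2)
+     return target - x
+
+ rng = np.random.default_rng(0)
+ noise = rng.normal(size=(5, 2))
+ endpoints = np.stack([euler_characteristics(z, toward_target) for z in noise])
+ print(endpoints.round(3))   # rows pulled most of the way toward [2, 2]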
+
+
+
+
+ + ♻ ☆ Enabling Large Batch Size Training for DNN Models Beyond the Memory + Limit While Maintaining Performance + + +
+ Recent deep learning models are difficult to train using a large batch size,
+because commodity machines may not have enough memory to accommodate both the
+model and a large data batch. The batch size is one of the hyper-parameters
+used in model training, and it is dependent on and limited by the target
+machine's memory capacity, because the batch can only fit into the memory
+remaining after the model is loaded. Moreover, the data item size is also an
+important factor, because if each data item is larger, the batch size that can
+fit into the remaining memory becomes smaller. This paper proposes a method
+called Micro-Batch Processing (MBP) to address this problem. This method helps
+train deep learning models by splitting a batch into smaller batches that fit
+in the remaining memory and processing them sequentially. After processing the
+small batches individually, a loss normalization algorithm based on gradient
+accumulation is used to maintain performance. The purpose of our method is to
+allow deep learning models to be trained with batch sizes that exceed the
+memory capacity of a system without increasing the memory size or using
+multiple devices (GPUs).
+
+
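+ Gradient accumulation over micro-batches is a standard pattern; a minimal
+sketch of it follows. Dividing each micro-batch loss by the number of
+micro-batches is one common normalization that keeps the accumulated gradient
+close to the full-batch gradient for equal-sized chunks; the paper's exact
+normalization may differ.
+
+ import torch
+
+ def train_step_with_micro_batches(model, loss_fn, optimizer, inputs, targets, micro_batch_size):
+     """Split one logical batch into micro-batches and accumulate gradients."""
+     optimizer.zero_grad()
+     chunks_x = inputs.split(micro_batch_size)
+     chunks_y = targets.split(micro_batch_size)
+     n = len(chunks_x)
+     total = 0.0
+     for x, y in zip(chunks_x, chunks_y):
+         loss = loss_fn(model(x), y) / n    # normalize so gradients sum to the full-batch mean
+         loss.backward()                    # gradients accumulate in .grad
+         total += loss.item()
+     optimizer.step()
+     return total
+
+ model = torch.nn.Linear(20, 3)
+ opt = torch.optim.SGD(model.parameters(), lr=0.1)
+ x, y = torch.randn(64, 20), torch.randint(0, 3, (64,))
+ print(train_step_with_micro_batches(model, torch.nn.CrossEntropyLoss(), opt, x, y, micro_batch_size=16))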
+
+ comment: Published in IEEE Access +
+
+
+
+
+ + ♻ ☆ Structure-based drug design by denoising voxel grids + + +
+ We present VoxBind, a new score-based generative model for 3D molecules +conditioned on protein structures. Our approach represents molecules as 3D +atomic density grids and leverages a 3D voxel-denoising network for learning +and generation. We extend the neural empirical Bayes formalism (Saremi & +Hyvarinen, 2019) to the conditional setting and generate structure-conditioned +molecules with a two-step procedure: (i) sample noisy molecules from the +Gaussian-smoothed conditional distribution with underdamped Langevin MCMC using +the learned score function and (ii) estimate clean molecules from the noisy +samples with single-step denoising. Compared to the current state of the art, +our model is simpler to train, significantly faster to sample from, and +achieves better results on extensive in silico benchmarks -- the generated +molecules are more diverse, exhibit fewer steric clashes, and bind with higher +affinity to protein pockets. The code is available at +https://github.com/genentech/voxbind/. + +
+
+
+
+
+ + ♻ ☆ Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning + in Particle Detector Readout + + +
+ Embedded field programmable gate array (eFPGA) technology allows the +implementation of reconfigurable logic within the design of an +application-specific integrated circuit (ASIC). This approach offers the low +power and efficiency of an ASIC along with the ease of FPGA configuration, +particularly beneficial for the use case of machine learning in the data +pipeline of next-generation collider experiments. An open-source framework +called "FABulous" was used to design eFPGAs using 130 nm and 28 nm CMOS +technology nodes, which were subsequently fabricated and verified through +testing. The capability of an eFPGA to act as a front-end readout chip was +assessed using simulation of high energy particles passing through a silicon +pixel sensor. A machine learning-based classifier, designed for reduction of +sensor data at the source, was synthesized and configured onto the eFPGA. A +successful proof-of-concept was demonstrated through reproduction of the +expected algorithm result on the eFPGA with perfect accuracy. Further +development of the eFPGA technology and its application to collider detector +readout is discussed. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Type-II Saddles and Probabilistic Stability of Stochastic Gradient + Descent + + +
+ Characterizing and understanding the dynamics of stochastic gradient descent +(SGD) around saddle points remains an open problem. We first show that saddle +points in neural networks can be divided into two types, among which the +Type-II saddles are especially difficult to escape from because the gradient +noise vanishes at the saddle. The dynamics of SGD around these saddles are thus +to leading order described by a random matrix product process, and it is thus +natural to study the dynamics of SGD around these saddles using the notion of +probabilistic stability and the related Lyapunov exponent. Theoretically, we +link the study of SGD dynamics to well-known concepts in ergodic theory, which +we leverage to show that saddle points can be either attractive or repulsive +for SGD, and its dynamics can be classified into four different phases, +depending on the signal-to-noise ratio in the gradient close to the saddle. + +
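+ For reference, the top Lyapunov exponent of the random matrix product
+mentioned above is the standard quantity
+
+ \[ \lambda=\lim_{n\to\infty}\frac{1}{n}\,\mathbb{E}\big[\log\lVert M_{n}M_{n-1}\cdots M_{1}\rVert\big], \]
+
+ and, in the usual reading of probabilistic stability, the linearized dynamics
+around the saddle are attractive when $\lambda<0$ and repulsive when
+$\lambda>0$; this is background terminology rather than a result stated in the
+abstract.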
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Structured Partial Stochasticity in Bayesian Neural Networks + + +
+ Bayesian neural network posterior distributions have a great number of modes +that correspond to the same network function. The abundance of such modes can +make it difficult for approximate inference methods to do their job. Recent +work has demonstrated the benefits of partial stochasticity for approximate +inference in Bayesian neural networks; inference can be less costly and +performance can sometimes be improved. I propose a structured way to select the +deterministic subset of weights that removes neuron permutation symmetries, and +therefore the corresponding redundant posterior modes. With a drastically +simplified posterior distribution, the performance of existing approximate +inference schemes is found to be greatly improved. + +
+
+ comment: Accepted at 6th Symposium on Advances in Approximate Bayesian + Inference (non-archival track) +
+
+
+
+
+ + ♻ ☆ Open-Source Conversational AI with SpeechBrain 1.0 + + +
+ SpeechBrain is an open-source Conversational AI toolkit based on PyTorch, +focused particularly on speech processing tasks such as speech recognition, +speech enhancement, speaker recognition, text-to-speech, and much more. It +promotes transparency and replicability by releasing both the pre-trained +models and the complete "recipes" of code and algorithms required for training +them. This paper presents SpeechBrain 1.0, a significant milestone in the +evolution of the toolkit, which now has over 200 recipes for speech, audio, and +language processing tasks, and more than 100 models available on Hugging Face. +SpeechBrain 1.0 introduces new technologies to support diverse learning +modalities, Large Language Model (LLM) integration, and advanced decoding +strategies, along with novel models, tasks, and modalities. It also includes a +new benchmark repository, offering researchers a unified platform for +evaluating models across diverse tasks + +
+
+ comment: Submitted to JMLR (Machine Learning Open Source Software) +
+
+
+
+
+ + ♻ ☆ Logistics Hub Location Optimization: A K-Means and P-Median Model Hybrid + Approach Using Road Network Distances + + +
+ Logistics hubs play a pivotal role in last-mile delivery distance; even a
+slight increase in distance negatively impacts the e-commerce industry's
+business while also increasing its carbon footprint. The growth of this
+industry, particularly after Covid-19, has further intensified the need for
+optimized allocation of resources in an urban environment. In this study, we
+use a hybrid approach to optimize the placement of logistics hubs. The approach
+sequentially employs different techniques. Initially, delivery points are
+clustered using K-Means according to their spatial locations. The clustering
+method utilizes road network distances as opposed to Euclidean distances;
+non-road-network-based approaches have been avoided since they lead to
+erroneous and misleading results. Finally, hubs are located using the P-Median
+method, which also incorporates the number of deliveries and population as
+weights. Real-world delivery data from Muller and Phipps (M&P) is used to
+demonstrate the effectiveness of the approach. Serving deliveries from the
+optimal hub locations results in a saving of 815 meters (10%) per delivery.
+
+
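+ A toy version of the two-stage pipeline is sketched below: cluster delivery
+points, then pick a demand-weighted 1-median within each cluster. The random
+coordinates, weights, and Euclidean distance matrix are stand-ins; real usage
+would substitute road-network distances and actual delivery counts, and the
+per-cluster 1-median is a simplification of the full P-Median model.
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ rng = np.random.default_rng(0)
+ points = rng.uniform(0, 10, size=(200, 2))            # delivery locations
+ weights = rng.integers(1, 20, size=200)                # deliveries per location
+ dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)  # stand-in distances
+
+ labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(points)
+
+ hubs = []
+ for c in range(4):
+     idx = np.where(labels == c)[0]
+     # weighted 1-median restricted to the cluster's own candidate locations
+     cost = (dist[np.ix_(idx, idx)] * weights[idx][None, :]).sum(axis=1)
+     hubs.append(idx[np.argmin(cost)])
+ print("hub indices:", hubs)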
+
+
+
+
+ + ♻ ☆ Vision-LSTM: xLSTM as Generic Vision Backbone + + +
+ Transformers are widely used as generic backbones in computer vision, despite +being initially introduced for natural language processing. Recently, the Long +Short-Term Memory (LSTM) has been extended to a scalable and performant +architecture - the xLSTM - which overcomes long-standing LSTM limitations via +exponential gating and a parallelizable matrix memory structure. In this report, +we introduce Vision-LSTM (ViL), an adaptation of the xLSTM building blocks to +computer vision. ViL comprises a stack of xLSTM blocks where odd blocks process +the sequence of patch tokens from top to bottom while even blocks go from +bottom to top. Experiments show that ViL holds promise for further deployment +as a new generic backbone for computer vision architectures. + +&#13;
+
+
+
+
+ + ♻ ☆ A Survey on Popularity Bias in Recommender Systems + + +
+ Recommender systems help people find relevant content in a personalized way. +One main promise of such systems is that they are able to increase the +visibility of items in the long tail, i.e., the lesser-known items in a +catalogue. Existing research, however, suggests that in many situations today's +recommendation algorithms instead exhibit a popularity bias, meaning that they +often focus on rather popular items in their recommendations. Such a bias may +not only limit the value of the recommendations for consumers and +providers in the short run, but it may also cause undesired reinforcement +effects over time. In this paper, we discuss the potential reasons for +popularity bias and review existing approaches to detect, quantify and mitigate +popularity bias in recommender systems. Our survey, therefore, includes both an +overview of the computational metrics used in the literature as well as a +review of the main technical approaches to reduce the bias. Furthermore, we +critically discuss today's literature, where we observe that the research is +almost entirely based on computational experiments and on certain assumptions +regarding the practical effects of including long-tail items in the +recommendations. + +&#13;
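One of the simplest computational metrics discussed in this literature is average recommendation popularity (ARP); a minimal sketch, with made-up interaction counts, is shown below. It is illustrative only and does not reflect the full set of metrics covered by the survey.

```python
import numpy as np

def average_recommendation_popularity(rec_lists, item_popularity):
    """Average popularity of recommended items per user, then averaged over users.
    `item_popularity[i]` is e.g. the number of training interactions item i received."""
    per_user = [np.mean([item_popularity[i] for i in recs]) for recs in rec_lists]
    return float(np.mean(per_user))

# Toy example: 3 users, top-3 recommendations each.
item_popularity = {0: 500, 1: 120, 2: 40, 3: 5, 4: 2}
rec_lists = [[0, 1, 2], [0, 1, 3], [3, 4, 2]]
print(average_recommendation_popularity(rec_lists, item_popularity))
```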
+
+
+
+
+ + ♻ ☆ Fusing Audio and Metadata Embeddings Improves Language-based Audio + Retrieval + + +
+ Matching raw audio signals with textual descriptions requires understanding +the audio's content and the description's semantics and then drawing +connections between the two modalities. This paper investigates a hybrid +retrieval system that utilizes audio metadata as an additional clue to +understand the content of audio signals before matching them with textual +queries. We experimented with metadata often attached to audio recordings, such +as keywords and natural-language descriptions, and we investigated late and +mid-level fusion strategies to merge audio and metadata. Our hybrid approach +with keyword metadata and late fusion improved the retrieval performance over a +content-based baseline by 2.36 and 3.69 pp. mAP@10 on the ClothoV2 and +AudioCaps benchmarks, respectively. + +
+
+ comment: In Proceedings of the 32nd European Signal Processing Conference, + EUSIPCO 2024 +
+
+
+
+
+ + ♻ ☆ Selective Pre-training for Private Fine-tuning + + +
+ Text prediction models, when used in applications like email clients or word +processors, must protect user data privacy and adhere to model size +constraints. These constraints are crucial to meet memory and inference time +requirements, as well as to reduce inference costs. Building small, fast, and +private domain-specific language models is a thriving area of research. In this +work, we show that a careful pre-training on a \emph{subset} of the public +dataset that is guided by the private dataset is crucial to train small +language models with differential privacy. On standard benchmarks, small models +trained with our new framework achieve state-of-the-art performance. In +addition to performance improvements, our results demonstrate that smaller +models, through careful pre-training and private fine-tuning, can match the +performance of much larger models that do not have access to private data. This +underscores the potential of private learning for model compression and +enhanced efficiency. + +
+
+ comment: Transactions on Machine Learning Research. Code available at + https://github.com/dayu11/selective_pretraining_for_private_finetuning +
+
+
+
+
+ + ♻ ☆ Deep Imbalanced Regression to Estimate Vascular Age from PPG Data: a + Novel Digital Biomarker for Cardiovascular Health + + +
+ Photoplethysmography (PPG) is emerging as a crucial tool for monitoring human +hemodynamics, with recent studies highlighting its potential in assessing +vascular aging through deep learning. However, real-world age distributions are +often imbalanced, posing significant challenges for deep learning models. In +this paper, we introduce a novel, simple, and effective loss function named the +Dist Loss to address deep imbalanced regression tasks. We trained a +one-dimensional convolutional neural network (Net1D) incorporating the Dist +Loss on the extensive UK Biobank dataset (n=502,389) to estimate vascular age +from PPG signals and validate its efficacy in characterizing cardiovascular +health. The model's performance was validated on a 40% held-out test set, +achieving state-of-the-art results, especially in regions with small sample +sizes. Furthermore, we divided the population into three subgroups based on the +difference between predicted vascular age and chronological age: less than -10 +years, between -10 and 10 years, and greater than 10 years. We analyzed the +relationship between predicted vascular age and several cardiovascular events +over a follow-up period of up to 10 years, including death, coronary heart +disease, and heart failure. Our results indicate that the predicted vascular +age has significant potential to reflect an individual's cardiovascular health +status. Our code will be available at https://github.com/Ngk03/AI-vascular-age. + +
+
+
+
+
+ + ♻ ☆ An Interpretable Alternative to Neural Representation Learning for + Rating Prediction -- Transparent Latent Class Modeling of User Reviews + + +
+ Nowadays, neural network (NN) and deep learning (DL) techniques are widely +adopted in many applications, including recommender systems. Given the sparse +and stochastic nature of collaborative filtering (CF) data, recent works have +critically analyzed the effective improvement of neural-based approaches +compared to simpler and often transparent algorithms for recommendation. +Previous results showed that NN and DL models can be outperformed by +traditional algorithms in many tasks. Moreover, given the largely black-box +nature of neural-based methods, interpretable results are not naturally +obtained. Following on this debate, we first present a transparent +probabilistic model that topologically organizes user and product latent +classes based on the review information. In contrast to popular neural +techniques for representation learning, we readily obtain a statistical, +visualization-friendly tool that can be easily inspected to understand user and +product characteristics from a textual-based perspective. Then, given the +limitations of common embedding techniques, we investigate the possibility of +using the estimated interpretable quantities as model input for a rating +prediction task. To contribute to the recent debates, we evaluate our results +in terms of both capacity for interpretability and predictive performances in +comparison with popular text-based neural approaches. The results demonstrate +that the proposed latent class representations can yield competitive predictive +performances, compared to popular, but difficult-to-interpret approaches. + +
+
+
+
+
+ + ♻ ☆ Task-Synchronized Recurrent Neural Networks + + +
+ Data are often sampled irregularly in time. Dealing with this using Recurrent +Neural Networks (RNNs) traditionally involved ignoring the irregularity, feeding the +time differences as additional inputs, or resampling the data. All these +methods have their shortcomings. We propose an elegant, straightforward +alternative approach in which the RNN itself is, in effect, resampled in time to +match the timing of the data or the task at hand. We use the Echo State Network (ESN) +and the Gated Recurrent Unit (GRU) as the basis for our solution. Such RNNs can be +seen as discretizations of continuous-time dynamical systems, which gives a +solid theoretical ground to our approach. Our Task-Synchronized ESN (TSESN) and +GRU (TSGRU) models allow for a direct model time setting and require no +additional training, parameter tuning, or computation (solving differential +equations or interpolating data) compared to their regular counterparts, thus +retaining their original efficiency. We confirm empirically that our models can +effectively compensate for the time-non-uniformity of the data and demonstrate +that they compare favorably to data resampling, classical RNN methods, and +alternative RNN models proposed to deal with time irregularities on several +real-world nonuniform-time datasets. We open-source the code at +https://github.com/oshapio/task-synchronized-RNNs. + +&#13;
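The abstract does not spell out the exact TSESN/TSGRU update, so the following is only a plausible sketch of the idea: treat the recurrent cell as a discretization of a continuous-time system and blend the previous state with the cell output according to the observed time step. The `tau` constant and the clamping are assumptions, not the paper's formulation.

```python
import torch
import torch.nn as nn

class TimeScaledGRUCell(nn.Module):
    """Illustrative GRU cell whose state update is scaled by the (irregular) time step:
    h_t = (1 - a) * h_{t-1} + a * GRUCell(x_t, h_{t-1}), with a = clamp(dt / tau, 0, 1)."""
    def __init__(self, input_size, hidden_size, tau=1.0):
        super().__init__()
        self.cell = nn.GRUCell(input_size, hidden_size)
        self.tau = tau

    def forward(self, x, h, dt):
        a = torch.clamp(dt / self.tau, 0.0, 1.0).unsqueeze(-1)  # (batch, 1)
        h_new = self.cell(x, h)
        return (1 - a) * h + a * h_new

cell = TimeScaledGRUCell(input_size=3, hidden_size=8)
h = torch.zeros(4, 8)
x = torch.randn(4, 3)
dt = torch.tensor([0.1, 0.5, 1.0, 2.0])   # irregular sampling intervals per sequence
h = cell(x, h, dt)
print(h.shape)
```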
+
+ comment: The 1st version was written in May 2019 and double-blind reviewed for + a prominent conference. A major update. We changed the name of the article + and methods to an arguably more precise one, and because a very similar title + has been published in the meantime. We've rewritten much of the text, + connected to the current literature, redone some experiments, figures, + discussion, published source code +
+
+
+
+
+ + ♻ ☆ Explaining Deep Learning for ECG Analysis: Building Blocks for Auditing + and Knowledge Discovery + + +
+ Deep neural networks have become increasingly popular for analyzing ECG data +because of their ability to accurately identify cardiac conditions and hidden +clinical factors. However, the lack of transparency due to the black box nature +of these models is a common concern. To address this issue, explainable AI +(XAI) methods can be employed. In this study, we present a comprehensive +analysis of post-hoc XAI methods, investigating the local (attributions per +sample) and global (based on domain expert concepts) perspectives. We have +established a set of sanity checks to identify sensible attribution methods, +and we provide quantitative evidence in accordance with expert rules. This +dataset-wide analysis goes beyond anecdotal evidence by aggregating data across +patient subgroups. Furthermore, we demonstrate how these XAI techniques can be +utilized for knowledge discovery, such as identifying subtypes of myocardial +infarction. We believe that these proposed methods can serve as building blocks +for a complementary assessment of the internal validity during a certification +process, as well as for knowledge discovery in the field of ECG analysis. + +
+
+
+
+
+ + ♻ ☆ Are There Exceptions to Goodhart's Law? On the Moral Justification of + Fairness-Aware Machine Learning + + +
+ Fairness-aware machine learning (fair-ml) techniques are algorithmic +interventions designed to ensure that individuals who are affected by the +predictions of a machine learning model are treated fairly. The problem is +often posed as an optimization problem, where the objective is to achieve high +predictive performance under a quantitative fairness constraint. However, any +attempt to design a fair-ml algorithm must assume a world where Goodhart's law +has an exception: when a fairness measure becomes an optimization constraint, +it does not cease to be a good measure. In this paper, we argue that fairness +measures are particularly sensitive to Goodhart's law. Our main contributions +are as follows. First, we present a framework for moral reasoning about the +justification of fairness metrics. In contrast to existing work, our framework +incorporates the belief that whether a distribution of outcomes is fair, +depends not only on the cause of inequalities but also on what moral claims +decision subjects have to receive a particular benefit or avoid a burden. We +use the framework to distil moral and empirical assumptions under which +particular fairness metrics correspond to a fair distribution of outcomes. +Second, we explore the extent to which employing fairness metrics as a +constraint in a fair-ml algorithm is morally justifiable, exemplified by the +fair-ml algorithm introduced by Hardt et al. (2016). We illustrate that +enforcing a fairness metric through a fair-ml algorithm often does not result +in the fair distribution of outcomes that motivated its use and can even harm +the individuals the intervention was intended to protect. + +
+
+
+
+
+ + ♻ ☆ Estimation of AMOC transition probabilities using a machine learning + based rare-event algorithm + + +
+ The Atlantic Meridional Overturning Circulation (AMOC) is an important +component of the global climate, known to be a tipping element, as it could +collapse under global warming. The main objective of this study is to compute +the probability that the AMOC collapses within a specified time window, using a +rare-event algorithm called Trajectory-Adaptive Multilevel Splitting (TAMS). +However, the efficiency and accuracy of TAMS depend on the choice of the score +function. Although the definition of the optimal score function, called the +"committor function", is known, it is impossible in general to compute it a +priori. Here, we combine TAMS with a Next-Generation Reservoir Computing +technique that estimates the committor function from the data generated by the +rare-event algorithm. We test this technique in a stochastic box model of the +AMOC for which two types of transition exist, the so-called F(ast)-transitions +and S(low)-transitions. Results for the F-transitions compare favorably with +those in the literature where a physically-informed score function was used. We +show that coupling a rare-event algorithm with machine learning allows for a +correct estimation of transition probabilities, transition times, and even +transition paths for a wide range of model parameters. We then extend these +results to the more difficult problem of S-transitions in the same model. In +both cases of F-transitions and S-transitions, we also show how the +Next-Generation Reservoir Computing technique can be interpreted to retrieve an +analytical estimate of the committor function. + +&#13;
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Towards Robust Cardiac Segmentation using Graph Convolutional Networks + + +
+ Fully automatic cardiac segmentation can be a fast and reproducible method to +extract clinical measurements from an echocardiography examination. The U-Net +architecture is the current state-of-the-art deep learning architecture for +medical segmentation and can segment cardiac structures in real-time with +average errors comparable to inter-observer variability. However, this +architecture still generates large outliers that are often anatomically +incorrect. This work uses the concept of graph convolutional neural networks +that predict the contour points of the structures of interest instead of +labeling each pixel. We propose a graph architecture that uses two +convolutional rings based on cardiac anatomy and show that this eliminates +anatomically incorrect multi-structure segmentations on the publicly available +CAMUS dataset. Additionally, this work contributes an ablation study on +the graph convolutional architecture and an evaluation of clinical measurements +on the clinical HUNT4 dataset. Finally, we propose to use the inter-model +agreement of the U-Net and the graph network as a predictor of both the input +and segmentation quality. We show this predictor can detect out-of-distribution +and unsuitable input images in real-time. Source code is available online: +https://github.com/gillesvntnu/GCN_multistructure + +&#13;
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ QOG:Question and Options Generation based on Language Model + + +
+ Question-Options Generation (QOG) is a task that involves generating a set of +question-options pairs given a context. This task has various applications, +including fine-tuning large models, information retrieval, and automated +multiple-choice question generation for education. In this paper, we develop +QOG models using three different methods based on fine-tuning +sequence-to-sequence language models (LMs). Experiments demonstrate that the +end-to-end QOG model is computationally efficient and stable during both +training and inference, outperforming other methods. Furthermore, our analysis +indicates that our QOG models are competitive on the QOG task compared to the +large language model Llama 3-8B. + +&#13;
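A minimal sketch of an end-to-end QOG setup, assuming a Hugging Face T5 base model and a simple serialization of the question and its options into one target string; the paper's actual base model, prompt format, and training recipe are not specified in the abstract.

```python
import torch
from transformers import T5TokenizerFast, T5ForConditionalGeneration

tokenizer = T5TokenizerFast.from_pretrained("t5-small")        # assumed base model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# End-to-end QOG: one pass maps a context to a serialized question + options string.
context = "The mitochondrion is the organelle that produces most of the cell's ATP."
target = ("question: Which organelle produces most of a cell's ATP? "
          "options: mitochondrion | nucleus | ribosome | golgi apparatus")

inputs = tokenizer("generate question and options: " + context,
                   return_tensors="pt", truncation=True)
labels = tokenizer(target, return_tensors="pt", truncation=True).input_ids

loss = model(**inputs, labels=labels).loss   # standard seq2seq cross-entropy
loss.backward()                              # one illustrative training step
print(float(loss))
```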
+
+
+
+
+ + ♻ ☆ Tracking Object Positions in Reinforcement Learning: A Metric for + Keypoint Detection (extended version) + + +
+ Reinforcement learning (RL) for robot control typically requires a detailed +representation of the environment state, including information about +task-relevant objects not directly measurable. Keypoint detectors, such as +spatial autoencoders (SAEs), are a common approach to extracting a +low-dimensional representation from high-dimensional image data. SAEs aim at +spatial features such as object positions, which are often useful +representations in robotic RL. However, whether an SAE is actually able to +track objects in the scene and thus yields a spatial state representation well +suited for RL tasks has rarely been examined due to a lack of established +metrics. In this paper, we propose to assess the performance of an SAE instance +by measuring how well keypoints track ground truth objects in images. We +present a computationally lightweight metric and use it to evaluate common +baseline SAE architectures on image data from a simulated robot task. We find +that common SAEs differ substantially in their spatial extraction capability. +Furthermore, we validate that SAEs that perform well in our metric achieve +superior performance when used in downstream RL. Thus, our metric is an +effective and lightweight indicator of RL performance before executing +expensive RL training. Building on these insights, we identify three key +modifications of SAE architectures to improve tracking performance. + +
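The paper's metric is not given in detail here, so the following is a plausible lightweight stand-in: for every frame, match each ground-truth object position to its nearest predicted keypoint and average the distances (lower is better).

```python
import numpy as np

def keypoint_tracking_error(keypoints, object_positions):
    """keypoints: (T, K, 2) predicted keypoints per frame (normalized image coords).
    object_positions: (T, M, 2) ground-truth object positions per frame.
    Returns the mean distance from each object to its nearest keypoint, averaged
    over frames -- an illustrative stand-in for the paper's tracking metric."""
    dists = np.linalg.norm(keypoints[:, None, :, :] - object_positions[:, :, None, :], axis=-1)
    nearest = dists.min(axis=2)          # (T, M): closest keypoint per object
    return float(nearest.mean())

T, K, M = 100, 16, 3
rng = np.random.default_rng(0)
kp = rng.uniform(0, 1, size=(T, K, 2))
obj = rng.uniform(0, 1, size=(T, M, 2))
print(keypoint_tracking_error(kp, obj))
```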
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ USP: A Unified Sequence Parallelism Approach for Long Context Generative + AI + + +
+ Sequence parallelism (SP), which divides the sequence dimension of input +tensors across multiple computational devices, is becoming key to unlocking the +long-context capabilities of generative AI models. This paper investigates the +state-of-the-art SP approaches, i.e. DeepSpeed-Ulysses and Ring-Attention, and +proposes a unified SP approach, which is more robust to transformer model +architectures and network hardware topology. This paper compares the +communication and memory cost of SP and existing parallelism, including +data/tensor/zero/pipeline parallelism, and discusses the best practices for +designing hybrid 4D parallelism involving SP. We achieved 47% MFU on two 8xA800 +nodes using SP for the LLAMA3-8B model training using sequence length 208K. Our +code is publicly available at +https://github.com/feifeibear/long-context-attention. + +
+
+
+
+
+ + ♻ ☆ Adversarial Search Engine Optimization for Large Language Models + + +
+ Large Language Models (LLMs) are increasingly used in applications where the +model selects from competing third-party content, such as in LLM-powered search +engines or chatbot plugins. In this paper, we introduce Preference Manipulation +Attacks, a new class of attacks that manipulate an LLM's selections to favor +the attacker. We demonstrate that carefully crafted website content or plugin +documentation can trick an LLM into promoting the attacker's products and discrediting +competitors, thereby increasing user traffic and monetization. We show this +leads to a prisoner's dilemma, where all parties are incentivized to launch +attacks, but the collective effect degrades the LLM's outputs for everyone. We +demonstrate our attacks on production LLM search engines (Bing and Perplexity) +and plugin APIs (for GPT-4 and Claude). As LLMs are increasingly used to rank +third-party content, we expect Preference Manipulation Attacks to emerge as a +significant threat. + +&#13;
+
+
+
+
+ + ♻ ☆ A Closer Look at Classification Evaluation Metrics and a Critical + Reflection of Common Evaluation Practice ACL + + +
+ Classification systems are evaluated in countless papers. +However, we find that evaluation practice is often nebulous. Frequently, +metrics are selected without arguments, and blurry terminology invites +misconceptions. For instance, many works use so-called 'macro' metrics to rank +systems (e.g., 'macro F1') but do not clearly specify what they would expect +from such a 'macro' metric. This is problematic, since picking a metric can +affect research findings, and thus any clarity in the process should be +maximized. + Starting from the intuitive concepts of bias and prevalence, we perform an +analysis of common evaluation metrics. The analysis helps us understand the +metrics' underlying properties, and how they align with the expectations +expressed in papers. Then we reflect on the practical situation in the field, +and survey evaluation practice in recent shared tasks. We find that metric +selection is often not supported with convincing arguments, an issue that can +make a system ranking seem arbitrary. Our work aims at providing an overview and +guidance for more informed and transparent metric selection, fostering +meaningful evaluation. + +&#13;
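A small numerical illustration of why the choice matters: on an imbalanced test set, a majority-class predictor and a rare-class-aware predictor can swap ranks depending on whether accuracy or macro F1 is reported. The systems and data below are synthetic.

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Imbalanced binary test set: 95% class 0, 5% class 1.
y_true = np.array([0] * 95 + [1] * 5)

# System A: always predicts the majority class.
y_a = np.zeros(100, dtype=int)
# System B: makes some majority-class mistakes but recovers most of the rare class.
y_b = y_true.copy()
y_b[:10] = 1          # 10 false positives on class 0
y_b[95:96] = 0        # 1 false negative on class 1

for name, y_pred in [("A (majority)", y_a), ("B (rare-class aware)", y_b)]:
    print(name,
          "accuracy =", accuracy_score(y_true, y_pred),
          "macro-F1 =", round(f1_score(y_true, y_pred, average="macro"), 3))
# A wins on accuracy, B wins on macro F1: the ranking depends on the metric choice.
```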
+
+ comment: appeared in TACL journal. MIT press publication available at + https://doi.org/10.1162/tacl_a_00675 +
+
+
+
+
+ + ♻ ☆ Koopman Spectrum Nonlinear Regulators and Efficient Online Learning + + +
+ Most modern reinforcement learning algorithms optimize a cumulative +single-step cost along a trajectory. The optimized motions are often +'unnatural', representing, for example, behaviors with sudden accelerations +that waste energy and lack predictability. In this work, we present a novel +paradigm of controlling nonlinear systems via the minimization of the Koopman +spectrum cost: a cost over the Koopman operator of the controlled dynamics. +This induces a broader class of dynamical behaviors that evolve over stable +manifolds such as nonlinear oscillators, closed loops, and smooth movements. We +demonstrate that some dynamics characterizations that are not possible with a +cumulative cost are feasible in this paradigm, which generalizes the classical +eigenstructure and pole assignments to nonlinear decision making. Moreover, we +present a sample efficient online learning algorithm for our problem that +enjoys a sub-linear regret bound under some structural assumptions. + +
+
+ comment: 41 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ Systematic Literature Review on Application of Learning-based Approaches + in Continuous Integration + + +
+ Context: Machine learning (ML) and deep learning (DL) analyze raw data to +extract valuable insights in specific phases. The rise of continuous practices +in software projects emphasizes automating Continuous Integration (CI) with +these learning-based methods, while the growing adoption of such approaches +underscores the need for systematizing knowledge. Objective: Our objective is +to comprehensively review and analyze existing literature concerning +learning-based methods within the CI domain. We endeavour to identify and +analyse various techniques documented in the literature, emphasizing the +fundamental attributes of training phases within learning-based solutions in +the context of CI. Method: We conducted a Systematic Literature Review (SLR) +involving 52 primary studies. Through statistical and thematic analyses, we +explored the correlations between CI tasks and the training phases of +learning-based methodologies across the selected studies, encompassing a +spectrum from data engineering techniques to evaluation metrics. Results: This +paper presents an analysis of the automation of CI tasks utilizing +learning-based methods. We identify and analyze nine types of data sources, +four steps in data preparation, four feature types, nine subsets of data +features, five approaches for hyperparameter selection and tuning, and fifteen +evaluation metrics. Furthermore, we discuss the latest techniques employed, +existing gaps in CI task automation, and the characteristics of the utilized +learning-based techniques. Conclusion: This study provides a comprehensive +overview of learning-based methods in CI, offering valuable insights for +researchers and practitioners developing CI task automation. It also highlights +the need for further research to advance these methods in CI. + +
+
+ comment: This paper has been accepted to be published in IEEE Access +
+
+
+
+
+ + ♻ ☆ Light-weight End-to-End Graph Interest Network for CTR Prediction in + E-commerce Search + + +
+ Click-through-rate (CTR) prediction has an essential impact on improving user +experience and revenue in e-commerce search. With the development of deep +learning, graph-based methods are well exploited to utilize graph structure +extracted from user behaviors and other information to help embedding learning. +However, most of the previous graph-based methods mainly focus on +recommendation scenarios, and therefore their graph structures highly depend on +item's sequential information from user behaviors, ignoring query's sequential +signal and query-item correlation. In this paper, we propose a new approach +named Light-weight End-to-End Graph Interest Network (EGIN) to effectively mine +users' search interests and tackle previous challenges. (i) EGIN utilizes query +and item's correlation and sequential information from the search system to +build a heterogeneous graph for better CTR prediction in e-commerce search. +(ii) EGIN's graph embedding learning shares the same training input and is +jointly trained with CTR prediction, making the end-to-end framework effortless +to deploy in large-scale search systems. The proposed EGIN is composed of three +parts: query-item heterogeneous graph, light-weight graph sampling, and +multi-interest network. The query-item heterogeneous graph captures correlation +and sequential information of query and item efficiently by the proposed +light-weight graph sampling. The multi-interest network is well designed to +utilize graph embedding to capture various similarity relationships between +query and item to enhance the final CTR prediction. We conduct extensive +experiments on both public and industrial datasets to demonstrate the +effectiveness of the proposed EGIN. At the same time, the training cost of +graph learning is relatively low compared with the main CTR prediction task, +ensuring efficiency in practical applications. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Natural Language Can Help Bridge the Sim2Real Gap + + +
+ The main challenge in learning image-conditioned robotic policies is +acquiring a visual representation conducive to low-level control. Due to the +high dimensionality of the image space, learning a good visual representation +requires a considerable amount of visual data. However, when learning in the +real world, data is expensive. Sim2Real is a promising paradigm for overcoming +data scarcity in the real-world target domain by using a simulator to collect +large amounts of cheap data closely related to the target task. However, it is +difficult to transfer an image-conditioned policy from sim to real when the +domains are very visually dissimilar. To bridge the sim2real visual gap, we +propose using natural language descriptions of images as a unifying signal +across domains that captures the underlying task-relevant semantics. Our key +insight is that if two image observations from different domains are labeled +with similar language, the policy should predict similar action distributions +for both images. We demonstrate that training the image encoder to predict the +language description or the distance between descriptions of a sim or real +image serves as a useful, data-efficient pretraining step that helps learn a +domain-invariant image representation. We can then use this image encoder as +the backbone of an IL policy trained simultaneously on a large amount of +simulated and a handful of real demonstrations. Our approach outperforms widely +used prior sim2real methods and strong vision-language pretraining baselines +like CLIP and R3M by 25 to 40%. See additional videos and materials at +https://robin-lab.cs.utexas.edu/lang4sim2real/. + +
+
+ comment: To appear in RSS 2024. Project website at + https://robin-lab.cs.utexas.edu/lang4sim2real/ +
+
+
+
+
+ + ♻ ☆ Large Skew-t Copula Models and Asymmetric Dependence in Intraday Equity + Returns + + +
+ Skew-t copula models are attractive for the modeling of financial data +because they allow for asymmetric and extreme tail dependence. We show that the +copula implicit in the skew-t distribution of Azzalini and Capitanio (2003) +allows for a higher level of pairwise asymmetric dependence than two popular +alternative skew-t copulas. Estimation of this copula in high dimensions is +challenging, and we propose a fast and accurate Bayesian variational inference +(VI) approach to do so. The method uses a generative representation of the +skew-t distribution to define an augmented posterior that can be approximated +accurately. A stochastic gradient ascent algorithm is used to solve the +variational optimization. The methodology is used to estimate skew-t factor +copula models with up to 15 factors for intraday returns from 2017 to 2021 on +93 U.S. equities. The copula captures substantial heterogeneity in asymmetric +dependence over equity pairs, in addition to the variability in pairwise +correlations. In a moving window study we show that the asymmetric dependencies +also vary over time, and that intraday predictive densities from the skew-t +copula are more accurate than those from benchmark copula models. Portfolio +selection strategies based on the estimated pairwise asymmetric dependencies +improve performance relative to the index. + +
+
+
+
+
+ + ♻ ☆ Estimating Noisy Class Posterior with Part-level Labels for Noisy Label + Learning CVPR 2024 + + +
+ In noisy label learning, estimating noisy class posteriors plays a +fundamental role for developing consistent classifiers, as it forms the basis +for estimating clean class posteriors and the transition matrix. Existing +methods typically learn noisy class posteriors by training a classification +model with noisy labels. However, when labels are incorrect, these models may +be misled to overemphasize the feature parts that do not reflect the instance +characteristics, resulting in significant errors in estimating noisy class +posteriors. To address this issue, this paper proposes to augment the +supervised information with part-level labels, encouraging the model to focus +on and integrate richer information from various parts. Specifically, our +method first partitions features into distinct parts by cropping instances, +yielding part-level labels associated with these various parts. Subsequently, +we introduce a novel single-to-multiple transition matrix to model the +relationship between the noisy and part-level labels, which incorporates +part-level labels into a classifier-consistent framework. Utilizing this +framework with part-level labels, we can learn the noisy class posteriors more +precisely by guiding the model to integrate information from various parts, +ultimately improving the classification performance. Our method is +theoretically sound, while experiments show that it is empirically effective in +synthetic and real-world noisy benchmarks. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Logarithmic regret bounds for continuous-time average-reward Markov + decision processes + + +
+ We consider reinforcement learning for continuous-time Markov decision +processes (MDPs) in the infinite-horizon, average-reward setting. In contrast +to discrete-time MDPs, a continuous-time process moves to a state and stays +there for a random holding time after an action is taken. With unknown +transition probabilities and rates of exponential holding times, we derive +instance-dependent regret lower bounds that are logarithmic in the time +horizon. Moreover, we design a learning algorithm and establish a finite-time +regret bound that achieves the logarithmic growth rate. Our analysis builds +upon upper confidence reinforcement learning, a delicate estimation of the mean +holding times, and stochastic comparison of point processes. + +
+
+
+
+
+ + ♻ ☆ DynaSemble: Dynamic Ensembling of Textual and Structure-Based Models for + Knowledge Graph Completion ACL 2024 + + +
+ We consider two popular approaches to Knowledge Graph Completion (KGC): +textual models that rely on textual entity descriptions, and structure-based +models that exploit the connectivity structure of the Knowledge Graph (KG). +Preliminary experiments show that these approaches have complementary +strengths: structure-based models perform exceptionally well when the gold +answer is easily reachable from the query head in the KG, while textual models +exploit descriptions to give good performance even when the gold answer is not +easily reachable. In response, we propose DynaSemble, a novel method for +learning query-dependent ensemble weights to combine these approaches by using +the distributions of scores assigned by the models in the ensemble to all +candidate entities. DynaSemble achieves state-of-the-art results on three +standard KGC datasets, with up to 6.8 pt MRR and 8.3 pt Hits@1 gains over the +best baseline model for the WN18RR dataset. + +
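The abstract describes query-dependent ensemble weights computed from the score distributions of the base models; the toy sketch below realizes that idea with simple summary statistics fed to a small MLP. The features, architecture, and training signal are assumptions, not DynaSemble's actual design.

```python
import torch
import torch.nn as nn

class ScoreEnsembler(nn.Module):
    """Toy query-dependent ensembler: summary statistics of each base model's score
    distribution over candidate entities -> softmax weights -> weighted combination."""
    def __init__(self, n_models=2, n_stats=3, hidden=16):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(n_models * n_stats, hidden),
                                 nn.ReLU(), nn.Linear(hidden, n_models))

    def forward(self, scores):               # scores: (batch, n_models, n_candidates)
        stats = torch.stack([scores.mean(-1), scores.std(-1), scores.max(-1).values], dim=-1)
        weights = torch.softmax(self.mlp(stats.flatten(1)), dim=-1)   # (batch, n_models)
        return (weights.unsqueeze(-1) * scores).sum(dim=1)            # (batch, n_candidates)

textual = torch.randn(4, 1, 1000)            # scores from a textual KGC model (stand-in)
structural = torch.randn(4, 1, 1000)         # scores from a structure-based model (stand-in)
combined = ScoreEnsembler()(torch.cat([textual, structural], dim=1))
print(combined.shape)                         # torch.Size([4, 1000])
```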
+
+ comment: 12 pages, 2 figures, 15 tables Accepted to ACL 2024 +
+
+
+
+
+ + ♻ ☆ Generative AI for Synthetic Data Across Multiple Medical Modalities: A + Systematic Review of Recent Developments and Challenges + + +
+ This paper presents a comprehensive systematic review of generative models +(GANs, VAEs, DMs, and LLMs) used to synthesize various medical data types, +including imaging (dermoscopic, mammographic, ultrasound, CT, MRI, and X-ray), +text, time-series, and tabular data (EHR). Unlike previous narrowly focused +reviews, our study encompasses a broad array of medical data modalities and +explores various generative models. Our search strategy queries databases such +as Scopus, PubMed, and ArXiv, focusing on recent works from January 2021 to +November 2023, excluding reviews and perspectives. This period emphasizes +recent advancements beyond GANs, which have been extensively covered +previously. + The survey reveals insights from three key aspects: (1) Synthesis +applications and purpose of synthesis, (2) generation techniques, and (3) +evaluation methods. It highlights clinically valid synthesis applications, +demonstrating the potential of synthetic data to tackle diverse clinical +requirements. While conditional models incorporating class labels, segmentation +masks and image translations are prevalent, there is a gap in utilizing prior +clinical knowledge and patient-specific context, suggesting a need for more +personalized synthesis approaches and emphasizing the importance of tailoring +generative approaches to the unique characteristics of medical data. +Additionally, there is a significant gap in using synthetic data beyond +augmentation, such as for validation and evaluation of downstream medical AI +models. The survey uncovers that the lack of standardized evaluation +methodologies tailored to medical images is a barrier to clinical application, +underscoring the need for in-depth evaluation approaches, benchmarking, and +comparative studies to promote openness and collaboration. + +
+
+
+
+
+ + ♻ ☆ Farsight: Fostering Responsible AI Awareness During AI Application + Prototyping + + +
+ Prompt-based interfaces for Large Language Models (LLMs) have made +prototyping and building AI-powered applications easier than ever before. +However, identifying potential harms that may arise from AI applications +remains a challenge, particularly during prompt-based prototyping. To address +this, we present Farsight, a novel in situ interactive tool that helps people +identify potential harms from the AI applications they are prototyping. Based +on a user's prompt, Farsight highlights news articles about relevant AI +incidents and allows users to explore and edit LLM-generated use cases, +stakeholders, and harms. We report design insights from a co-design study with +10 AI prototypers and findings from a user study with 42 AI prototypers. After +using Farsight, AI prototypers in our user study are better able to +independently identify potential harms associated with a prompt and find our +tool more useful and usable than existing resources. Their qualitative feedback +also highlights that Farsight encourages them to focus on end-users and think +beyond immediate harms. We discuss these findings and reflect on their +implications for designing AI prototyping experiences that meaningfully engage +with AI harms. Farsight is publicly accessible at: +https://PAIR-code.github.io/farsight. + +
+
+ comment: Accepted to CHI 2024 (Best Paper, Honorable Mention). 40 pages, 19 + figures, 5 tables. For a demo video, see https://youtu.be/BlSFbGkOlHk. For a + live demo, visit https://PAIR-code.github.io/farsight. The source code is + available at https://github.com/PAIR-code/farsight +
+
+
+
+
+ + ♻ ☆ Remote sensing framework for geological mapping via stacked autoencoders + and clustering + + +
+ Supervised machine learning methods for geological mapping via remote sensing +face limitations due to the scarcity of accurately labelled training data that +can be addressed by unsupervised learning, such as dimensionality reduction and +clustering. Dimensionality reduction methods have the potential to play a +crucial role in improving the accuracy of geological maps. Although +conventional dimensionality reduction methods may struggle with nonlinear data, +unsupervised deep learning models such as autoencoders can model non-linear +relationships. Stacked autoencoders feature multiple interconnected layers to +capture hierarchical data representations useful for remote sensing data. This +study presents an unsupervised machine learning-based framework for processing +remote sensing data using stacked autoencoders for dimensionality reduction and +k-means clustering for mapping geological units. We use Landsat 8, ASTER, and +Sentinel-2 datasets to evaluate the framework for geological mapping of the +Mutawintji region in Western New South Wales, Australia. We also compare +stacked autoencoders with principal component analysis and canonical +autoencoders. Our results reveal that the framework produces accurate and +interpretable geological maps, efficiently discriminating rock units. We find +that the accuracy of stacked autoencoders ranges from 86.6 % to 90 %, depending +on the remote sensing data type, which is superior to their counterparts. We +also find that the generated maps align with prior geological knowledge of the +study area while providing novel insights into geological structures. + +
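A minimal sketch of the two-stage pipeline: train an autoencoder for dimensionality reduction, then run k-means on the latent codes. For simplicity it trains a deep autoencoder jointly rather than layer-wise (as "stacked" training would), and the band count, layer sizes, and cluster count are placeholders rather than the paper's configuration.

```python
import torch
import torch.nn as nn
from sklearn.cluster import KMeans

# Placeholder "pixels x bands" matrix standing in for stacked Landsat/ASTER/Sentinel bands.
X = torch.rand(5000, 12)

encoder = nn.Sequential(nn.Linear(12, 32), nn.ReLU(),
                        nn.Linear(32, 16), nn.ReLU(),
                        nn.Linear(16, 3))                 # 3-D latent code
decoder = nn.Sequential(nn.Linear(3, 16), nn.ReLU(),
                        nn.Linear(16, 32), nn.ReLU(),
                        nn.Linear(32, 12))
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

for epoch in range(50):                                   # reconstruction training
    opt.zero_grad()
    loss = nn.functional.mse_loss(decoder(encoder(X)), X)
    loss.backward()
    opt.step()

# Cluster the latent codes into geological units (the cluster count is illustrative).
labels = KMeans(n_clusters=6, n_init=10, random_state=0).fit_predict(
    encoder(X).detach().numpy())
print(labels[:20])
```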
+
+
+
+
+ + ♻ ☆ A Fixed-Parameter Tractable Algorithm for Counting Markov Equivalence + Classes with the same Skeleton + + +
+ Causal DAGs (also known as Bayesian networks) are a popular tool for encoding +conditional dependencies between random variables. In a causal DAG, the random +variables are modeled as vertices in the DAG, and it is stipulated that every +random variable is independent of its ancestors conditioned on its parents. It +is possible, however, for two different causal DAGs on the same set of random +variables to encode exactly the same set of conditional dependencies. Such +causal DAGs are said to be Markov equivalent, and equivalence classes of Markov +equivalent DAGs are known as Markov Equivalent Classes (MECs). Beautiful +combinatorial characterizations of MECs have been developed in the past few +decades, and it is known, in particular that all DAGs in the same MEC must have +the same "skeleton" (underlying undirected graph) and v-structures (induced +subgraph of the form $a\rightarrow b \leftarrow c$). + These combinatorial characterizations also suggest several natural +algorithmic questions. One of these is: given an undirected graph $G$ as input, +how many distinct Markov equivalence classes have the skeleton $G$? Much work +has been devoted in the last few years to this and other closely related +problems. However, to the best of our knowledge, a polynomial time algorithm +for the problem remains unknown. + In this paper, we make progress towards this goal by giving a fixed parameter +tractable algorithm for the above problem, with the parameters being the +treewidth and the maximum degree of the input graph $G$. The main technical +ingredient in our work is a construction we refer to as shadow, which lets us +create a "local description" of long-range constraints imposed by the +combinatorial characterizations of MECs. + +
+
+ comment: 75 pages, 2 Figures +
+
+
+
+
+ + ♻ ☆ DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs + + +
+ Dynamic graphs are pervasive in the real world, modeling dynamic relations +between objects across various fields. For dynamic graph modeling, dynamic +graph neural networks (DGNNs) have emerged as a mainstream technique, which are +generally pre-trained on the link prediction task, leaving a significant gap +from the objectives of downstream tasks such as node classification. To bridge +the gap, prompt-based learning has gained traction on graphs. However, existing +efforts focus on static graphs, neglecting the evolution of dynamic graphs. In +this paper, we propose DyGPrompt, a novel pre-training and prompting framework +for dynamic graph modeling. First, we design dual prompts to address the gap in +both task objectives and dynamic variations across pre-training and downstream +tasks. Second, we recognize that node and time features mutually characterize +each other, and propose dual condition-nets to model the evolving node-time +patterns in downstream tasks. Finally, we thoroughly evaluate and analyze +DyGPrompt through extensive experiments on three public datasets. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ ADD 2022: the First Audio Deep Synthesis Detection Challenge ICASSP 2022 + + +
+ Audio deepfake detection is an emerging topic, which was included in the +ASVspoof 2021. However, the recent shared tasks have not covered many real-life +and challenging scenarios. The first Audio Deep synthesis Detection challenge +(ADD) was motivated to fill in the gap. The ADD 2022 includes three tracks: +low-quality fake audio detection (LF), partially fake audio detection (PF) and +audio fake game (FG). The LF track focuses on dealing with bona fide and fully +fake utterances with various real-world noises etc. The PF track aims to +distinguish the partially fake audio from the real. The FG track is a rivalry +game, which includes two tasks: an audio generation task and an audio fake +detection task. In this paper, we describe the datasets, evaluation metrics, +and protocols. We also report major findings that reflect the recent advances +in audio deepfake detection tasks. + +
+
+ comment: Accepted by ICASSP 2022 +
+
+
+
+
+ + ♻ ☆ Introducing a Physics-informed Deep Learning Framework for Bridge Scour + Prediction + + +
+ This paper introduces scour physics-informed neural networks (SPINNs), a +hybrid physics-data-driven framework for bridge scour prediction using deep +learning. SPINNs are developed based on historical scour monitoring data and +integrate physics-based empirical equations into neural networks as +supplementary loss components. We incorporated three architectures: LSTM, CNN, +and NLinear as the base data-driven model. Despite varying performance across +different base models and bridges, SPINNs overall outperformed pure data-driven +models. In some bridge cases, SPINN reduced forecasting errors by up to 50 +percent. In this study, we also explored general models for bridge clusters, +trained by aggregating datasets across multiple bridges in a region. The pure +data-driven models mostly benefited from this approach, in particular bridges +with limited data. However, bridge-specific SPINNs provided more accurate +predictions than general SPINNs for almost all case studies. Also, the +time-dependent empirical equations derived from SPINNs showed reasonable +accuracy in estimating maximum scour depth, providing more accurate predictions +compared to HEC-18. Comparing both SPINNs and pure deep learning models with +traditional HEC-18 equation indicates substantial improvements in scour +prediction accuracy. This study can pave the way for hybrid physics-machine +learning methodologies to be implemented for bridge scour design and +maintenance. + +
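A hedged sketch of the hybrid loss idea: the data misfit is augmented with a penalty for deviating from an empirical scour estimate. The base network, the weighting `lam`, and the use of a precomputed empirical prediction (rather than evaluating a HEC-18-style formula inside the loss) are assumptions for illustration.

```python
import torch
import torch.nn as nn

def spinn_loss(model, features, scour_obs, empirical_pred, lam=0.1):
    """Hybrid loss: data misfit + penalty for deviating from an empirical scour
    estimate computed offline. `lam` and the precomputed estimate are assumptions."""
    pred = model(features).squeeze(-1)
    data_loss = nn.functional.mse_loss(pred, scour_obs)
    physics_loss = nn.functional.mse_loss(pred, empirical_pred)
    return data_loss + lam * physics_loss

model = nn.Sequential(nn.Linear(6, 64), nn.ReLU(), nn.Linear(64, 1))  # stand-in for LSTM/CNN/NLinear
features = torch.randn(128, 6)          # flow depth, velocity, pier width, ... (placeholders)
scour_obs = torch.rand(128)             # monitored scour depth (placeholder)
empirical_pred = torch.rand(128)        # empirical-equation estimate for the same inputs

loss = spinn_loss(model, features, scour_obs, empirical_pred)
loss.backward()
print(float(loss))
```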
+
+
+
+
+ + ♻ ☆ Swish-T : Enhancing Swish Activation with Tanh Bias for Improved Neural + Network Performance + + +
+ We propose the Swish-T family, an enhancement of the existing non-monotonic +activation function Swish. Swish-T is defined by adding a Tanh bias to the +original Swish function. This modification creates a family of Swish-T +variants, each designed to excel in different tasks, showcasing specific +advantages depending on the application context. The Tanh bias allows for +broader acceptance of negative values during initial training stages, offering +a smoother non-monotonic curve than the original Swish. We ultimately propose +the Swish-T$_{\textbf{C}}$ function, while Swish-T and Swish-T$_{\textbf{B}}$, +byproducts of Swish-T$_{\textbf{C}}$, also demonstrate satisfactory +performance. Furthermore, our ablation study shows that using +Swish-T$_{\textbf{C}}$ as a non-parametric function can still achieve high +performance. The superiority of the Swish-T family has been empirically +demonstrated across various models and benchmark datasets, including MNIST, +Fashion MNIST, SVHN, CIFAR-10, and CIFAR-100. The code is publicly available at +"https://github.com/ictseoyoungmin/Swish-T-pytorch". + +
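The abstract only states that a Tanh bias is added to Swish, so the exact parameterization of Swish-T and its B/C variants is assumed below; the authors' repository contains the reference implementation. The sketch uses f(x) = x * sigmoid(beta * x) + alpha * tanh(x) as one plausible form.

```python
import torch
import torch.nn as nn

class SwishT(nn.Module):
    """Assumed form of a Swish-T-style activation: Swish plus a tanh bias,
    f(x) = x * sigmoid(beta * x) + alpha * tanh(x). The paper's exact
    parameterization of Swish-T / Swish-T_B / Swish-T_C may differ."""
    def __init__(self, beta=1.0, alpha=0.1, learnable=False):
        super().__init__()
        beta = torch.tensor(float(beta))
        alpha = torch.tensor(float(alpha))
        if learnable:
            self.beta, self.alpha = nn.Parameter(beta), nn.Parameter(alpha)
        else:
            self.register_buffer("beta", beta)
            self.register_buffer("alpha", alpha)

    def forward(self, x):
        return x * torch.sigmoid(self.beta * x) + self.alpha * torch.tanh(x)

act = SwishT()
print(act(torch.linspace(-3, 3, 7)))
```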
+
+ comment: 11 pages, 6 figures. Revised the derivative of the sigmoid function + from 1-sigmoid to sigmoid(1-sigmoid) for correctness; updated the related + equations in Section 3.2; renamed "Conclusions" to "Conclusion" in Section 6 +&#13;
+
+
+
+
+ + ♻ ☆ Towards Unsupervised Question Answering System with Multi-level + Summarization for Legal Text + + +
+ This paper summarizes Team SCaLAR's work on SemEval-2024 Task 5: Legal +Argument Reasoning in Civil Procedure. To address this Binary Classification +task, which was daunting due to the complexity of the Legal Texts involved, we +propose a simple yet novel similarity and distance-based unsupervised approach +to generate labels. Further, we explore the Multi-level fusion of Legal-Bert +embeddings using ensemble features, including CNN, GRU, and LSTM. To address +the lengthy nature of Legal explanation in the dataset, we introduce T5-based +segment-wise summarization, which successfully retained crucial information, +enhancing the model's performance. Our unsupervised system witnessed a 20-point +increase in macro F1-score on the development set and a 10-point increase on +the test set, which is promising given its uncomplicated architecture. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Metric-guided Image Reconstruction Bounds via Conformal Prediction + + +
+ Recent advancements in machine learning have led to the development of novel +medical imaging systems and algorithms that address ill-posed problems. +Assessing their trustworthiness and understanding how to deploy them safely at +test time remains an important and open problem. In this work, we propose using +conformal prediction to compute valid and distribution-free bounds on +downstream metrics given reconstructions generated by one algorithm, and +retrieve upper/lower bounds and inlier/outlier reconstructions according to the +adjusted bounds. Our work offers 1) test time image reconstruction evaluation +without ground truth, 2) downstream performance guarantees, 3) meaningful +upper/lower bound reconstructions, and 4) meaningful statistical +inlier/outlier reconstructions. We demonstrate our method on post-mastectomy +radiotherapy planning using 3D breast CT reconstructions, and show 1) that +metric-guided bounds have valid coverage for downstream metrics while +conventional pixel-wise bounds do not and 2) anatomical differences of +upper/lower bounds between metric-guided and pixel-wise methods. Our work paves +the way for more meaningful and trustworthy test-time evaluation of medical image +reconstructions. Code is available at +https://github.com/matthewyccheung/conformal-metric + +&#13;
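A minimal split-conformal sketch of the idea: compute a downstream metric on a calibration set with ground truth, then use its finite-sample-adjusted quantile as a distribution-free upper bound for new reconstructions. The metric, coverage level, and outlier rule are illustrative; the paper's construction may differ.

```python
import numpy as np

def conformal_metric_bound(cal_metric_values, alpha=0.1):
    """Split-conformal upper bound on a downstream metric (higher = worse).
    With exchangeable data, a new reconstruction's metric stays below this bound
    with probability at least 1 - alpha. Illustrative sketch only."""
    scores = np.sort(np.asarray(cal_metric_values))
    n = len(scores)
    k = min(int(np.ceil((n + 1) * (1 - alpha))), n)   # finite-sample-adjusted rank
    return float(scores[k - 1])

rng = np.random.default_rng(0)
cal_metric_values = rng.gamma(shape=2.0, scale=1.5, size=500)   # toy calibration metrics
bound = conformal_metric_bound(cal_metric_values, alpha=0.1)
print("90% upper bound on the downstream metric:", round(bound, 3))
# At test time, reconstructions whose metric proxy exceeds the bound can be flagged as outliers.
```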
+
+
+
+
+ + ♻ ☆ A Curious Case of Searching for the Correlation between Training Data + and Adversarial Robustness of Transformer Textual Models ACL + + +
+ Existing works have shown that fine-tuned textual transformer models achieve +state-of-the-art prediction performances but are also vulnerable to adversarial +text perturbations. Traditional adversarial evaluation is often done +\textit{only after} fine-tuning the models and ignoring the training data. In +this paper, we aim to show that there is also a strong correlation between +training data and model robustness. To this end, we extract 13 different +features representing a wide range of input fine-tuning corpora properties and +use them to predict the adversarial robustness of the fine-tuned models. +Focusing mostly on encoder-only transformer models BERT and RoBERTa with +additional results for BART, ELECTRA, and GPT2, we provide diverse evidence to +support our argument. First, empirical analyses show that (a) extracted +features can be used with a lightweight classifier such as Random Forest to +predict the attack success rate effectively, and (b) features with the most +influence on the model robustness have a clear correlation with the robustness. +Second, our framework can be used as a fast and effective additional tool for +robustness evaluation since it (a) saves 30x-193x runtime compared to the +traditional technique, (b) is transferable across models, (c) can be used under +adversarial training, and (d) is robust to statistical randomness. Our code is +publicly available at \url{https://github.com/CaptainCuong/RobustText_ACL2024}. + +&#13;
+
+ comment: Accepted to ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Towards Universal Mesh Movement Networks + + +
+ Solving complex Partial Differential Equations (PDEs) accurately and +efficiently is an essential and challenging problem in all scientific and +engineering disciplines. Mesh movement methods provide the capability to +improve the accuracy of the numerical solution without increasing the overall +mesh degree of freedom count. Conventional sophisticated mesh movement methods +are extremely expensive and struggle to handle scenarios with complex boundary +geometries. However, existing learning-based methods require re-training from +scratch given a different PDE type or boundary geometry, which limits their +applicability, and also often suffer from robustness issues in the form of +inverted elements. In this paper, we introduce the Universal Mesh Movement +Network (UM2N), which -- once trained -- can be applied in a non-intrusive, +zero-shot manner to move meshes with different size distributions and +structures, for solvers applicable to different PDE types and boundary +geometries. UM2N consists of a Graph Transformer (GT) encoder for extracting +features and a Graph Attention Network (GAT) based decoder for moving the mesh. +We evaluate our method on advection and Navier-Stokes based examples, as well +as a real-world tsunami simulation case. Our method outperforms existing +learning-based mesh movement methods in terms of the benchmarks described +above. In comparison to the conventional sophisticated Monge-Amp\`ere +PDE-solver based method, our approach not only significantly accelerates mesh +movement, but also proves effective in scenarios where the conventional method +fails. Our project page is at https://erizmr.github.io/UM2N/. + +
+
+
+
+
+ + ♻ ☆ Divide And Conquer: Learning Chaotic Dynamical Systems With Multistep + Penalty Neural Ordinary Differential Equations + + +
+ Forecasting high-dimensional dynamical systems is a fundamental challenge in +various fields, such as the geosciences and engineering. Neural Ordinary +Differential Equations (NODEs), which combine the power of neural networks and +numerical solvers, have emerged as a promising algorithm for forecasting +complex nonlinear dynamical systems. However, classical techniques used for +NODE training are ineffective for learning chaotic dynamical systems. In this +work, we propose a novel NODE-training approach that allows for robust learning +of chaotic dynamical systems. Our method addresses the challenges of +non-convexity and exploding gradients associated with underlying chaotic +dynamics. Training data trajectories from such systems are split into multiple, +non-overlapping time windows. In addition to the deviation from the training +data, the optimization loss term further penalizes the discontinuities of the +predicted trajectory between the time windows. The window size is selected +based on the fastest Lyapunov time scale of the system. The multistep penalty (MP) +method is first demonstrated on the Lorenz equations to illustrate how it improves +the loss landscape and thereby accelerates the optimization convergence. The MP +method can optimize chaotic systems in a manner similar to least-squares +shadowing with significantly lower computational costs. Our proposed algorithm, +denoted the Multistep Penalty NODE (MP-NODE), is applied to chaotic systems such +as the Kuramoto-Sivashinsky equation and the two-dimensional Kolmogorov flow. +It is observed that MP-NODE provides viable performance for such chaotic +systems, not only for short-term trajectory predictions but also for invariant +statistics that are hallmarks of the chaotic nature of these dynamics. + +&#13;
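A rough sketch of the multistep penalty idea: split the training trajectory into non-overlapping windows, roll the model out within each window, and penalize both the data misfit and the discontinuity at window boundaries. The explicit-Euler rollout, window size, and penalty weight are placeholders, not the paper's exact scheme.

```python
import torch
import torch.nn as nn

def rollout(f, x0, n_steps, dt=0.01):
    """Simple explicit-Euler rollout of dx/dt = f(x); a stand-in for an ODE solver."""
    xs = [x0]
    for _ in range(n_steps):
        xs.append(xs[-1] + dt * f(xs[-1]))
    return torch.stack(xs, dim=0)

def multistep_penalty_loss(f, traj, window=20, mu=1.0):
    """Split `traj` (T, d) into windows; each window is re-initialized from the data,
    integrated forward, and penalized for (i) misfit to the data within the window and
    (ii) discontinuity at the start of the next window. Illustrative sketch only."""
    data_loss, disc_loss = 0.0, 0.0
    for start in range(0, traj.shape[0] - window, window):
        pred = rollout(f, traj[start], window)
        data_loss = data_loss + nn.functional.mse_loss(pred, traj[start:start + window + 1])
        disc_loss = disc_loss + ((pred[-1] - traj[start + window]) ** 2).mean()
    return data_loss + mu * disc_loss

f = nn.Sequential(nn.Linear(3, 64), nn.Tanh(), nn.Linear(64, 3))   # learned vector field
traj = torch.randn(200, 3)                                          # stand-in training trajectory
loss = multistep_penalty_loss(f, traj)
loss.backward()
print(float(loss))
```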
+
+ comment: 20 pages, 10 Figures, submitted to Journal of Computational Physics +
+
+
+
+
+ + ♻ ☆ Evidential Uncertainty Sets in Deep Classifiers Using Conformal + Prediction + + +
+ In this paper, we propose Evidential Conformal Prediction (ECP) method for +image classifiers to generate the conformal prediction sets. Our method is +designed based on a non-conformity score function that has its roots in +Evidential Deep Learning (EDL) as a method of quantifying model (epistemic) +uncertainty in DNN classifiers. We use evidence that are derived from the logit +values of target labels to compute the components of our non-conformity score +function: the heuristic notion of uncertainty in CP, uncertainty surprisal, and +expected utility. Our extensive experimental evaluation demonstrates that ECP +outperforms three state-of-the-art methods for generating CP sets, in terms of +their set sizes and adaptivity while maintaining the coverage of true labels. + +
+
+ comment: Accepted in 13th Symposium on Conformal and Probabilistic Prediction + with Applications (COPA2024). To be published in the Proceedings of Machine + Learning Research (PMLR), vol. 230, 2024 (24 Pages) +
+
+
+
+
+ + ♻ ☆ Rethinking Machine Unlearning for Large Language Models + + +
+ We explore machine unlearning (MU) in the domain of large language models +(LLMs), referred to as LLM unlearning. This initiative aims to eliminate +undesirable data influence (e.g., sensitive or illegal information) and the +associated model capabilities, while maintaining the integrity of essential +knowledge generation and not affecting causally unrelated information. We +envision LLM unlearning becoming a pivotal element in the life-cycle management +of LLMs, potentially standing as an essential foundation for developing +generative AI that is not only safe, secure, and trustworthy, but also +resource-efficient without the need of full retraining. We navigate the +unlearning landscape in LLMs from conceptual formulation, methodologies, +metrics, and applications. In particular, we highlight the often-overlooked +aspects of existing LLM unlearning research, e.g., unlearning scope, data-model +interaction, and multifaceted efficacy assessment. We also draw connections +between LLM unlearning and related areas such as model editing, influence +functions, model explanation, adversarial training, and reinforcement learning. +Furthermore, we outline an effective assessment framework for LLM unlearning +and explore its applications in copyright and privacy safeguards and +sociotechnical harm reduction. + +
+
+
+
+
+ + ♻ ☆ Fair Resource Allocation in Multi-Task Learning + + +
+ By jointly learning multiple tasks, multi-task learning (MTL) can leverage +the shared knowledge across tasks, resulting in improved data efficiency and +generalization performance. However, a major challenge in MTL lies in the +presence of conflicting gradients, which can hinder the fair optimization of +some tasks and subsequently impede MTL's ability to achieve better overall +performance. Inspired by fair resource allocation in communication networks, we +formulate the optimization of MTL as a utility maximization problem, where the +loss decreases across tasks are maximized under different fairness +measurements. To solve this problem, we propose FairGrad, a novel MTL +optimization method. FairGrad not only enables flexible emphasis on certain +tasks but also achieves a theoretical convergence guarantee. Extensive +experiments demonstrate that our method can achieve state-of-the-art +performance among gradient manipulation methods on a suite of multi-task +benchmarks in supervised learning and reinforcement learning. Furthermore, we +incorporate the idea of $\alpha$-fairness into loss functions of various MTL +methods. Extensive empirical studies demonstrate that their performance can be +significantly enhanced. Code is provided at +\url{https://github.com/OptMN-Lab/fairgrad}. + +
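+ For reference, the alpha-fair utility from network resource allocation that the abstract alludes to can be written down directly; this sketch shows only the aggregation of per-task improvements, not FairGrad's gradient-manipulation update, and the numbers are illustrative.
+
+    import numpy as np
+
+    def alpha_fair_utility(improvements, alpha=1.0, eps=1e-8):
+        # improvements: per-task loss decreases; alpha controls the fairness notion:
+        # alpha = 0 -> utilitarian sum, alpha = 1 -> proportional fairness (sum of logs),
+        # alpha -> infinity approaches max-min fairness.
+        u = np.maximum(np.asarray(improvements, dtype=float), eps)
+        if np.isclose(alpha, 1.0):
+            return float(np.sum(np.log(u)))
+        return float(np.sum(u ** (1.0 - alpha) / (1.0 - alpha)))
+
+    print(alpha_fair_utility([0.3, 0.1, 0.05], alpha=0.0))   # favors total improvement
+    print(alpha_fair_utility([0.3, 0.1, 0.05], alpha=2.0))   # penalizes neglected tasks more
+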
+
+
+
+
+ + ♻ ☆ Differentially Private Graph Diffusion with Applications in Personalized + PageRanks + + +
+ Graph diffusion, which iteratively propagates real-valued substances across +the graph, is used in numerous graph/network-involved applications. However, +releasing diffusion vectors may reveal sensitive linking information in the +data, such as transaction information in financial network data. Moreover, +protecting the privacy of graph data is challenging due to its interconnected +nature. This work proposes a novel graph diffusion framework with edge-level +differential privacy guarantees by using noisy diffusion iterates. The +algorithm injects Laplace noise per diffusion iteration and adopts a +degree-based thresholding function to mitigate the high sensitivity induced by +low-degree nodes. Our privacy loss analysis is based on Privacy Amplification +by Iteration (PABI), which, to the best of our knowledge, is the first effort that +analyzes PABI with Laplace noise and provides relevant applications. We also +introduce a novel Infinity-Wasserstein distance tracking method, which tightens +the analysis of privacy leakage and makes PABI more applicable in practice. We +evaluate this framework by applying it to Personalized PageRank computation for +ranking tasks. Experiments on real-world network data demonstrate the +superiority of our method under stringent privacy conditions. + +
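+ A rough sketch of the mechanism described above follows; the noise scale, degree threshold, and iteration count are placeholders not calibrated to any privacy budget, and the dense-matrix implementation is for illustration only.
+
+    import numpy as np
+
+    def noisy_personalized_pagerank(A, seed, iters=10, alpha=0.15,
+                                    noise_scale=0.01, min_degree=5, rng=None):
+        # A: (n, n) adjacency matrix; seed: personalization node. Low-degree nodes are
+        # dropped from the propagation as a stand-in for degree-based thresholding;
+        # Laplace noise is injected at every diffusion iterate.
+        rng = rng or np.random.default_rng(0)
+        n = A.shape[0]
+        deg = A.sum(axis=1)
+        keep = (deg >= min_degree).astype(float)
+        P = A / np.maximum(deg[:, None], 1.0)          # row-stochastic transition matrix
+        e = np.zeros(n); e[seed] = 1.0
+        x = e.copy()
+        for _ in range(iters):
+            x = (1 - alpha) * (P.T @ (x * keep)) + alpha * e
+            x = x + rng.laplace(scale=noise_scale, size=n)    # per-iteration Laplace noise
+            x = np.clip(x, 0.0, None)
+        return x / max(x.sum(), 1e-12)
+
+    A = (np.random.default_rng(1).uniform(size=(50, 50)) < 0.1).astype(float)
+    print(noisy_personalized_pagerank(A, seed=0)[:5])
+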
+
+
+
+
+ + ♻ ☆ AutoRT: Embodied Foundation Models for Large Scale Orchestration of + Robotic Agents ICRA 2024 + + +
+ Foundation models that incorporate language, vision, and more recently +actions have revolutionized the ability to harness internet scale data to +reason about useful tasks. However, one of the key challenges of training +embodied foundation models is the lack of data grounded in the physical world. +In this paper, we propose AutoRT, a system that leverages existing foundation +models to scale up the deployment of operational robots in completely unseen +scenarios with minimal human supervision. AutoRT leverages vision-language +models (VLMs) for scene understanding and grounding, and further uses large +language models (LLMs) for proposing diverse and novel instructions to be +performed by a fleet of robots. Guiding data collection by tapping into the +knowledge of foundation models enables AutoRT to effectively reason about +autonomy tradeoffs and safety while significantly scaling up data collection +for robot learning. We demonstrate AutoRT proposing instructions to over 20 +robots across multiple buildings and collecting 77k real robot episodes via +both teleoperation and autonomous robot policies. We experimentally show that +such "in-the-wild" data collected by AutoRT is significantly more diverse, and +that AutoRT's use of LLMs allows for instruction following data collection +robots that can align to human preferences. + +
+
+ comment: 26 pages, 9 figures, ICRA 2024 VLMNM Workshop +
+
+
+
+
+ + ♻ ☆ FLea: Addressing Data Scarcity and Label Skew in Federated Learning via + Privacy-preserving Feature Augmentation KDD'24 + + +
+ Federated Learning (FL) enables model development by leveraging data +distributed across numerous edge devices without transferring local data to a +central server. However, existing FL methods still face challenges when dealing +with scarce and label-skewed data across devices, resulting in local model +overfitting and drift, consequently hindering the performance of the global +model. In response to these challenges, we propose a pioneering framework +called \textit{FLea}, incorporating the following key components: \textit{i)} A +global feature buffer that stores activation-target pairs shared from multiple +clients to support local training. This design mitigates local model drift +caused by the absence of certain classes; \textit{ii)} A feature augmentation +approach based on local and global activation mix-ups for local training. This +strategy enlarges the training samples, thereby reducing the risk of local +overfitting; \textit{iii)} An obfuscation method to minimize the correlation +between intermediate activations and the source data, enhancing the privacy of +shared features. To verify the superiority of \textit{FLea}, we conduct +extensive experiments using a wide range of data modalities, simulating +different levels of local data scarcity and label skew. The results demonstrate +that \textit{FLea} consistently outperforms state-of-the-art FL counterparts +(in 13 of the 18 experimented settings, the improvement is over $5\%$) while +concurrently mitigating the privacy vulnerabilities associated with shared +features. Code is available at https://github.com/XTxiatong/FLea.git + +
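+ A minimal sketch of component ii), the local-global activation mix-up, is given below; the buffer management and obfuscation steps are omitted and all names are placeholders.
+
+    import numpy as np
+
+    def mix_local_global(local_x, local_y, buffer_x, buffer_y, num_classes,
+                         beta=2.0, rng=None):
+        # Mix each local activation with one drawn from the shared global buffer,
+        # producing mixed features and soft label targets for local training.
+        rng = rng or np.random.default_rng(0)
+        idx = rng.integers(0, len(buffer_x), size=len(local_x))
+        lam = rng.beta(beta, beta, size=(len(local_x), 1))
+        one_hot = np.eye(num_classes)
+        mixed_x = lam * local_x + (1 - lam) * buffer_x[idx]
+        mixed_y = lam * one_hot[local_y] + (1 - lam) * one_hot[buffer_y[idx]]
+        return mixed_x, mixed_y
+
+    rng = np.random.default_rng(0)
+    lx, ly = rng.normal(size=(8, 16)), rng.integers(0, 5, size=8)
+    bx, by = rng.normal(size=(100, 16)), rng.integers(0, 5, size=100)
+    mx, my = mix_local_global(lx, ly, bx, by, num_classes=5)
+    print(mx.shape, my.shape)   # (8, 16) (8, 5)
+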
+
+ comment: This paper has been accepted by KDD'24 +
+
+
+
+
+ + ♻ ☆ Knowledge Transfer from Vision Foundation Models for Efficient Training + of Small Task-specific Models + + +
+ Vision Foundation Models (VFMs) pretrained on massive datasets exhibit +impressive performance on various downstream tasks, especially with limited +labeled target data. However, due to their high inference compute cost, these +models cannot be deployed for many real-world applications. Motivated by this, +we ask the following important question, "How can we leverage the knowledge +from a large VFM to train a small task-specific model for a new target task +with limited labeled training data?", and propose a simple task-oriented +knowledge transfer approach as a highly effective solution to this problem. Our +experimental results on five target tasks show that the proposed approach +outperforms task-agnostic VFM distillation, web-scale CLIP pretraining, +supervised ImageNet pretraining, and self-supervised DINO pretraining by up to +11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed +approach also demonstrates up to 9x, 4x and 15x reduction in pretraining +compute cost when compared to task-agnostic VFM distillation, ImageNet +pretraining and DINO pretraining, respectively, while outperforming them. We +also show that the dataset used for transferring knowledge has a significant +effect on the final target task performance, and introduce a +retrieval-augmented knowledge transfer strategy that uses web-scale image +retrieval to curate effective transfer sets. + +
+
+ comment: International Conference on Machine Learning, 2024 +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Volume Tracking Based Reference Mesh Extraction for Time-Varying Mesh + Compression + + +
+ Time-varying meshes (TVMs), characterized by their varying connectivity and +number of vertices, hold significant potential in immersive media and various +other applications. However, their practical utilization is challenging due +to their time-varying features and large file sizes. Creating a reference mesh +that contains the most essential information is a promising approach to +utilizing shared information within TVMs to reduce storage and transmission +costs. We propose a novel method that employs volume tracking to extract +reference meshes. First, we adopt as-rigid-as-possible (ARAP) volume tracking +on TVMs to get the volume centers for each mesh. Then, we use multidimensional +scaling (MDS) to get reference centers that ensure the reference mesh avoids +self-contact regions. Finally, we map the vertices of the meshes to reference +centers and extract the reference mesh. Our approach offers a feasible solution +for extracting reference meshes that can serve multiple purposes such as +establishing surface correspondence, deforming the reference mesh to different +shapes for I-frame based mesh compression, or defining the global shape of the +TVMs. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Video Watermarking: Safeguarding Your Video from (Unauthorized) + Annotations by Video-based LLMs + + +
+ The advent of video-based Large Language Models (LLMs) has significantly +enhanced video understanding. However, it has also raised some safety concerns +regarding data protection, as videos can be more easily annotated, even without +authorization. This paper introduces Video Watermarking, a novel technique to +protect videos from unauthorized annotations by such video-based LLMs, +especially concerning the video content and description, in response to +specific queries. By imperceptibly embedding watermarks into key video frames +with multi-modal flow-based losses, our method preserves the viewing experience +while preventing misuse by video-based LLMs. Extensive experiments show that +Video Watermarking significantly reduces the comprehensibility of videos with +various video-based LLMs, demonstrating both stealth and robustness. In +essence, our method provides a solution for securing video content, ensuring +its integrity and confidentiality in the face of evolving video-based LLM +technologies. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.13507 +
+
+
+
+
+ + ☆ SafaRi: Adaptive Sequence Transformer for Weakly Supervised Referring + Expression Segmentation ECCV 2024 + + +
+ Referring Expression Segmentation (RES) aims to provide a segmentation mask +of the target object in an image referred to by the text (i.e., referring +expression). Existing methods require large-scale mask annotations. Moreover, +such approaches do not generalize well to unseen/zero-shot scenarios. To +address the aforementioned issues, we propose a weakly-supervised bootstrapping +architecture for RES with several new algorithmic innovations. To the best of +our knowledge, ours is the first approach that considers only a fraction of +both mask and box annotations (shown in Figure 1 and Table 1) for training. To +enable principled training of models in such low-annotation settings, improve +image-text region-level alignment, and further enhance spatial localization of +the target object in the image, we propose Cross-modal Fusion with Attention +Consistency module. For automatic pseudo-labeling of unlabeled samples, we +introduce a novel Mask Validity Filtering routine based on a spatially aware +zero-shot proposal scoring approach. Extensive experiments show that with just +30% annotations, our model SafaRi achieves 59.31 and 48.26 mIoUs as compared to +58.93 and 48.19 mIoUs obtained by the fully-supervised SOTA method SeqTR +respectively on RefCOCO+@testA and RefCOCO+testB datasets. SafaRi also +outperforms SeqTR by 11.7% (on RefCOCO+testA) and 19.6% (on RefCOCO+testB) in a +fully-supervised setting and demonstrates strong generalization capabilities in +unseen/zero-shot tasks. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ Joint-Dataset Learning and Cross-Consistent Regularization for + Text-to-Motion Retrieval + + +
+ Pose-estimation methods enable extracting human motion from common videos in +the structured form of 3D skeleton sequences. Despite great application +opportunities, effective content-based access to such spatio-temporal motion +data is a challenging problem. In this paper, we focus on the recently +introduced text-motion retrieval tasks, which aim to search for database +motions that are the most relevant to a specified natural-language textual +description (text-to-motion) and vice-versa (motion-to-text). Despite recent +efforts to explore these promising avenues, a primary challenge remains the +insufficient data available to train robust text-motion models effectively. To +address this issue, we propose to investigate joint-dataset learning - where we +train on multiple text-motion datasets simultaneously - together with the +introduction of a Cross-Consistent Contrastive Loss function (CCCL), which +regularizes the learned text-motion common space by imposing uni-modal +constraints that augment the representation ability of the trained network. To +learn a proper motion representation, we also introduce a transformer-based +motion encoder, called MoT++, which employs spatio-temporal attention to +process sequences of skeleton data. We demonstrate the benefits of the proposed +approaches on the widely-used KIT Motion-Language and HumanML3D datasets. We +perform detailed experimentation on joint-dataset learning and cross-dataset +scenarios, showing the effectiveness of each introduced module in a carefully +conducted ablation study and, in turn, pointing out the limitations of +state-of-the-art methods. + +
+
+
+
+
+ + ☆ ScaleDreamer: Scalable Text-to-3D Synthesis with Asynchronous Score + Distillation ECCV 2024 + + +
+ By leveraging text-to-image diffusion priors, score distillation can +synthesize 3D content without paired text-3D training data. Instead of +spending hours of online optimization per text prompt, recent studies have +focused on learning a text-to-3D generative network that amortizes multiple +text-3D relations and can synthesize 3D content in seconds. However, +existing score distillation methods are hard to scale up to a large number of +text prompts due to the difficulty of aligning the pretrained diffusion prior +with the distribution of rendered images from various text prompts. Current +state-of-the-art methods such as Variational Score Distillation finetune the +pretrained diffusion model to minimize the noise prediction error and thereby +align the distributions, but this is unstable to train and impairs +the model's comprehension of numerous text prompts. Based on the +observation that diffusion models tend to have lower noise prediction +errors at earlier timesteps, we propose Asynchronous Score Distillation (ASD), +which minimizes the noise prediction error by shifting the diffusion timestep +to earlier ones. ASD is stable to train and can scale up to 100k prompts. It +reduces the noise prediction error without changing the weights of the pre-trained +diffusion model, thus keeping its strong comprehension of prompts. +We conduct extensive experiments across different 2D diffusion models, +including Stable Diffusion and MVDream, and text-to-3D generators, including +Hyper-iNGP, 3DConv-Net and Triplane-Transformer. The results demonstrate ASD's +effectiveness in stable 3D generator training, high-quality 3D content +synthesis, and its superior prompt-consistency, especially with a large prompt +corpus. + +
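+ One plausible reading of the timestep shift is sketched below; the exact formulation (direction and schedule of the shift, loss weighting) is defined in the paper, and `unet` here is any frozen noise-prediction callable rather than a specific library API.
+
+    import torch
+
+    def asd_residual(unet, latents, text_emb, alphas_cumprod, t, shift=200):
+        # Noise the rendered latents at timestep t, but query the frozen diffusion
+        # prior at an earlier timestep t - shift, where its noise-prediction error
+        # tends to be lower. All names and shapes are placeholders.
+        noise = torch.randn_like(latents)
+        a_t = alphas_cumprod[t].view(-1, 1, 1, 1)
+        noisy = a_t.sqrt() * latents + (1 - a_t).sqrt() * noise
+        t_shifted = (t - shift).clamp(min=0)
+        with torch.no_grad():
+            eps_pred = unet(noisy, t_shifted, text_emb)       # frozen prior, no finetuning
+        return eps_pred - noise     # residual backpropagated into the 3D generator only
+
+    fake_unet = lambda x, t, c: torch.zeros_like(x)           # stand-in for a real model
+    lat = torch.randn(2, 4, 8, 8, requires_grad=True)
+    res = asd_residual(fake_unet, lat, None, torch.linspace(1, 0.01, 1000), torch.tensor([500, 700]))
+    print(res.shape)
+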
+
+ comment: Accepted by ECCV 2024. Code available at + https://github.com/theEricMa/ScaleDreamer +
+
+
+
+
+ + ☆ A Bounding Box is Worth One Token: Interleaving Layout and Text in a + Large Language Model for Document Understanding + + +
+ Recently, many studies have demonstrated that exclusively incorporating +OCR-derived text and spatial layouts with large language models (LLMs) can be +highly effective for document understanding tasks. However, existing methods +that integrate spatial layouts with text have limitations, such as producing +overly long text sequences or failing to fully leverage the autoregressive +traits of LLMs. In this work, we introduce Interleaving Layout and Text in a +Large Language Model (LayTextLLM) for document understanding. In particular, +LayTextLLM projects each bounding box to a single embedding and interleaves it +with text, efficiently avoiding long sequence issues while leveraging +autoregressive traits of LLMs. LayTextLLM not only streamlines the interaction +of layout and textual data but also shows enhanced performance in Key +Information Extraction (KIE) and Visual Question Answering (VQA). Comprehensive +benchmark evaluations reveal significant improvements, with a 27.0% increase on +KIE tasks and 24.1% on VQA tasks compared to previous state-of-the-art document +understanding MLLMs, as well as a 15.5% improvement over other SOTA OCR-based +LLMs on KIE tasks. + +
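+ A minimal sketch of the interleaving idea (not the released LayTextLLM code; the projection head and shapes are illustrative) could look as follows.
+
+    import torch
+    import torch.nn as nn
+
+    class BoxTextInterleaver(nn.Module):
+        # One learned embedding per OCR bounding box, interleaved with the token
+        # embeddings of the text inside that box.
+        def __init__(self, hidden_size, num_coords=4):
+            super().__init__()
+            self.box_proj = nn.Sequential(
+                nn.Linear(num_coords, hidden_size), nn.GELU(),
+                nn.Linear(hidden_size, hidden_size),
+            )
+
+        def forward(self, boxes, span_embeds):
+            # boxes: (num_spans, 4) normalized coordinates; span_embeds: list of
+            # (span_len_i, hidden_size) token embeddings, one entry per OCR span.
+            box_tokens = self.box_proj(boxes)
+            pieces = []
+            for b, span in zip(box_tokens, span_embeds):
+                pieces.append(b.unsqueeze(0))   # a single layout token ...
+                pieces.append(span)             # ... followed by its text tokens
+            return torch.cat(pieces, dim=0)     # sequence fed to the LLM
+
+    m = BoxTextInterleaver(hidden_size=32)
+    out = m(torch.rand(2, 4), [torch.rand(3, 32), torch.rand(5, 32)])
+    print(out.shape)   # torch.Size([10, 32])
+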
+
+
+
+
+ + ☆ To Forget or Not? Towards Practical Knowledge Unlearning for Large + Language Models + + +
+ Large Language Models (LLMs) trained on extensive corpora inevitably retain +sensitive data, such as personal privacy information and copyrighted material. +Recent advancements in knowledge unlearning involve updating LLM parameters to +erase specific knowledge. However, current unlearning paradigms are mired in +vague forgetting boundaries, often erasing knowledge indiscriminately. In this +work, we introduce KnowUnDo, a benchmark containing copyrighted content and +user privacy domains to evaluate if the unlearning process inadvertently erases +essential knowledge. Our findings indicate that existing unlearning methods +often suffer from excessive unlearning. To address this, we propose a simple +yet effective method, MemFlex, which utilizes gradient information to precisely +target and unlearn sensitive parameters. Experimental results show that MemFlex +is superior to existing methods in both precise knowledge unlearning and +general knowledge retaining of LLMs. Code and dataset will be released at +https://github.com/zjunlp/KnowUnDo. + +
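+ One plausible instantiation of "utilizes gradient information to precisely target and unlearn sensitive parameters" is sketched below; the ratio-based selection is our assumption, and the actual MemFlex criterion is defined in the paper.
+
+    import torch
+
+    def gradient_based_param_mask(model, forget_loss, retain_loss, top_frac=0.01):
+        # Mark parameters whose gradient on the forget objective is large relative to
+        # their gradient on the retain objective; only those would receive unlearning updates.
+        params = [p for p in model.parameters() if p.requires_grad]
+        g_forget = torch.autograd.grad(forget_loss, params, retain_graph=True)
+        g_retain = torch.autograd.grad(retain_loss, params)
+        masks = []
+        for gf, gr in zip(g_forget, g_retain):
+            ratio = gf.abs() / (gr.abs() + 1e-8)
+            k = max(1, int(top_frac * ratio.numel()))
+            thresh = ratio.flatten().topk(k).values.min()
+            masks.append(ratio >= thresh)
+        return masks
+
+    net = torch.nn.Linear(4, 2)
+    x = torch.randn(8, 4)
+    fl, rl = net(x).pow(2).mean(), net(x).abs().mean()   # toy forget / retain objectives
+    print([m.float().mean().item() for m in gradient_based_param_mask(net, fl, rl, top_frac=0.25)])
+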
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech + Recognition + + +
+ Audio-visual speech recognition (AVSR) is a multimodal extension of automatic +speech recognition (ASR), using video as a complement to audio. In AVSR, +considerable efforts have been directed at datasets for facial features such as +lip-reading, but these often fall short in evaluating image comprehension +capabilities in broader contexts. In this paper, we construct SlideAVSR, an +AVSR dataset using scientific paper explanation videos. SlideAVSR provides a +new benchmark where models transcribe speech utterances using the text on the +slides of the presentation recordings. As technical terms that appear frequently +in paper explanations are notoriously challenging to transcribe +without reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR +problems. As a simple yet effective baseline, we propose DocWhisper, an AVSR +model that can refer to textual information from slides, and confirm its +effectiveness on SlideAVSR. + +
+
+ comment: 3rd Workshop on Advances in Language and Vision Research (ALVR 2024) +
+
+
+
+
+ + ♻ ☆ Band-Attention Modulated RetNet for Face Forgery Detection + + +
+ Transformer networks are extensively utilized in face forgery detection +due to their scalability across large datasets. Despite their success, +transformers face challenges in balancing the capture of global context, which +is crucial for unveiling forgery clues, with computational complexity. To +mitigate this issue, we introduce Band-Attention modulated RetNet (BAR-Net), a +lightweight network designed to efficiently process extensive visual contexts +while avoiding catastrophic forgetting. Our approach empowers the target token +to perceive global information by assigning differential attention levels to +tokens at varying distances. We implement self-attention along both spatial +axes, thereby maintaining spatial priors and easing the computational +burden. Moreover, we present the adaptive frequency Band-Attention Modulation +mechanism, which treats the entire Discrete Cosine Transform spectrogram as a +series of frequency bands with learnable weights. Together, BAR-Net achieves +favorable performance on several face forgery datasets, outperforming current +state-of-the-art methods. + +
+
+ comment: The essay is poorly expressed in writing and will be re-optimised +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 71 + +
+
+
+ + ☆ Improving Multilingual Instruction Finetuning via Linguistically Natural + and Diverse Datasets + + +
+ Advancements in Large Language Models (LLMs) have significantly enhanced +instruction-following capabilities. However, most Instruction Fine-Tuning (IFT) +datasets are predominantly in English, limiting model performance in other +languages. Traditional methods for creating multilingual IFT datasets such as +translating existing English IFT datasets or converting existing NLP datasets +into IFT datasets by templating, struggle to capture linguistic nuances and +ensure prompt (instruction) diversity. To address this issue, we propose a +novel method for collecting multilingual IFT datasets that preserves linguistic +naturalness and ensures prompt diversity. This approach leverages +English-focused LLMs, monolingual corpora, and a scoring function to create +high-quality, diversified IFT datasets in multiple languages. Experiments +demonstrate that LLMs finetuned using these IFT datasets show notable +improvements in both generative and discriminative tasks, indicating enhanced +language comprehension by LLMs in non-English contexts. Specifically, on the +multilingual summarization task, LLMs using our IFT dataset achieved 17.57% and +15.23% improvements over LLMs fine-tuned with translation-based and +template-based datasets, respectively. + +
+
+
+
+
+ + ☆ Purple-teaming LLMs with Adversarial Defender Training + + +
+ Existing efforts in safeguarding LLMs are limited in actively exposing the +vulnerabilities of the target LLM and readily adapting to newly emerging safety +risks. To address this, we present Purple-teaming LLMs with Adversarial +Defender training (PAD), a pipeline designed to safeguard LLMs by combining +red-teaming (attack) and blue-teaming (safety training) techniques in a novel way. +In PAD, we automatically collect conversational data that cover the +vulnerabilities of an LLM around specific safety risks in a self-play manner, +where the attacker aims to elicit unsafe responses and the defender generates +safe responses to these attacks. We then update both modules in a generative +adversarial network style by training the attacker to elicit more unsafe +responses and updating the defender to identify them and explain the unsafe +reason. Experimental results demonstrate that PAD significantly outperforms +existing baselines in both finding effective attacks and establishing a robust +safety guardrail. Furthermore, our findings indicate that PAD excels in striking +a balance between safety and overall model quality. We also reveal key +challenges in safeguarding LLMs, including defending against multi-turn attacks and the +need for more delicate strategies to identify specific risks. + +
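+ A simplified sketch of the self-play data-collection loop described above is given below; the attacker, defender, and judge callables are placeholders for LLM wrappers, and the actual PAD training updates are not shown.
+
+    def purple_team_round(attacker, defender, judge, seed_prompts, n_turns=3):
+        # attacker / defender: callables str -> str; judge: (attack, response) -> bool,
+        # True if the response is unsafe. The returned pairs would feed the next round
+        # of adversarial fine-tuning of both modules.
+        successful_attacks, safe_exchanges = [], []
+        for prompt in seed_prompts:
+            context = prompt
+            for _ in range(n_turns):
+                attack = attacker(context)
+                response = defender(attack)
+                bucket = successful_attacks if judge(attack, response) else safe_exchanges
+                bucket.append((attack, response))
+                context = f"{context}\n{attack}\n{response}"
+        return successful_attacks, safe_exchanges
+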
+
+
+
+
+ + ☆ A Study of Nationality Bias in Names and Perplexity using Off-the-Shelf + Affect-related Tweet Classifiers + + +
+ In this paper, we apply a method to quantify biases associated with named +entities from various countries. We create counterfactual examples with small +perturbations on target-domain data instead of relying on templates or specific +datasets for bias detection. On widely used classifiers for subjectivity +analysis, including sentiment, emotion, hate speech, and offensive text using +Twitter data, our results demonstrate positive biases related to the language +spoken in a country across all classifiers studied. Notably, the presence of +certain country names in a sentence can strongly influence predictions, up to a +23\% change in hate speech detection and up to a 60\% change in the prediction +of negative emotions such as anger. We hypothesize that these biases stem from +the training data of pre-trained language models (PLMs) and find correlations +between affect predictions and PLM likelihood in English and unknown languages +like Basque and Maori, revealing distinct patterns with exacerbated +correlations. Further, we followed these correlations between counterfactual +examples from the same sentence to remove the syntactic component, uncovering +interesting results suggesting that the impact of the pre-training data was more +important for English-speaking-country names. Our anonymized code is +[available here](https://anonymous.4open.science/r/biases_ppl-576B/README.md). + +
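+ The counterfactual probing described above can be illustrated with a small sketch; the country list, placeholder sentence, and stand-in classifier below are ours, not from the paper.
+
+    def country_counterfactuals(sentence, placeholder="Canada",
+                                countries=("Canada", "Nigeria", "France", "Vietnam")):
+        # Build counterfactual variants of a target-domain sentence by swapping only
+        # the country name.
+        return {c: sentence.replace(placeholder, c) for c in countries}
+
+    def prediction_shift(classifier, variants):
+        # classifier: any callable str -> float, e.g. an off-the-shelf hate-speech probability.
+        scores = {c: classifier(text) for c, text in variants.items()}
+        return max(scores.values()) - min(scores.values()), scores
+
+    variants = country_counterfactuals("The food from Canada tastes awful.")
+    fake_clf = lambda t: 0.6 if "Nigeria" in t else 0.2   # stand-in for a real classifier
+    print(prediction_shift(fake_clf, variants))
+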
+
+
+
+
+ + ☆ Empathic Grounding: Explorations using Multimodal Interaction and Large + Language Models with Conversational Agents + + +
+ We introduce the concept of "empathic grounding" in conversational agents as +an extension of Clark's conceptualization of grounding in conversation in which +the grounding criterion includes listener empathy for the speaker's affective +state. Empathic grounding is generally required whenever the speaker's emotions +are foregrounded and can make the grounding process more efficient and reliable +by communicating both propositional and affective understanding. Both speaker +expressions of affect and listener empathic grounding can be multimodal, +including facial expressions and other nonverbal displays. Thus, models of +empathic grounding for embodied agents should be multimodal to facilitate +natural and efficient communication. We describe a multimodal model that takes +as input user speech and facial expression to generate multimodal grounding +moves for a listening agent using a large language model. We also describe a +testbed to evaluate approaches to empathic grounding, in which a humanoid robot +interviews a user about a past episode of pain and then has the user rate their +perception of the robot's empathy. We compare our proposed model to one that +only generates non-affective grounding cues in a between-subjects experiment. +Findings demonstrate that empathic grounding increases user perceptions of +empathy, understanding, emotional intelligence, and trust. Our work highlights +the role of emotion awareness and multimodality in generating appropriate +grounding moves for conversational agents. + +
+
+
+
+
+ + ☆ Race and Privacy in Broadcast Police Communications SC + + +
+ Radios are essential for the operations of modern police departments, and +they function as both a collaborative communication technology and a +sociotechnical system. However, little prior research has examined their usage +or their connections to individual privacy and the role of race in policing, +two growing topics of concern in the US. As a case study, we examine the +Chicago Police Department's (CPD's) use of broadcast police communications +(BPC) to coordinate the activity of law enforcement officers (LEOs) in the +city. From a recently assembled archive of 80,775 hours of BPC associated with +CPD operations, we analyze text transcripts of radio transmissions broadcast +from 9:00 AM to 5:00 PM on August 10th, 2018 in one majority Black, one majority +white, and one majority Hispanic area of the city (24 hours of audio) to +explore four research questions: (1) Do BPC reflect reported racial +disparities in policing? (2) How and when are gender, race/ethnicity, and age +mentioned in BPC? (3) To what extent do BPC include sensitive information, and +who is put most at risk by this practice? (4) To what extent can large language +models (LLMs) heighten this risk? We explore the vocabulary and speech acts +used by police in BPC, comparing mentions of personal characteristics to local +demographics, the personal information shared over BPC, and the privacy +concerns that it poses. Analysis indicates (a) policing professionals in the +city of Chicago exhibit disproportionate attention to Black members of the +public regardless of context, (b) sociodemographic characteristics like gender, +race/ethnicity, and age are primarily mentioned in BPC about event information, +and (c) disproportionate attention introduces disproportionate privacy risks +for Black members of the public. + +
+
+ comment: Accepted in the 27th ACM Conference on Computer-Supported Cooperative + Work and Social Computing (CSCW '24) +
+
+
+
+
+ + ☆ Ground Every Sentence: Improving Retrieval-Augmented LLMs with + Interleaved Reference-Claim Generation + + +
+ Retrieval-Augmented Generation (RAG) has been widely adopted to enhance Large +Language Models (LLMs) in knowledge-intensive tasks. Recently, Attributed Text +Generation (ATG) has attracted growing attention, which provides citations to +support the model's responses in RAG, so as to enhance the credibility of +LLM-generated content and facilitate verification. Prior methods mainly adopt +coarse-grained attributions, linking to passage-level references or providing +paragraph-level citations. However, these methods still fall short in +verifiability and require certain time costs for fact checking. This paper +proposes a fine-grained ATG method called ReClaim(Refer & Claim), which +alternates the generation of references and answers step by step. Unlike +traditional coarse-grained attribution, ReClaim allows the model to add +sentence-level fine-grained citations to each answer sentence in long-form +question-answering tasks. Our experiments encompass various training and +inference methods and multiple LLMs, verifying the effectiveness of our +approach. + +
+
+ comment: 15 pages,2 figures +
+
+
+
+
+ + ☆ Analyzing Persuasive Strategies in Meme Texts: A Fusion of Language + Models with Paraphrase Enrichment + + +
+ This paper describes our approach to hierarchical multi-label detection of +persuasion techniques in meme texts. Our model, developed as a part of the +recent SemEval task, is based on fine-tuning individual language models (BERT, +XLM-RoBERTa, and mBERT) and leveraging a mean-based ensemble model in addition +to dataset augmentation through paraphrase generation from ChatGPT. The scope +of the study encompasses enhancing model performance through innovative +training techniques and data augmentation strategies. The problem addressed is +the effective identification and classification of multiple persuasive +techniques in meme texts, a task complicated by the diversity and complexity of +such content. The objective of the paper is to improve detection accuracy by +refining model training methods and examining the impact of balanced versus +unbalanced training datasets. Novelty in the results and discussion lies in the +finding that training with paraphrases enhances model performance, yet a +balanced training set proves more advantageous than a larger unbalanced one. +Additionally, the analysis reveals the potential pitfalls of indiscriminate +incorporation of paraphrases from diverse distributions, which can introduce +substantial noise. Results with the SemEval 2024 data confirm these insights, +demonstrating improved model efficacy with the proposed methods. + +
+
+ comment: 15 pages, 8 figures, 1 table, Proceedings of 5th International + Conference on Natural Language Processing and Applications (NLPA 2024) +
+
+
+
+
+ + ☆ DiscoveryBench: Towards Data-Driven Discovery with Large Language Models + + +
+ Can the rapid advances in code generation, function calling, and data +analysis using large language models (LLMs) help automate the search and +verification of hypotheses purely from a set of provided datasets? To evaluate +this question, we present DiscoveryBench, the first comprehensive benchmark +that formalizes the multi-step process of data-driven discovery. The benchmark +is designed to systematically assess current model capabilities in discovery +tasks and provide a useful resource for improving them. Our benchmark contains +264 tasks collected across 6 diverse domains, such as sociology and +engineering, by manually deriving discovery workflows from published papers to +approximate the real-world challenges faced by researchers, where each task is +defined by a dataset, its metadata, and a discovery goal in natural language. +We additionally provide 903 synthetic tasks to conduct controlled evaluations +across task complexity. Furthermore, our structured formalism of data-driven +discovery enables a facet-based evaluation that provides useful insights into +different failure modes. We evaluate several popular LLM-based reasoning +frameworks using both open and closed LLMs as baselines on DiscoveryBench and +find that even the best system scores only 25%. Our benchmark, thus, +illustrates the challenges in autonomous data-driven discovery and serves as a +valuable resource for the community to make progress. + +
+
+ comment: Website: https://github.com/allenai/discoverybench +
+
+
+
+
+ + ☆ NLPGuard: A Framework for Mitigating the Use of Protected Attributes by + NLP Classifiers SC + + +
+ AI regulations are expected to prohibit machine learning models from using +sensitive attributes during training. However, the latest Natural Language +Processing (NLP) classifiers, which rely on deep learning, operate as black-box +systems, complicating the detection and remediation of such misuse. Traditional +bias mitigation methods in NLP aim for comparable performance across different +groups based on attributes like gender or race but fail to address the +underlying issue of reliance on protected attributes. To partly fix that, we +introduce NLPGuard, a framework for mitigating the reliance on protected +attributes in NLP classifiers. NLPGuard takes an unlabeled dataset, an existing +NLP classifier, and its training data as input, producing a modified training +dataset that significantly reduces dependence on protected attributes without +compromising accuracy. NLPGuard is applied to three classification tasks: +identifying toxic language, sentiment analysis, and occupation classification. +Our evaluation shows that current NLP classifiers heavily depend on protected +attributes, with up to $23\%$ of the most predictive words associated with +these attributes. However, NLPGuard effectively reduces this reliance by up to +$79\%$, while slightly improving accuracy. + +
+
+ comment: Paper accepted at CSCW 2024 +
+
+
+
+
+ + ☆ Deciphering the Factors Influencing the Efficacy of Chain-of-Thought: + Probability, Memorization, and Noisy Reasoning + + +
+ Chain-of-Thought (CoT) prompting has been shown to enhance the multi-step +reasoning capabilities of Large Language Models (LLMs). However, debates +persist about whether LLMs exhibit abstract generalization or rely on shallow +heuristics when given CoT prompts. To understand the factors influencing CoT +reasoning we provide a detailed case study of the symbolic reasoning task of +decoding shift ciphers, where letters are shifted forward some number of steps +in the alphabet. GPT-4 achieves zero accuracy on most shift ciphers with +standard prompting, but with CoT its accuracy improves to an average of 32%. By +focusing on a single relatively simple task, we are able to identify three +factors that systematically affect CoT performance: the probability of the +task's expected output (probability), what the model has implicitly learned +during pre-training (memorization), and the number of intermediate operations +involved in reasoning (noisy reasoning). We show that these factors can +drastically influence the task accuracy; e.g., varying the output's probability +of occurrence can shift accuracy from 26% to 70%. We also demonstrate that it +is essential for the model to explicitly produce intermediate steps as output +that can be conditioned on to increase the probability of the correct answer. +Our experiments indicate that as long as the model does so, the validity of the +demonstrations in the prompt does not matter. Overall, we conclude that CoT +prompting performance reflects both memorization and a probabilistic version of +genuine reasoning. + +
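+ For reference, the symbolic task itself is easy to state in code, and model outputs with or without CoT can be checked against a decoder like the following (the example string is ours, not from the paper).
+
+    def shift_decode(ciphertext, shift):
+        # Ground-truth decoder for the shift-cipher task: each letter was shifted
+        # `shift` steps forward in the alphabet, so decoding shifts it back.
+        out = []
+        for ch in ciphertext:
+            if ch.isalpha():
+                base = ord('a') if ch.islower() else ord('A')
+                out.append(chr((ord(ch) - base - shift) % 26 + base))
+            else:
+                out.append(ch)
+        return "".join(out)
+
+    print(shift_decode("Fgnl phevbhf", 13))   # rot-13 example -> "Stay curious"
+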
+
+ comment: 9 pages plus references and appendices +
+
+
+
+
+ + ☆ KV Cache Compression, But What Must We Give in Return? A Comprehensive + Benchmark of Long Context Capable Approaches + + +
+ Long context capability is a crucial competency for large language models +(LLMs) as it mitigates the human struggle to digest long-form texts. This +capability enables complex task-solving scenarios such as book summarization, +code assistance, and many more tasks that are traditionally manpower-intensive. +However, transformer-based LLMs face significant challenges with long context +input due to the growing size of the KV cache and the intrinsic complexity of +attending to extended inputs; where multiple schools of efficiency-driven +approaches -- such as KV cache quantization, token dropping, prompt +compression, linear-time sequence models, and hybrid architectures -- have been +proposed to produce efficient yet long context-capable models. Despite these +advancements, no existing work has comprehensively benchmarked these methods in +a reasonably aligned environment. In this work, we fill this gap by providing a +taxonomy of current methods and evaluating 10+ state-of-the-art approaches +across seven categories of long context tasks. Our work reveals numerous +previously unknown phenomena and offers insights -- as well as a friendly +workbench -- for the future development of long context-capable LLMs. The +source code will be available at https://github.com/henryzhongsc/longctx_bench + +
+
+
+
+
+ + ☆ MMLongBench-Doc: Benchmarking Long-context Document Understanding with + Visualizations + + +
+ Understanding documents with rich layouts and multi-modal components is a +long-standing and practical task. Recent Large Vision-Language Models (LVLMs) +have made remarkable strides in various tasks, particularly in single-page +document understanding (DU). However, their abilities on long-context DU remain +an open problem. This work presents MMLongBench-Doc, a long-context, +multi-modal benchmark comprising 1,062 expert-annotated questions. Distinct +from previous datasets, it is constructed upon 130 lengthy PDF-formatted +documents with an average of 49.4 pages and 20,971 textual tokens. Towards +comprehensive evaluation, answers to these questions rely on pieces of evidence +from (1) different sources (text, image, chart, table, and layout structure) +and (2) various locations (i.e. page number). Moreover, 33.2% of the questions +are cross-page questions requiring evidence across multiple pages. 22.8% of the +questions are designed to be unanswerable for detecting potential +hallucinations. Experiments on 14 LVLMs demonstrate that long-context DU +greatly challenges current models. Notably, the best-performing model, GPT-4o, +achieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores +31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse +performance than their LLM counterparts which are fed with lossy-parsed OCR +documents. These results validate the necessity of future research toward more +capable long-context LVLMs. Project Page: +https://mayubo2333.github.io/MMLongBench-Doc + +
+
+
+
+
+ + ♻ ☆ Safe and Responsible Large Language Model : Can We Balance Bias + Reduction and Language Understanding in Large Language Models? + + +
+ Large Language Models (LLMs) have significantly advanced various NLP tasks. +However, these models often risk generating unsafe text that perpetuates +biases. Current approaches to produce unbiased outputs from LLMs can reduce +biases but at the expense of knowledge retention. In this research, we address +the question of whether producing safe (unbiased) outputs through LLMs can +retain knowledge and language understanding. In response, we developed the +Safety and Responsible Large Language Model (\textbf{SR}$_{\text{LLM}}$), an +LLM that has been instruction fine-tuned on top of already safe LLMs (e.g., +Llama2 or related) to diminish biases in generated text. To achieve our goals, +we compiled a specialized dataset designed to train our model in identifying +and correcting biased text. We conduct experiments, both on this custom data +and out-of-distribution test sets, to show the bias reduction and knowledge +retention. The results confirm that \textbf{SR}$_{\text{LLM}}$ outperforms +traditional fine-tuning and prompting methods in both reducing biases and +preserving the integrity of language knowledge. The significance of our +findings lies in demonstrating that instruction fine-tuning can provide a more +robust solution for bias reduction in LLMs. We have made our code and data +available at +\href{https://github.com/shainarazavi/Safe-Responsible-LLM}{Safe-LLM}. + +
+
+
+
+
+ + ♻ ☆ Large Language Models Assume People are More Rational than We Really are + + +
+ In order for AI systems to communicate effectively with people, they must +understand how we make decisions. However, people's decisions are not always +rational, so the implicit internal models of human decision-making in Large +Language Models (LLMs) must account for this. Previous empirical evidence seems +to suggest that these implicit models are accurate -- LLMs offer believable +proxies of human behavior, acting how we expect humans would in everyday +interactions. However, by comparing LLM behavior and predictions to a large +dataset of human decisions, we find that this is actually not the case: when +both simulating and predicting people's choices, a suite of cutting-edge LLMs +(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more +rational than we really are. Specifically, these models deviate from human +behavior and align more closely with a classic model of rational choice -- +expected value theory. Interestingly, people also tend to assume that other +people are rational when interpreting their behavior. As a consequence, when we +compare the inferences that LLMs and people draw from the decisions of others +using another psychological dataset, we find that these inferences are highly +correlated. Thus, the implicit decision-making models of LLMs appear to be +aligned with the human expectation that other people will act rationally, +rather than with how people actually act. + +
+
+
+
+
+ + ♻ ☆ Predicting Text Preference Via Structured Comparative Reasoning + + +
+ Comparative reasoning plays a crucial role in text preference prediction; +however, large language models (LLMs) often demonstrate inconsistencies in +their reasoning. While approaches like Chain-of-Thought improve accuracy in +many other settings, they struggle to consistently distinguish the similarities +and differences of complex texts. We introduce SC, a prompting approach that +predicts text preferences by generating structured intermediate comparisons. SC +begins by proposing aspects of comparison, followed by generating textual +comparisons under each aspect. We select consistent comparisons with a pairwise +consistency comparator that ensures each aspect's comparisons clearly +distinguish differences between texts, significantly reducing hallucination and +improving consistency. Our comprehensive evaluations across various NLP tasks, +including summarization, retrieval, and automatic rating, demonstrate that SC +equips LLMs to achieve state-of-the-art performance in text preference +prediction. + +
+
+
+
+
+ + ♻ ☆ Does Writing with Language Models Reduce Content Diversity? ICLR 2024 + + +
+ Large language models (LLMs) have led to a surge in collaborative writing +with model assistance. As different users incorporate suggestions from the same +model, there is a risk of decreased diversity in the produced content, +potentially limiting diverse perspectives in public discourse. In this work, we +measure the impact of co-writing on diversity via a controlled experiment, +where users write argumentative essays in three setups -- using a base LLM +(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We +develop a set of diversity metrics and find that writing with InstructGPT (but +not the GPT3) results in a statistically significant reduction in diversity. +Specifically, it increases the similarity between the writings of different +authors and reduces the overall lexical and content diversity. We additionally +find that this effect is mainly attributable to InstructGPT contributing less +diverse text to co-written essays. In contrast, the user-contributed text +remains unaffected by model collaboration. This suggests that the recent +improvement in generation quality from adapting models to human feedback might +come at the cost of more homogeneous and less diverse content. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Video-Language Understanding: A Survey from Model Architecture, Model + Training, and Data Perspectives ACL 2024 + + +
+ Humans use multiple senses to comprehend the environment. Vision and language +are two of the most vital senses since they allow us to easily communicate our +thoughts and perceive the world around us. There has been a lot of interest in +creating video-language understanding systems with human-like senses since a +video-language pair can mimic both our linguistic medium and visual environment +with temporal dynamics. In this survey, we review the key tasks of these +systems and highlight the associated challenges. Based on the challenges, we +summarize their methods from model architecture, model training, and data +perspectives. We also conduct performance comparison among the methods, and +discuss promising directions for future research. + +
+
+ comment: Accepted at ACL 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Explainability of machine learning approaches in forensic linguistics: a + case study in geolinguistic authorship profiling + + +
+ Forensic authorship profiling uses linguistic markers to infer +characteristics about an author of a text. This task is paralleled in dialect +classification, where a prediction is made about the linguistic variety of a +text based on the text itself. While there have been significant advances in +recent years in variety classification, forensic linguistics rarely relies on +these approaches due to their lack of transparency, among other reasons. In +this paper we therefore explore the explainability of machine learning +approaches considering the forensic context. We focus on variety classification +as a means of geolinguistic profiling of unknown texts based on social media +data from the German-speaking area. For this, we identify the lexical items +that are the most impactful for the variety classification. We find that the +extracted lexical features are indeed representative of their respective +varieties and note that the trained models also rely on place names for +classifications. + +
+
+
+
+
+ + ♻ ☆ Are LLMs Rational Investors? A Study on Detecting and Reducing the + Financial Bias in LLMs + + +
+ Large Language Models (LLMs) are increasingly adopted in financial analysis +for interpreting complex market data and trends. However, their use is +challenged by intrinsic biases (e.g., risk-preference bias) and a superficial +understanding of market intricacies, necessitating a thorough assessment of +their financial insight. To address these issues, we introduce Financial Bias +Indicators (FBI), a framework with components like Bias Unveiler, Bias +Detective, Bias Tracker, and Bias Antidote to identify, detect, analyze, and +eliminate irrational biases in LLMs. By combining behavioral finance principles +with bias examination, we evaluate 23 leading LLMs and propose a de-biasing +method based on financial causal knowledge. Results show varying degrees of +financial irrationality among models, influenced by their design and training. +Models trained specifically on financial datasets may exhibit more +irrationality, and even larger financial language models (FinLLMs) can show +more bias than smaller, general models. We utilize four prompt-based methods +incorporating causal debiasing, effectively reducing financial biases in these +models. This work enhances the understanding of LLMs' bias in financial +applications, laying the foundation for developing more reliable and rational +financial analysis tools. + +
+
+
+
+
+ + ♻ ☆ Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through + the Lens of Moral Theories? + + +
+ Making moral judgments is an essential step toward developing ethical AI +systems. Prevalent approaches are mostly implemented in a bottom-up manner, +which uses a large set of annotated data to train models based on crowd-sourced +opinions about morality. These approaches have been criticized for +overgeneralizing the moral stances of a limited group of annotators and lacking +explainability. This work proposes a flexible top-down framework to steer +(Large) Language Models (LMs) to perform moral reasoning with well-established +moral theories from interdisciplinary research. The theory-guided top-down +framework can incorporate various moral theories. Our experiments demonstrate +the effectiveness of the proposed framework on datasets derived from moral +theories. Furthermore, we show the alignment between different moral theories +and existing morality datasets. Our analysis exhibits the potential and flaws +in existing resources (models and datasets) in developing explainable moral +judgment-making systems. + +
+
+
+
+
+ + ♻ ☆ Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models UAI 2024 + + +
+ For downstream applications of vision-language pre-trained models, there has +been significant interest in constructing effective prompts. Existing works on +prompt engineering, which either require laborious manual designs or optimize +the prompt tuning as a point estimation problem, may fail to describe diverse +characteristics of categories and limit their applications. We introduce a +Bayesian probabilistic resolution to prompt tuning, where the label-specific +stochastic prompts are generated hierarchically by first sampling a latent +vector from an underlying distribution and then employing a lightweight +generative model. Importantly, we semantically regularize the tuning process by +minimizing the statistical distance between the visual patches and linguistic +prompts, which pushes the stochastic label representations to faithfully +capture diverse visual concepts, instead of overfitting the training +categories. We evaluate the effectiveness of our approach on four tasks: +few-shot image recognition, base-to-new generalization, dataset transfer +learning, and domain shifts. Extensive results over 15 datasets show promising +transferability and generalization performance of our proposed model, both +quantitatively and qualitatively. + +
+
+ comment: Accepted by UAI 2024 +
+
+
+
+
+ + ♻ ☆ ViANLI: Adversarial Natural Language Inference for Vietnamese + + +
+ The development of Natural Language Inference (NLI) datasets and models has +been inspired by innovations in annotation design. With the rapid development +of machine learning models today, the performance of existing machine learning +models has quickly reached state-of-the-art results on a variety of tasks +related to natural language processing, including natural language inference +tasks. By using a pre-trained model during the annotation process, it is +possible to challenge current NLI models by having humans produce +premise-hypothesis combinations that the machine model cannot correctly +predict. To remain attractive and challenging in the research of natural +language inference for Vietnamese, in this paper, we introduce the adversarial +NLI dataset to the NLP research community with the name ViANLI. This dataset +contains more than 10K premise-hypothesis pairs and is built through a continuous +adjustment process to get the most out of the patterns generated by the +annotators. The ViANLI dataset poses difficulties for many current SOTA +models: the accuracy of the most powerful model on the test set reaches only +48.4%. Additionally, the experimental results show that models +trained on our dataset significantly improve results on other +Vietnamese NLI datasets. + +
+
+
+
+
+ + ♻ ☆ BeHonest: Benchmarking Honesty of Large Language Models + + +
+ Previous works on Large Language Models (LLMs) have mainly focused on +evaluating their helpfulness or harmlessness. However, honesty, another crucial +alignment criterion, has received relatively less attention. Dishonest +behaviors in LLMs, such as spreading misinformation and defrauding users, +erode user trust and cause real-world harm, presenting severe risks that +intensify as these models approach superintelligence levels. Enhancing honesty +in LLMs addresses critical deficiencies and helps uncover latent capabilities +that are not readily expressed. This underscores the urgent need for reliable +methods and benchmarks to effectively ensure and evaluate the honesty of LLMs. + In this paper, we introduce BeHonest, a pioneering benchmark specifically +designed to assess honesty in LLMs comprehensively. BeHonest evaluates three +essential aspects of honesty: awareness of knowledge boundaries, avoidance of +deceit, and consistency in responses. Building on this foundation, we designed +10 scenarios to evaluate and analyze 9 popular LLMs on the market, including +both closed-source and open-source models from different model families with +varied model sizes. Our findings indicate that there is still significant room +for improvement in the honesty of LLMs. We also encourage the AI community to +prioritize honesty alignment in LLMs. Our benchmark and code can be found at: +\url{https://github.com/GAIR-NLP/BeHonest}. + +
+
+
+
+
+ + ♻ ☆ Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated + Text + + +
+ Detecting text generated by modern large language models is thought to be +hard, as both LLMs and humans can exhibit a wide range of complex behaviors. +However, we find that a score based on contrasting two closely related language +models is highly accurate at separating human-generated and machine-generated +text. Based on this mechanism, we propose a novel LLM detector that only +requires simple calculations using a pair of pre-trained LLMs. The method, +called Binoculars, achieves state-of-the-art accuracy without any training +data. It is capable of spotting machine text from a range of modern LLMs +without any model-specific modifications. We comprehensively evaluate +Binoculars on a number of text sources and in varied situations. Over a wide +range of document types, Binoculars detects over 90% of generated samples from +ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being +trained on any ChatGPT data. + +
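+ A simplified sketch of a two-model contrast statistic in this spirit is shown below; see the linked repository for the exact Binoculars definition, thresholds, and model pairing. All names and toy tensors are illustrative.
+
+    import torch
+    import torch.nn.functional as F
+
+    def two_model_contrast_score(logits_observer, logits_performer, token_ids):
+        # Log-perplexity of the text under one model, normalized by the cross-entropy
+        # between the two models' next-token distributions on the same text.
+        logp_perf = F.log_softmax(logits_performer, dim=-1)
+        log_ppl = -logp_perf[torch.arange(len(token_ids)), token_ids].mean()
+        p_obs = F.softmax(logits_observer, dim=-1)
+        cross_ent = -(p_obs * logp_perf).sum(dim=-1).mean()
+        return (log_ppl / cross_ent).item()   # lower values point toward machine-generated text
+
+    v, n = 50, 12
+    obs, perf = torch.randn(n, v), torch.randn(n, v)
+    print(two_model_contrast_score(obs, perf, torch.randint(0, v, (n,))))
+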
+
+ comment: 20 pages, code available at https://github.com/ahans30/Binoculars +
+
+
+
+
+ + ♻ ☆ $R^3$-NL2GQL: A Model Coordination and Knowledge Graph Alignment + Approach for NL2GQL + + +
+ While current tasks of converting natural language to SQL (NL2SQL) using +Foundation Models have shown impressive achievements, adapting these approaches +for converting natural language to Graph Query Language (NL2GQL) encounters +hurdles due to the distinct nature of GQL compared to SQL, alongside the +diverse forms of GQL. Moving away from traditional rule-based and slot-filling +methodologies, we introduce a novel approach, $R^3$-NL2GQL, integrating both +small and large Foundation Models for ranking, rewriting, and refining tasks. +This method leverages the interpretative strengths of smaller models for +initial ranking and rewriting stages, while capitalizing on the superior +generalization and query generation prowess of larger models for the final +transformation of natural language queries into GQL formats. Addressing the +scarcity of datasets in this emerging field, we have developed a bilingual +dataset, sourced from graph database manuals and selected open-source Knowledge +Graphs (KGs). Our evaluation of this methodology on this dataset demonstrates +its promising efficacy and robustness. + +
+
+
+
+
+ + ♻ ☆ SCAR: Efficient Instruction-Tuning for Large Language Models via Style + Consistency-Aware Response Ranking + + +
+ Recent studies have shown that maintaining a consistent response style by +human experts and enhancing data quality in training sets can significantly +improve the performance of fine-tuned Large Language Models (LLMs) while +reducing the number of training examples needed. However, the precise +definition of style and the relationship between style, data quality, and LLM +performance remains unclear. This research decomposes response style into +presentation and composition styles and finds that, among training data of +similar quality, those with higher style consistency lead to better LLM +performance. Inspired by this, we introduce Style Consistency-Aware Response +Ranking (SCAR), which automatically prioritizes instruction-response pairs in +the training set based on their response stylistic consistency. By selecting +the most style-consistent examples, ranging from the top 25% to 0.7% of the +full dataset, the fine-tuned LLMs can match or even surpass the performance of +models trained on the entire dataset in coding and open-ended +question-answering benchmarks. Code and data are available at +https://github.com/zhuang-li/SCAR . + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Rethinking LLM Memorization through the Lens of Adversarial Compression + + +
+ Large language models (LLMs) trained on web-scale datasets raise substantial +concerns regarding permissible data usage. One major question is whether these +models "memorize" all their training data or whether they integrate many data sources +in some way more akin to how a human would learn and synthesize information. +The answer hinges, to a large degree, on how we define memorization. In this +work, we propose the Adversarial Compression Ratio (ACR) as a metric for +assessing memorization in LLMs. A given string from the training data is +considered memorized if it can be elicited by a prompt (much) shorter than the +string itself -- in other words, if these strings can be "compressed" with the +model by computing adversarial prompts of fewer tokens. The ACR overcomes the +limitations of existing notions of memorization by (i) offering an adversarial +view of measuring memorization, especially for monitoring unlearning and +compliance; and (ii) allowing for the flexibility to measure memorization for +arbitrary strings at a reasonably low compute cost. Our definition serves as a +practical tool for determining when model owners may be violating terms around +data usage, providing a potential legal tool and a critical lens through which +to address such scenarios. + +
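Once an adversarial prompt has been found, the metric itself reduces to a token-count ratio; the expensive prompt-search step is abstracted away in this sketch.

```python
# Sketch of the Adversarial Compression Ratio (ACR). The adversarial prompt is
# assumed to have been produced by a separate token-level optimization, which is
# not shown here.
def adversarial_compression_ratio(target_tokens: list[int],
                                  prompt_tokens: list[int]) -> float:
    """ACR = |target string| / |shortest eliciting prompt|, both in tokens."""
    return len(target_tokens) / max(1, len(prompt_tokens))

def is_memorized(target_tokens: list[int], prompt_tokens: list[int]) -> bool:
    # ACR > 1 means the string is elicited by a prompt shorter than itself,
    # i.e. it can be "compressed" with the model.
    return adversarial_compression_ratio(target_tokens, prompt_tokens) > 1.0
```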
+
+ comment: https://locuslab.github.io/acr-memorization +
+
+
+
+
+ + ♻ ☆ $Classi|Q\rangle$ Towards a Translation Framework To Bridge The + Classical-Quantum Programming Gap + + +
+ Quantum computing, although readily available as hardware or emulated on the +cloud, is still far from being broadly accessible, owing to complex +programming paradigms and steep learning curves. This vision paper introduces +$Classi|Q\rangle$, a translation framework idea to bridge Classical and Quantum +Computing by translating high-level programming languages, e.g., Python or C++, +into a low-level language, e.g., Quantum Assembly. Our idea paper serves as a +blueprint for ongoing efforts in quantum software engineering, offering a +roadmap for further $Classi|Q\rangle$ development to meet the diverse needs of +researchers and practitioners. $Classi|Q\rangle$ is designed to empower +researchers and practitioners with no prior quantum experience to harness the +potential of hybrid quantum computation. We also discuss future enhancements to +$Classi|Q\rangle$, including support for additional quantum languages, improved +optimization strategies, and integration with emerging quantum computing +platforms. + +
+
+
+
+
+ + ♻ ☆ Efficient Prompt Tuning by Multi-Space Projection and Prompt Fusion + + +
+ Prompt tuning is a promising method to fine-tune a pre-trained language model +without retraining its large-scale parameters. Instead, it attaches a soft +prompt to the input text, whereby downstream tasks can be well adapted by +merely learning the embeddings of prompt tokens. Nevertheless, existing methods +still suffer from two challenges: (i) it is hard for them to balance accuracy and +efficiency, since a longer (shorter) soft prompt generally leads to better (worse) +accuracy but at the cost of more (less) training time; and (ii) performance may +not be consistent when adapting to different downstream tasks, which we attribute +to a single embedding space having to serve the different requirements of +downstream tasks. To address these issues, we propose an Efficient Prompt +Tuning method (EPT) by multi-space projection and prompt fusion. Specifically, +it decomposes a given soft prompt into a shorter prompt and two low-rank +matrices, significantly reducing the training time. Accuracy is also enhanced +by leveraging low-rank matrices and the short prompt as additional knowledge +sources to enrich the semantics of the original short prompt. In addition, we +project the soft prompt into multiple subspaces to improve the performance +consistency, and then adaptively learn the combination weights of different +spaces through a gating network. Experiments on 13 natural language processing +downstream tasks show that our method significantly and consistently +outperforms 11 comparison methods, with relative improvements of up to 12.9% +and training time reduced by 14%. + +
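A rough PyTorch sketch of the ingredients named in the abstract: a short soft prompt enriched by two low-rank matrices, projected into several subspaces whose contributions are mixed by a gating network. All dimensions and the exact wiring are assumptions; the paper's architecture may differ in detail.

```python
import torch
import torch.nn as nn

class EfficientPromptSketch(nn.Module):
    """Illustrative decomposition of a soft prompt, not the official EPT code."""
    def __init__(self, short_len=20, hidden=768, rank=8, num_spaces=4):
        super().__init__()
        self.short_prompt = nn.Parameter(torch.randn(short_len, hidden) * 0.02)
        # Two low-rank matrices that enrich the short prompt.
        self.lora_a = nn.Parameter(torch.randn(short_len, rank) * 0.02)
        self.lora_b = nn.Parameter(torch.zeros(rank, hidden))
        # Projections into multiple subspaces plus a gate over them.
        self.spaces = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_spaces)])
        self.gate = nn.Linear(hidden, num_spaces)

    def forward(self) -> torch.Tensor:
        prompt = self.short_prompt + self.lora_a @ self.lora_b      # (short_len, hidden)
        projected = torch.stack([p(prompt) for p in self.spaces])   # (spaces, short_len, hidden)
        weights = self.gate(prompt).softmax(-1)                     # (short_len, spaces)
        fused = (weights.T.unsqueeze(-1) * projected).sum(dim=0)    # (short_len, hidden)
        return fused  # soft prompt embeddings to prepend to the model input

# prompt_embeds = EfficientPromptSketch()()   # -> tensor of shape (20, 768)
```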
+
+
+
+
+ + ♻ ☆ Assessing Logical Reasoning Capabilities of Encoder-Only Transformer + Models + + +
+ Logical reasoning is central to complex human activities, such as thinking, +debating, and planning; it is also a central component of many AI systems as +well. In this paper, we investigate the extent to which encoder-only +transformer language models (LMs) can reason according to logical rules. We ask +whether those LMs can deduce theorems in propositional calculus and first-order +logic; if their relative success in these problems reflects general logical +capabilities; and which layers contribute the most to the task. First, we show +for several encoder-only LMs that they can be trained, to a reasonable degree, +to determine logical validity on various datasets. Next, by cross-probing +fine-tuned models on these datasets, we show that LMs have difficulty in +transferring their putative logical reasoning ability, which suggests that they +may have learned dataset-specific features, instead of a general capability. +Finally, we conduct a layerwise probing experiment, which shows that the +hypothesis classification task is mostly solved through higher layers. + +
+
+
+
+
+ + ♻ ☆ On the Hardness of Faithful Chain-of-Thought Reasoning in Large Language + Models + + +
+ As Large Language Models (LLMs) are increasingly being employed in real-world +applications in critical domains such as healthcare, it is important to ensure +that the Chain-of-Thought (CoT) reasoning generated by these models faithfully +captures their underlying behavior. + While LLMs are known to generate CoT reasoning that is appealing to humans, +prior studies have shown that these explanations do not accurately reflect the +actual behavior of the underlying LLMs. In this work, we explore the promise of +three broad approaches commonly employed to steer the behavior of LLMs to +enhance the faithfulness of the CoT reasoning generated by LLMs: in-context +learning, fine-tuning, and activation editing. Specifically, we introduce novel +strategies for in-context learning, fine-tuning, and activation editing aimed +at improving the faithfulness of the CoT reasoning. We then carry out extensive +empirical analyses with multiple benchmark datasets to explore the promise of +these strategies. Our analyses indicate that these strategies offer limited +success in improving the faithfulness of the CoT reasoning, with only slight +performance enhancements in controlled scenarios. Activation editing +demonstrated minimal success, while fine-tuning and in-context learning +achieved marginal improvements that failed to generalize across diverse +reasoning and truthful question-answering benchmarks. In summary, our work +underscores the inherent difficulty in eliciting faithful CoT reasoning from +LLMs, suggesting that the current array of approaches may not be sufficient to +address this complex challenge. + +
+
+
+
+
+ + ♻ ☆ First-Step Advantage: Importance of Starting Right in Multi-Step Math + Reasoning + + +
+ Language models can solve complex reasoning tasks better by learning to +generate rationales for their predictions. Often these models know how to solve +a task but their auto-regressive decoding nature leads to incorrect results if +they start incorrectly. We observe that smaller models in particular, when +corrected, can solve a task that they would otherwise have struggled with. We +demonstrate this phenomenon by using a larger model to guide smaller models, +which leads to significantly improved performance (up to +24 points on the +GSM8K dataset by 7B models). To assist smaller models in initiating the +starting step, we propose QuestCoT, where a smaller model first asks itself how +to start, before proceeding with a chain of reasoning. On various multi-step +mathematical reasoning datasets over multiple smaller models, we show that +getting the right start can lead to significant performance gains across all +models (gains of up to +6 points on GSM8K, +9 on SVAMP, +5 on ASDiv, and +7 on +MultiArith). + +
+
+
+
+
+ + ♻ ☆ Model Generation with LLMs: From Requirements to UML Sequence Diagrams + + +
+ Complementing natural language (NL) requirements with graphical models can +improve stakeholders' communication and provide directions for system design. +However, creating models from requirements involves manual effort. The advent +of generative large language models (LLMs), ChatGPT being a notable example, +offers promising avenues for automated assistance in model generation. This +paper investigates the capability of ChatGPT to generate a specific type of +model, i.e., UML sequence diagrams, from NL requirements. We conduct a +qualitative study in which we examine the sequence diagrams generated by +ChatGPT for 28 requirements documents of various types and from different +domains. Observations from the analysis of the generated diagrams have +systematically been captured through evaluation logs, and categorized through +thematic analysis. Our results indicate that, although the models generally +conform to the standard and exhibit a reasonable level of understandability, +their completeness and correctness with respect to the specified requirements +often present challenges. This issue is particularly pronounced in the presence +of requirements smells, such as ambiguity and inconsistency. The insights +derived from this study can influence the practical utilization of LLMs in the +RE process, and open the door to novel RE-specific prompting strategies +targeting effective model generation. + +
+
+
+
+
+ + ♻ ☆ Recovering the Pre-Fine-Tuning Weights of Generative Models ICML 2024 + + +
+ The dominant paradigm in generative modeling consists of two steps: i) +pre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained +model with human values via fine-tuning. This practice is considered safe, as +no current method can recover the unsafe, pre-fine-tuning model weights. In +this paper, we demonstrate that this assumption is often false. Concretely, we +present Spectral DeTuning, a method that can recover the weights of the +pre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In +contrast to previous attacks that attempt to recover pre-fine-tuning +capabilities, our method aims to recover the exact pre-fine-tuning weights. Our +approach exploits this new vulnerability against large-scale models such as a +personalized Stable Diffusion and an aligned Mistral. + +
+
+ comment: ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/ +
+
+
+
+
+ + ♻ ☆ Model Internals-based Answer Attribution for Trustworthy + Retrieval-Augmented Generation + + +
+ Ensuring the verifiability of model answers is a fundamental challenge for +retrieval-augmented generation (RAG) in the question answering (QA) domain. +Recently, self-citation prompting was proposed to make large language models +(LLMs) generate citations to supporting documents along with their answers. +However, self-citing LLMs often struggle to match the required format, refer to +non-existent sources, and fail to faithfully reflect LLMs' context usage +throughout the generation. In this work, we present MIRAGE --Model +Internals-based RAG Explanations -- a plug-and-play approach using model +internals for faithful answer attribution in RAG applications. MIRAGE detects +context-sensitive answer tokens and pairs them with retrieved documents +contributing to their prediction via saliency methods. We evaluate our proposed +approach on a multilingual extractive QA dataset, finding high agreement with +human answer attribution. On open-ended QA, MIRAGE achieves citation quality +and efficiency comparable to self-citation while also allowing for a +finer-grained control of attribution parameters. Our qualitative evaluation +highlights the faithfulness of MIRAGE's attributions and underscores the +promising application of model internals for RAG answer attribution. + +
+
+ comment: Under review. Code and data released at + https://github.com/Betswish/MIRAGE +
+
+
+
+
+ + ♻ ☆ Paraphrase Types for Generation and Detection EMNLP 2023 + + +
+ Current approaches in paraphrase generation and detection heavily rely on a +single general similarity score, ignoring the intricate linguistic properties +of language. This paper introduces two new tasks to address this shortcoming by +considering paraphrase types - specific linguistic perturbations at particular +text positions. We name these tasks Paraphrase Type Generation and Paraphrase +Type Detection. Our results suggest that while current techniques perform well +in a binary classification scenario, i.e., paraphrased or not, the inclusion of +fine-grained paraphrase types poses a significant challenge. While most +approaches are good at generating and detecting generally semantically similar +content, they fail to understand the intrinsic linguistic variables they +manipulate. Models trained to generate and identify paraphrase types also +show improvements on tasks without them. In addition, scaling these models +further improves their ability to understand paraphrase types. We believe +paraphrase types can unlock a new paradigm for developing paraphrase models and +solving tasks in the future. + +
+
+ comment: Published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ We are Who We Cite: Bridges of Influence Between Natural Language + Processing and Other Academic Fields EMNLP 2023 + + +
+ Natural Language Processing (NLP) is poised to substantially influence the +world. However, significant progress comes hand-in-hand with substantial risks. +Addressing them requires broad engagement with various fields of study. Yet, +little empirical work examines the state of such engagement (past or current). +In this paper, we quantify the degree of influence between 23 fields of study +and NLP (on each other). We analyzed ~77k NLP papers, ~3.1m citations from NLP +papers to other papers, and ~1.8m citations from other papers to NLP papers. We +show that, unlike most fields, the cross-field engagement of NLP, measured by +our proposed Citation Field Diversity Index (CFDI), has declined from 0.58 in +1980 to 0.31 in 2022 (an all-time low). In addition, we find that NLP has grown +more insular -- citing increasingly more NLP papers and having fewer papers +that act as bridges between fields. NLP citations are dominated by computer +science; Less than 8% of NLP citations are to linguistics, and less than 3% are +to math and psychology. These findings underscore NLP's urgent need to reflect +on its engagement with various fields. + +
+
+ comment: Published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ The Elephant in the Room: Analyzing the Presence of Big Tech in Natural + Language Processing Research ACL 2023 + + +
+ Recent advances in deep learning methods for natural language processing +(NLP) have created new business opportunities and made NLP research critical +for industry development. Since industry is one of the big players in the field of NLP, +together with governments and universities, it is important to track its +influence on research. In this study, we seek to quantify and +characterize industry presence in the NLP community over time. Using a corpus +with comprehensive metadata of 78,187 NLP publications and 701 resumes of NLP +publication authors, we explore the industry presence in the field since the +early 90s. We find that industry presence among NLP authors has been steady +before a steep increase over the past five years (180% growth from 2017 to +2022). A few companies account for most of the publications and provide funding +to academic researchers through grants and internships. Our study shows that +the presence and impact of the industry on natural language processing research +are significant and fast-growing. This work calls for increased transparency of +industry influence in the field. + +
+
+ comment: Published at ACL 2023 +
+
+
+
+
+ + ♻ ☆ Performance of large language models in numerical vs. semantic medical + knowledge: Benchmarking on evidence-based Q&As + + +
+ Clinical problem-solving requires processing of semantic medical knowledge +such as illness scripts and numerical medical knowledge of diagnostic tests for +evidence-based decision-making. As large language models (LLMs) show promising +results in many aspects of language-based clinical practice, their ability to +generate non-language evidence-based answers to clinical questions is +inherently limited by tokenization. Therefore, we evaluated LLMs' performance +on two question types: numeric (correlating findings) and semantic +(differentiating entities) while examining differences within and between LLMs +in medical aspects and comparing their performance to humans. To generate +straightforward multi-choice questions and answers (QAs) based on +evidence-based medicine (EBM), we used a comprehensive medical knowledge graph +(encompassing data from more than 50,000 peer-reviewed articles) and created the +"EBMQA". EBMQA contains 105,000 QAs labeled with medical and non-medical topics +and classified into numerical or semantic questions. We benchmarked this +dataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and +Claude3-Opus. We evaluated the LLMs' accuracy on semantic and numerical question +types and according to sub-labeled topics. For validation, six medical experts +were tested on 100 numerical EBMQA questions. We found that both LLMs excelled +more in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical +QAs. However, both LLMs showed inter- and intra-model gaps in different medical +aspects and remained inferior to humans. Thus, their medical advice should be +treated with caution. + +
+
+
+
+
+ + ♻ ☆ ProTrix: Building Models for Planning and Reasoning over Tables with + Sentence Context + + +
+ Tables play a crucial role in conveying information in various domains. We +propose a Plan-then-Reason framework to answer different types of user queries +over tables with sentence context. The framework first plans the reasoning +paths over the context, then assigns each step to program-based or textual +reasoning to reach the final answer. This framework enhances the table +reasoning abilities for both in-context learning and fine-tuning methods. +GPT-3.5-Turbo following the Plan-then-Reason framework surpasses other prompting +baselines without self-consistency while using fewer API calls and in-context +demonstrations. We also construct an instruction tuning set TrixInstruct to +evaluate the effectiveness of fine-tuning with this framework. We present +the ProTrix model family by finetuning models on TrixInstruct. Our experiments show +that the ProTrix family generalizes to diverse unseen tabular tasks with only 6k +training instances. We further demonstrate that ProTrix can generate accurate +and faithful explanations to answer complex free-form questions. Our work +underscores the importance of planning and reasoning abilities for building +generalizable and interpretable models for tabular tasks. We +open-source our dataset and models at https://github.com/WilliamZR/ProTrix. + +
+
+
+
+
+ + ♻ ☆ Climate Change from Large Language Models + + +
+ Climate change poses grave challenges, demanding widespread understanding and +low-carbon lifestyle awareness. Large language models (LLMs) offer a powerful +tool to address this crisis, yet comprehensive evaluations of their +climate-crisis knowledge are lacking. This paper proposes an automated +evaluation framework to assess climate-crisis knowledge within LLMs. We adopt a +hybrid approach for data acquisition, combining data synthesis and manual +collection, to compile a diverse set of questions encompassing various aspects +of climate change. Utilizing prompt engineering based on the compiled +questions, we evaluate the model's knowledge by analyzing its generated +answers. Furthermore, we introduce a comprehensive set of metrics to assess +climate-crisis knowledge, encompassing indicators from 10 distinct +perspectives. These metrics provide a multifaceted evaluation, enabling a +nuanced understanding of the LLMs' climate crisis comprehension. The +experimental results demonstrate the efficacy of our proposed method. In our +evaluation utilizing diverse high-performing LLMs, we discovered that while +LLMs possess considerable climate-related knowledge, there are shortcomings in +terms of timeliness, indicating a need for continuous updating and refinement +of their climate-related content. + +
+
+
+
+
+ + ♻ ☆ Improving Retrieval Augmented Open-Domain Question-Answering with + Vectorized Contexts ACL2023 + + +
+ In the era of large language models, applying techniques such as Retrieval +Augmented Generation can better address Open-Domain Question-Answering +problems. Due to constraints including model sizes and computing resources, the +length of context is often limited, and it becomes challenging to empower the +model to cover overlong contexts while answering questions from open domains. +This paper proposes a general and convenient method to cover longer contexts +in Open-Domain Question-Answering tasks. It leverages a small encoder language +model that effectively encodes contexts, and the encoded representations are +combined with the original inputs via cross-attention. With our method, the original +language models can cover several times longer contexts while keeping the computing +requirements close to the baseline. Our experiments demonstrate that after +fine-tuning, there is improved performance across two held-in datasets, four +held-out datasets, and also in two In-Context Learning settings. + +
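A minimal sketch of the idea: a small encoder turns the long retrieved context into vectors, and the main model cross-attends to them, so the context no longer consumes the main model's input length. Sizes and the residual wiring are illustrative assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class ContextCrossAttention(nn.Module):
    """Fuse encoded context into the original model's hidden states."""
    def __init__(self, hidden=768, n_heads=12):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(hidden, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(hidden)

    def forward(self, decoder_states, encoded_context):
        # decoder_states:  (batch, query_len, hidden) from the original input
        # encoded_context: (batch, context_len, hidden) from the small encoder
        attended, _ = self.cross_attn(decoder_states, encoded_context, encoded_context)
        return self.norm(decoder_states + attended)

# fused = ContextCrossAttention()(torch.randn(2, 16, 768), torch.randn(2, 512, 768))
```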
+
+ comment: ACL2023 Findings +
+
+
+
+
+ + ♻ ☆ Mimicking User Data: On Mitigating Fine-Tuning Risks in Closed Large + Language Models + + +
+ Fine-tuning large language models on small, high-quality datasets can enhance +their performance on specific downstream tasks. Recent research shows that +fine-tuning on benign, instruction-following data can inadvertently undo the +safety alignment process and increase a model's propensity to comply with +harmful queries. Although critical, understanding and mitigating safety risks +in well-defined tasks remains distinct from the instruction-following context +due to structural differences in the data. Our work addresses the gap in our +understanding of these risks across diverse types of data in closed models - +where providers control how user data is utilized in the fine-tuning process. +We demonstrate how malicious actors can subtly manipulate the structure of +almost any task-specific dataset to foster significantly more dangerous model +behaviors, while maintaining an appearance of innocuity and reasonable +downstream task performance. To address this issue, we propose a novel +mitigation strategy that mixes in safety data which mimics the task format and +prompting style of the user data, showing this is more effective than existing +baselines at re-establishing safety alignment while maintaining similar task +performance. + +
+
+
+
+
+ + ♻ ☆ The Potential and Challenges of Evaluating Attitudes, Opinions, and + Values in Large Language Models + + +
+ Recent advances in Large Language Models (LLMs) have sparked wide interest in +validating and comprehending the human-like cognitive-behavioral traits LLMs +may have. These cognitive-behavioral traits typically include Attitudes, +Opinions, and Values (AOV). However, measuring AOV embedded within LLMs remains +opaque, and different evaluation methods may yield different results. This has +led to a lack of clarity on how different studies are related to each other and +how they can be interpreted. This paper aims to bridge this gap by providing an +overview of recent works on the evaluation of AOV in LLMs. Moreover, we survey +related approaches in different stages of the evaluation pipeline in these +works. By doing so, we address the potential and challenges with respect to +understanding the model, human-AI alignment, and downstream applications in the +social sciences. Finally, we provide practical insights into evaluation +methods, model enhancement, and interdisciplinary collaboration, thereby +contributing to the evolving landscape of evaluating AOV in LLMs. + +
+
+
+
+
+ + ♻ ☆ CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay ICML'24 + + +
+ Large language models are increasingly solving tasks that are commonly +believed to require human-level reasoning ability. However, these models still +perform very poorly on benchmarks of general intelligence such as the +Abstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a +programming-by-examples problem, and introduce a novel and scalable method for +language model self-improvement called Code Iteration (CodeIt). Our method +iterates between 1) program sampling and hindsight relabeling, and 2) learning +from prioritized experience replay. By relabeling the goal of an episode (i.e., +the target program output given input) to the realized output produced by the +sampled program, our method effectively deals with the extreme sparsity of +rewards in program synthesis. Applying CodeIt to the ARC dataset, we +demonstrate that prioritized hindsight replay, along with pre-training and +data-augmentation, leads to successful inter-task generalization. CodeIt is the +first neuro-symbolic approach that scales to the full ARC evaluation dataset. +Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art +performance and outperforming existing neural and symbolic baselines. Our code +is available at https://github.com/Qualcomm-AI-research/codeit . + +
+
+ comment: ICML'24 camera-ready version +
+
+
+
+
+ + ♻ ☆ CoCoST: Automatic Complex Code Generation with Online Searching and + Correctness Testing + + +
+ Large Language Models have revolutionized code generation by +converting natural language descriptions into executable code. However, +generating complex code within real-world scenarios remains challenging due to +intricate structures, subtle bugs, understanding of advanced data types, and +lack of supplementary content. To address these challenges, we introduce the +CoCoST framework, which enhances complex code generation by online searching +for more information with planned queries and correctness testing for code +refinement. Moreover, CoCoST serializes the complex inputs and outputs to +improve comprehension and generates test cases to ensure adaptability to +real-world applications. CoCoST is validated through rigorous experiments on +the DS-1000 and ClassEval datasets. Experimental results show that CoCoST +substantially improves the quality of complex code generation, highlighting its +potential to enhance the practicality of LLMs in generating complex code. + +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Large Language Models in Computational + Argumentation ACL 2024 + + +
+ Computational argumentation has become an essential tool in various domains, +including law, public policy, and artificial intelligence. It is an emerging +research field in natural language processing that attracts increasing +attention. Research on computational argumentation mainly involves two types of +tasks: argument mining and argument generation. As large language models (LLMs) +have demonstrated impressive capabilities in understanding context and +generating natural language, it is worthwhile to evaluate the performance of +LLMs on diverse computational argumentation tasks. This work aims to embark on +an assessment of LLMs, such as ChatGPT, Flan models, and LLaMA2 models, in both +zero-shot and few-shot settings. We organize existing tasks into six main +categories and standardize the format of fourteen openly available datasets. In +addition, we present a new benchmark dataset on counter speech generation that +aims to holistically evaluate the end-to-end performance of LLMs on argument +mining and argument generation. Extensive experiments show that LLMs exhibit +commendable performance across most of the datasets, demonstrating their +capabilities in the field of argumentation. Our analysis offers valuable +suggestions for evaluating computational argumentation and its integration with +LLMs in future research endeavors. + +
+
+ comment: Accepted at ACL 2024 Main +
+
+
+
+
+ + ♻ ☆ Compress to Impress: Unleashing the Potential of Compressive Memory in + Real-World Long-Term Conversations + + +
+ Existing retrieval-based methods have made significant strides in maintaining +long-term conversations. However, these approaches face challenges in memory +database management and accurate memory retrieval, hindering their efficacy in +dynamic, real-world interactions. This study introduces a novel framework, +COmpressive Memory-Enhanced Dialogue sYstems (COMEDY), which eschews +traditional retrieval modules and memory databases. Instead, COMEDY adopts a +"One-for-All" approach, utilizing a single language model to manage memory +generation, compression, and response generation. Central to this framework is +the concept of compressive memory, which integrates session-specific +summaries, user-bot dynamics, and past events into a concise memory format. To +support COMEDY, we curated a large-scale Chinese instruction-tuning dataset, +Dolphin, derived from real user-chatbot interactions. Comparative evaluations +demonstrate COMEDY's superiority over traditional retrieval-based methods in +producing more nuanced and human-like conversational experiences. Our code is +available at https://github.com/nuochenpku/COMEDY. + +
+
+ comment: 17pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Textual Similarity as a Key Metric in Machine Translation Quality + Estimation + + +
+ Machine Translation (MT) Quality Estimation (QE) assesses translation +reliability without reference texts. This study introduces "textual similarity" +as a new metric for QE, using sentence transformers and cosine similarity to +measure semantic closeness. Analyzing data from the MLQE-PE dataset, we found +that textual similarity exhibits stronger correlations with human scores than +traditional metrics (hter, model evaluation, sentence probability, etc.). +Employing GAMMs as a statistical tool, we demonstrated that textual similarity +consistently outperforms other metrics across multiple language pairs in +predicting human scores. We also found that "hter" actually failed to predict +human scores in QE. Our findings highlight the effectiveness of textual +similarity as a robust QE metric, recommending its integration with other +metrics into QE frameworks and MT system training for improved accuracy and +usability. + +
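The metric is straightforward to reproduce in spirit: embed the source segment and the MT output with a multilingual sentence transformer and take their cosine similarity. The specific encoder below is an assumption, not necessarily the one used in the study.

```python
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer("sentence-transformers/LaBSE")  # multilingual encoder (assumed)

def textual_similarity(source: str, translation: str) -> float:
    """Reference-free QE signal: cosine similarity of source and MT embeddings."""
    src_emb, mt_emb = encoder.encode([source, translation], convert_to_tensor=True)
    return util.cos_sim(src_emb, mt_emb).item()

# textual_similarity("Das Wetter ist heute schoen.", "The weather is nice today.")
```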
+
+
+
+
+ + ♻ ☆ Revealing User Familiarity Bias in Task-Oriented Dialogue via + Interactive Evaluation ACL 2024 + + +
+ Most task-oriented dialogue (TOD) benchmarks assume users that know exactly +how to use the system by constraining the user behaviors within the system's +capabilities via strict user goals, namely "user familiarity" bias. This data +bias deepens when it combines with data-driven TOD systems, as it is impossible +to fathom the effect of it with existing static evaluations. Hence, we conduct +an interactive user study to unveil how vulnerable TOD systems are against +realistic scenarios. In particular, we compare users with 1) detailed goal +instructions that conform to the system boundaries (closed-goal) and 2) vague +goal instructions that are often unsupported but realistic (open-goal). Our +study reveals that conversations in open-goal settings lead to catastrophic +failures of the system, in which 92% of the dialogues had significant issues. +Moreover, we conduct a thorough analysis to identify distinctive features +between the two settings through error annotation. From this, we discover a +novel "pretending" behavior, in which the system pretends to handle the user +requests even though they are beyond the system's capabilities. We discuss its +characteristics and toxicity while showing recent large language models can +also suffer from this behavior. + +
+
+ comment: NLP4ConvAI Workshop at ACL 2024 +
+
+
+
+
+ + ♻ ☆ GraphWiz: An Instruction-Following Language Model for Graph Problems + + +
+ Large language models (LLMs) have achieved impressive success across several +fields, but their proficiency in understanding and resolving complex graph +problems is less explored. To bridge this gap, we introduce GraphInstruct, a +novel and comprehensive instruction-tuning dataset designed to equip language +models with the ability to tackle a broad spectrum of graph problems using +explicit reasoning paths. Utilizing GraphInstruct, we build GraphWiz, an +open-source language model capable of resolving various graph problem types +while generating clear reasoning processes. To enhance the model's capability +and reliability, we incorporate the Direct Preference Optimization (DPO) +framework into the graph problem-solving context. The enhanced model, +GraphWiz-DPO, achieves an average accuracy of 65% across nine tasks with +different complexity levels, surpassing GPT-4 which has an average accuracy of +43.8%. Moreover, our research delves into the delicate balance between training +data volume and model performance, highlighting the potential for overfitting +with increased data. We also explore the transferability of the model's +reasoning ability across different graph tasks, indicating the model's +adaptability and practical application potential. Our investigation offers a +new blueprint and valuable insights for developing LLMs specialized in graph +reasoning and problem-solving. + +
+
+ comment: 27pages, 15 tables +
+
+
+
+
+ + ♻ ☆ How Reliable Are Automatic Evaluation Methods for Instruction-Tuned + LLMs? + + +
+ Work on instruction-tuned Large Language Models (LLMs) has used automatic +methods based on text overlap and LLM judgments as cost-effective alternatives +to human evaluation. In this paper, we perform a meta-evaluation of such +methods and assess their reliability across a broad range of tasks. We observe +that while automatic evaluation methods can approximate human ratings under +specific conditions, their validity is highly context-dependent. Specifically, +the simple ROUGE-L metric correlates well with human ratings for short-answer +English tasks but is unreliable in free-form generation tasks and cross-lingual +transfer. The effectiveness of the more advanced method of using GPT-4 as a +judge diminishes significantly if reference answers are not included in the +prompt, which is the scenario where this method has the potential to provide +the most value compared to other metrics. Our findings enhance the +understanding of how automatic methods should be applied and interpreted when +developing and evaluating instruction-tuned LLMs. + +
+
+
+
+
+ + ♻ ☆ Is one brick enough to break the wall of spoken dialogue state tracking? + + +
+ In Task-Oriented Dialogue (TOD) systems, correctly updating the system's +understanding of the user's requests (\textit{a.k.a.} dialogue state tracking) +is key to a smooth interaction. Traditionally, TOD systems perform this update +in three steps: transcription of the user's utterance, semantic extraction of +the key concepts, and contextualization with the previously identified +concepts. Such cascade approaches suffer from cascading errors and separate +optimization. End-to-End approaches have been proven helpful up to the +turn-level semantic extraction step. This paper goes one step further and +provides (1) a novel approach for completely neural spoken DST, (2) an in-depth +comparison with a state-of-the-art cascade approach, and (3) avenues towards +better context propagation. Our study highlights that jointly-optimized +approaches are also competitive for contextually dependent tasks, such as +Dialogue State Tracking (DST), especially in audio-native settings. Context +propagation in DST systems could benefit from training procedures that account +for the inherent uncertainty of the previous context. + +
+
+
+
+
+ + ♻ ☆ Evaluating Copyright Takedown Methods for Language Models + + +
+ Language models (LMs) derive their capabilities from extensive training on +diverse data, including potentially copyrighted material. These models can +memorize and generate content similar to their training data, posing potential +concerns. Therefore, model creators are motivated to develop mitigation methods +that prevent generating protected content. We term this procedure copyright +takedowns for LMs, noting the conceptual similarity to (but legal distinction +from) the DMCA takedown. This paper introduces the first evaluation of the +feasibility and side effects of copyright takedowns for LMs. We propose +CoTaEval, an evaluation framework to assess the effectiveness of copyright +takedown methods, the impact on the model's ability to retain uncopyrightable +factual knowledge from the training data whose recitation is embargoed, and how +well the model maintains its general utility and efficiency. We examine several +strategies, including adding system prompts, decoding-time filtering +interventions, and unlearning approaches. Our findings indicate that no tested +method excels across all metrics, showing significant room for research in this +unique problem setting and indicating potential unresolved challenges for live +policy proposals. + +
+
+ comment: 31 pages, 9 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank + Modifications + + +
+ Large language models (LLMs) show inherent brittleness in their safety +mechanisms, as evidenced by their susceptibility to jailbreaking and even +non-malicious fine-tuning. This study explores this brittleness of safety +alignment by leveraging pruning and low-rank modifications. We develop methods +to identify critical regions that are vital for safety guardrails, and that are +disentangled from utility-relevant regions at both the neuron and rank levels. +Surprisingly, the isolated regions we find are sparse, comprising about $3\%$ +at the parameter level and $2.5\%$ at the rank level. Removing these regions +compromises safety without significantly impacting utility, corroborating the +inherent brittleness of the model's safety mechanisms. Moreover, we show that +LLMs remain vulnerable to low-cost fine-tuning attacks even when modifications +to the safety-critical regions are restricted. These findings underscore the +urgent need for more robust safety strategies in LLMs. + +
+
+ comment: 22 pages, 9 figures. Project page is available at + https://boyiwei.com/alignment-attribution/ +
+
+
+
+
+ + ♻ ☆ Benchmarking Mental State Representations in Language Models ICML 2024 + + +
+ While numerous works have assessed the generative performance of language +models (LMs) on tasks requiring Theory of Mind reasoning, research into the +models' internal representation of mental states remains limited. Recent work +has used probing to demonstrate that LMs can represent beliefs of themselves +and others. However, these claims are accompanied by limited evaluation, making +it difficult to assess how mental state representations are affected by model +design and training choices. We report an extensive benchmark with various LM +types with different model sizes, fine-tuning approaches, and prompt designs to +study the robustness of mental state representations and memorisation issues +within the probes. Our results show that the quality of models' internal +representations of the beliefs of others increases with model size and, more +crucially, with fine-tuning. We are the first to study how prompt variations +impact probing performance on theory of mind tasks. We demonstrate that models' +representations are sensitive to prompt variations, even when such variations +should be beneficial. Finally, we complement previous activation editing +experiments on Theory of Mind tasks and show that it is possible to improve +models' reasoning performance by steering their activations without the need to +train any probe. + +
+
+ comment: ICML 2024 Workshop on Mechanistic Interpretability +
+
+
+
+
+ + ♻ ☆ SeaLLMs -- Large Language Models for Southeast Asia ACL 2024 + + +
+ Despite the remarkable achievements of large language models (LLMs) in +various tasks, there remains a linguistic bias that favors high-resource +languages, such as English, often at the expense of low-resource and regional +languages. To address this imbalance, we introduce SeaLLMs, an innovative +series of language models that specifically focuses on Southeast Asian (SEA) +languages. SeaLLMs are built upon the Llama-2 model and further advanced +through continued pre-training with an extended vocabulary, specialized +instruction and alignment tuning to better capture the intricacies of regional +languages. This allows them to respect and reflect local cultural norms, +customs, stylistic preferences, and legal considerations. Our comprehensive +evaluation demonstrates that SeaLLM-13b models exhibit superior performance +across a wide spectrum of linguistic tasks and assistant-style +instruction-following capabilities relative to comparable open-source models. +Moreover, they outperform ChatGPT-3.5 in non-Latin languages, such as Thai, +Khmer, Lao, and Burmese, by large margins while remaining lightweight and +cost-effective to operate. + +
+
+ comment: Technical report, ACL 2024 DEMO TRACK +
+
+
+
+
+ + ♻ ☆ RouteLLM: Learning to Route LLMs with Preference Data + + +
+ Large language models (LLMs) exhibit impressive capabilities across a wide +range of tasks, yet the choice of which model to use often involves a trade-off +between performance and cost. More powerful models, though effective, come with +higher expenses, while less capable models are more cost-effective. To address +this dilemma, we propose several efficient router models that dynamically +select between a stronger and a weaker LLM during inference, aiming to optimize +the balance between cost and response quality. We develop a training framework +for these routers leveraging human preference data and data augmentation +techniques to enhance performance. Our evaluation on widely-recognized +benchmarks shows that our approach significantly reduces costs, by over 2 times +in certain cases, without compromising the quality of responses. Interestingly, +our router models also demonstrate significant transfer learning capabilities, +maintaining their performance even when the strong and weak models are changed +at test time. This highlights the potential of these routers to provide a +cost-effective yet high-performance solution for deploying LLMs. + +
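An illustrative sketch of the routing decision: a learned scorer estimates how likely the weaker model's answer is to be acceptable, and a threshold trades cost against quality. The heuristic scorer below is only a placeholder for the preference-trained routers described in the abstract.

```python
from dataclasses import dataclass
from typing import Callable

@dataclass
class Router:
    win_prob: Callable[[str], float]  # P(weak model suffices | query), learned offline
    threshold: float = 0.7            # higher values route more traffic to the strong model

    def route(self, query: str) -> str:
        return "weak" if self.win_prob(query) >= self.threshold else "strong"

# Placeholder scorer standing in for a preference-trained router:
router = Router(win_prob=lambda q: 0.9 if len(q.split()) < 30 else 0.3)
# router.route("What is the capital of France?")  -> "weak"
```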
+
+
+
+
+ + ♻ ☆ KoLA: Carefully Benchmarking World Knowledge of Large Language Models ICLR 2024 + + +
+ The unprecedented performance of large language models (LLMs) necessitates +improvements in evaluations. Rather than merely exploring the breadth of LLM +abilities, we believe meticulous and thoughtful designs are essential to +thorough, unbiased, and applicable evaluations. Given the importance of world +knowledge to LLMs, we construct a Knowledge-oriented LLM Assessment benchmark +(KoLA), in which we carefully design three crucial factors: (1) For +\textbf{ability modeling}, we mimic human cognition to form a four-level +taxonomy of knowledge-related abilities, covering $19$ tasks. (2) For +\textbf{data}, to ensure fair comparisons, we use both Wikipedia, a corpus +prevalently pre-trained by LLMs, along with continuously collected emerging +corpora, aiming to evaluate the capacity to handle unseen data and evolving +knowledge. (3) For \textbf{evaluation criteria}, we adopt a contrastive system, +including overall standard scores for better numerical comparability across +tasks and models and a unique self-contrast metric for automatically evaluating +knowledge-creating ability. We evaluate $28$ open-source and commercial LLMs +and obtain some intriguing findings. The KoLA dataset and open-participation +leaderboard are publicly released at https://kola.xlore.cn and will be +continuously updated to provide references for developing LLMs and +knowledge-related systems. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ WaterBench: Towards Holistic Evaluation of Watermarks for Large Language + Models ACL 2024 + + +
+ To mitigate the potential misuse of large language models (LLMs), recent +research has developed watermarking algorithms, which restrict the generation +process to leave an invisible trace for watermark detection. Due to the +two-stage nature of the task, most studies evaluate the generation and +detection separately, thereby presenting a challenge in unbiased, thorough, and +applicable evaluations. In this paper, we introduce WaterBench, the first +comprehensive benchmark for LLM watermarks, in which we design three crucial +factors: (1) For benchmarking procedure, to ensure an apples-to-apples +comparison, we first adjust each watermarking method's hyper-parameter to reach +the same watermarking strength, then jointly evaluate their generation and +detection performance. (2) For task selection, we diversify the input and +output length to form a five-category taxonomy, covering $9$ tasks. (3) For +evaluation metric, we adopt the GPT4-Judge for automatically evaluating the +decline of instruction-following abilities after watermarking. We evaluate $4$ +open-source watermarks on $2$ LLMs under $2$ watermarking strengths and observe +the common struggles for current methods on maintaining the generation quality. +The code and data are available at https://github.com/THU-KEG/WaterBench. + +
+
+ comment: 26pages, 7 figures, accepted by ACL 2024 +
+
+
+
+
+ + ♻ ☆ Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM + Collaboration ACL 2024 + + +
+ Despite efforts to expand the knowledge of large language models (LLMs), +knowledge gaps -- missing or outdated information in LLMs -- might always +persist given the evolving nature of knowledge. In this work, we study +approaches to identify LLM knowledge gaps and abstain from answering questions +when knowledge gaps are present. We first adapt existing approaches to model +calibration or adaptation through fine-tuning/prompting and analyze their +ability to abstain from generating low-confidence outputs. Motivated by their +failures in self-reflection and over-reliance on held-out sets, we propose two +novel approaches that are based on model collaboration, i.e., LLMs probing +other LLMs for knowledge gaps, either cooperatively or competitively. Extensive +experiments with three LLMs on four QA tasks featuring diverse knowledge +domains demonstrate that both cooperative and competitive approaches to +unveiling LLM knowledge gaps achieve up to 19.3% improvements on abstain +accuracy against the strongest baseline. Further analysis reveals that our +proposed mechanisms could help identify failure cases in retrieval augmentation +and pinpoint knowledge gaps in multi-hop reasoning. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Action Controlled Paraphrasing + + +
+ Recent studies have demonstrated the potential to control paraphrase +generation, such as through syntax, which has broad applications in various +downstream tasks. However, these methods often require detailed parse trees or +syntactic exemplars, countering human-like paraphrasing behavior in language +use. Furthermore, an inference gap exists, as control specifications are only +available during training but not during inference. In this work, we propose a +new setup for controlled paraphrase generation. Specifically, we represent user +intent as action tokens, embedding and concatenating them with text embeddings, +thus flowing together into a self-attention encoder for representation fusion. +To address the inference gap, we introduce an optional action token as a +placeholder that encourages the model to determine the appropriate action +independently when users' intended actions are not provided. Experimental +results show that our method successfully enables precise action-controlled +paraphrasing and preserves or even enhances performance compared to +conventional uncontrolled methods when actions are not given. Our findings +promote the concept of action-controlled paraphrasing for a more user-centered +design. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Quantifying Language Models' Sensitivity to Spurious Features in Prompt + Design or: How I learned to start worrying about prompt formatting ICLR 2024 + + +
+ As large language models (LLMs) are adopted as a fundamental component of +language technologies, it is crucial to accurately characterize their +performance. Because choices in prompt design can strongly influence model +behavior, this design process is critical in effectively using any modern +pre-trained generative language model. In this work, we focus on LLM +sensitivity to a quintessential class of meaning-preserving design choices: +prompt formatting. We find that several widely used open-source LLMs are +extremely sensitive to subtle changes in prompt formatting in few-shot +settings, with performance differences of up to 76 accuracy points when +evaluated using LLaMA-2-13B. Sensitivity remains even when increasing model +size, the number of few-shot examples, or performing instruction tuning. Our +analysis suggests that work evaluating LLMs with prompting-based methods would +benefit from reporting a range of performance across plausible prompt formats, +instead of the currently-standard practice of reporting performance on a single +format. We also show that format performance only weakly correlates between +models, which puts into question the methodological validity of comparing +models with an arbitrarily chosen, fixed prompt format. To facilitate +systematic analysis we propose FormatSpread, an algorithm that rapidly +evaluates a sampled set of plausible prompt formats for a given task, and +reports the interval of expected performance without accessing model weights. +Furthermore, we present a suite of analyses that characterize the nature of +this sensitivity, including exploring the influence of particular atomic +perturbations and the internal representation of particular formats. + +
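A toy sketch of the FormatSpread idea: enumerate semantically equivalent prompt formats, evaluate a task sample under each, and report the spread of accuracy rather than a single number. The format atoms and the `evaluate` callable are illustrative placeholders, not the paper's format grammar.

```python
import itertools
import random

SEPARATORS = [": ", " - ", ":\n"]
FIELD_CASINGS = [str.lower, str.upper, str.title]

def sample_formats(n=6, seed=0):
    """Sample semantically equivalent (separator, casing) format variants."""
    random.seed(seed)
    combos = list(itertools.product(SEPARATORS, FIELD_CASINGS))
    return random.sample(combos, k=min(n, len(combos)))

def format_spread(evaluate, n_formats=6):
    """evaluate((sep, casing)) -> accuracy under one format; returns (min, max)."""
    accuracies = [evaluate(fmt) for fmt in sample_formats(n_formats)]
    return min(accuracies), max(accuracies)

# Plug in any evaluation loop that renders prompts as f"{casing('question')}{sep}{x}".
```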
+
+ comment: ICLR 2024 Camera Ready version. With respect to the original + submission, we added text generation experiments, plots of entire accuracy + distributions for each task + stdev computations, and prompt length + correlation with spread analysis +
+
+
+
+
+ + ♻ ☆ Artificial Leviathan: Exploring Social Evolution of LLM Agents Through + the Lens of Hobbesian Social Contract Theory + + +
+ The emergence of Large Language Models (LLMs) and advancements in Artificial +Intelligence (AI) offer an opportunity for computational social science +research at scale. Building upon prior explorations of LLM agent design, our +work introduces a simulated agent society where complex social relationships +dynamically form and evolve over time. Agents are imbued with psychological +drives and placed in a sandbox survival environment. We conduct an evaluation +of the agent society through the lens of Thomas Hobbes's seminal Social +Contract Theory (SCT). We analyze whether, as the theory postulates, agents +seek to escape a brutish "state of nature" by surrendering rights to an +absolute sovereign in exchange for order and security. Our experiments unveil +an alignment: Initially, agents engage in unrestrained conflict, mirroring +Hobbes's depiction of the state of nature. However, as the simulation +progresses, social contracts emerge, leading to the authorization of an +absolute sovereign and the establishment of a peaceful commonwealth founded on +mutual cooperation. This congruence between our LLM agent society's +evolutionary trajectory and Hobbes's theoretical account indicates LLMs' +capability to model intricate social dynamics and potentially replicate forces +that shape human societies. By enabling such insights into group behavior and +emergent societal phenomena, LLM-driven multi-agent simulations, while unable +to simulate all the nuances of human behavior, may hold potential for advancing +our understanding of social structures, group dynamics, and complex human +systems. + +
+
+
+
+
+ + ♻ ☆ Robust Stance Detection: Understanding Public Perceptions in Social + Media + + +
+ The abundance of social media data has presented opportunities for accurately +determining public and group-specific stances around policy proposals or +controversial topics. In contrast with sentiment analysis which focuses on +identifying prevailing emotions, stance detection identifies precise positions +(i.e., supportive, opposing, neutral) relative to a well-defined topic, such as +perceptions toward specific global health interventions during the COVID-19 +pandemic. Traditional stance detection models, while effective within their +specific domain (e.g., attitudes towards masking protocols during COVID-19), +often lag in performance when applied to new domains and topics due to changes +in data distribution. This limitation is compounded by the scarcity of +domain-specific, labeled datasets, which are expensive and labor-intensive to +create. A solution we present in this paper combines counterfactual data +augmentation with contrastive learning to enhance the robustness of stance +detection across domains and topics of interest. We evaluate the performance of +current state-of-the-art stance detection models, including a prompt-optimized +large language model, relative to our proposed framework succinctly called +STANCE-C3 (domain-adaptive Cross-target STANCE detection via Contrastive +learning and Counterfactual generation). Empirical evaluations demonstrate +STANCE-C3's consistent improvements over the baseline models with respect to +accuracy across domains and varying focal topics. Despite the increasing +prevalence of general-purpose models such as generative AI, specialized models +such as STANCE-C3 provide utility in safety-critical domains wherein precision +is highly valued, especially when a nuanced understanding of the concerns of +different population segments could result in crafting more impactful public +policies. + +
+
+
+
+
+ + ♻ ☆ Distilling Event Sequence Knowledge From Large Language Models ISWC + + +
+ Event sequence models have been found to be highly effective in the analysis +and prediction of events. Building such models requires availability of +abundant high-quality event sequence data. In certain applications, however, +clean structured event sequences are not available, and automated sequence +extraction results in data that is too noisy and incomplete. In this work, we +explore the use of Large Language Models (LLMs) to generate event sequences +that can effectively be used for probabilistic event model construction. This +can be viewed as a mechanism of distilling event sequence knowledge from LLMs. +Our approach relies on a Knowledge Graph (KG) of event concepts with partial +causal relations to guide the generative language model for causal event +sequence generation. We show that our approach can generate high-quality event +sequences, filling a knowledge gap in the input KG. Furthermore, we explore how +the generated sequences can be leveraged to discover useful and more complex +structured knowledge from pattern mining and probabilistic event models. We +release our sequence generation code and evaluation framework, as well as +corpus of event sequence data. + +
+
+ comment: In Proceedings of 23rd International Semantic Web Conference (ISWC), + 2024 +
+
+
+
+
+ + ♻ ☆ Spoken Word2Vec: Learning Skipgram Embeddings from Speech + + +
+ Text word embeddings that encode distributional semantics work by modeling +contextual similarities of frequently occurring words. Acoustic word +embeddings, on the other hand, typically encode low-level phonetic +similarities. Semantic embeddings for spoken words have been previously +explored using analogous algorithms to Word2Vec, but the resulting vectors +still mainly encoded phonetic rather than semantic features. In this paper, we +examine the assumptions and architectures used in previous works and show +experimentally how shallow skipgram-like algorithms fail to encode +distributional semantics when the input units are acoustically correlated. We +illustrate the potential of an alternative deep end-to-end variant of the model +and examine the effects on the resulting embeddings, showing positive results +of semantic relatedness in the embedding space. + +
+
+
+
+
+ + ♻ ☆ QuST-LLM: Integrating Large Language Models for Comprehensive Spatial + Transcriptomics Analysis + + +
+ In this paper, we introduce QuST-LLM, an innovative extension of QuPath that
+utilizes the capabilities of large language models (LLMs) to analyze and
+interpret spatial transcriptomics (ST) data. In addition to simplifying the
+intricate and high-dimensional nature of ST data by offering a comprehensive
+workflow that includes data loading, region selection, gene expression
+analysis, and functional annotation, QuST-LLM employs LLMs to transform complex
+ST data into understandable and detailed biological narratives based on gene
+ontology annotations, thereby significantly improving the interpretability of
+ST data. Consequently, users can interact with their own ST data using natural
+language. Hence, QuST-LLM provides researchers with a potent functionality to
+unravel the spatial and functional complexities of tissues, fostering novel
+insights and advancements in biomedical research. QuST-LLM is a part of the
+QuST project. The source code is hosted on GitHub and documentation is
+available at https://github.com/huangch/qust.
+
+
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Detecting Edited Knowledge in Language Models + + +
+ Knowledge editing methods (KEs) can update language models' obsolete or
+inaccurate knowledge learned from pre-training. However, KEs can be used for
+malicious applications, e.g., inserting misinformation and toxic content.
+Knowing whether a generated output is based on edited knowledge or first-hand
+knowledge from pre-training can increase users' trust in generative models and
+provide more transparency. Driven by this, we propose a novel task: detecting
+edited knowledge in language models. Given an edited model and a fact retrieved
+from it by a prompt, the objective is to classify the knowledge as either
+unedited (based on pre-training) or edited (based on subsequent editing). We
+instantiate the task with four KEs, two LLMs, and two datasets. Additionally,
+we propose using the hidden state representations and the probability
+distributions as features for the detection. Our results reveal that using
+these features as inputs to a simple AdaBoost classifier establishes a strong
+baseline. This classifier requires only a limited amount of data and maintains
+its performance even in cross-domain settings. Lastly, we find it more
+challenging to distinguish edited knowledge from unedited but related
+knowledge, highlighting the need for further research. Our work lays the
+groundwork for addressing malicious model editing, which is a critical
+challenge associated with the strong generative capabilities of LLMs.
+
+
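A minimal sketch of the kind of detector the abstract describes: an AdaBoost classifier trained on hidden-state features to separate edited from unedited facts. The feature dimensionality, labels, and split below are synthetic stand-ins, not the paper's actual setup.

```python
# Minimal sketch, assuming hidden-state vectors have already been extracted for
# prompts whose answers come from edited vs. unedited knowledge.
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 768))      # stand-in for hidden-state features
y = rng.integers(0, 2, size=2000)     # 1 = edited, 0 = unedited (dummy labels)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_tr, y_tr)
print("detection accuracy:", clf.score(X_te, y_te))
```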
+
+
+
+
+ + ♻ ☆ Dual Process Learning: Controlling Use of In-Context vs. In-Weights + Strategies with Weight Forgetting + + +
+ Language models have the ability to perform in-context learning (ICL), +allowing them to flexibly adapt their behavior based on context. This contrasts +with in-weights learning, where information is statically encoded in model +parameters from iterated observations of the data. Despite this apparent +ability to learn in-context, language models are known to struggle when faced +with unseen or rarely seen tokens. Hence, we study $\textbf{structural +in-context learning}$, which we define as the ability of a model to execute +in-context learning on arbitrary tokens -- so called because the model must +generalize on the basis of e.g. sentence structure or task structure, rather +than semantic content encoded in token embeddings. An ideal model would be able +to do both: flexibly deploy in-weights operations (in order to robustly +accommodate ambiguous or unknown contexts using encoded semantic information) +and structural in-context operations (in order to accommodate novel tokens). We +study structural in-context algorithms in a simple part-of-speech setting using +both practical and toy models. We find that active forgetting, a technique that +was recently introduced to help models generalize to new languages, forces +models to adopt structural in-context learning solutions. Finally, we introduce +$\textbf{temporary forgetting}$, a straightforward extension of active +forgetting that enables one to control how much a model relies on in-weights +vs. in-context solutions. Importantly, temporary forgetting allows us to induce +a $\textit{dual process strategy}$ where in-context and in-weights solutions +coexist within a single model. + +
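The active-forgetting idea above can be sketched as a periodic re-initialization of the input token embeddings during training; the reset interval and the Hugging-Face-style `get_input_embeddings()` accessor below are assumptions for illustration, not the authors' exact recipe.

```python
# A rough sketch of "active forgetting": periodically re-initialize the
# token-embedding matrix so the model cannot rely on token-specific
# (in-weights) information and must use in-context structure instead.
import torch.nn as nn

def maybe_forget_embeddings(model: nn.Module, step: int, interval: int = 1000):
    """Reset the input embedding weights every `interval` optimizer steps."""
    if step > 0 and step % interval == 0:
        emb = model.get_input_embeddings()   # HF-style accessor; an assumption
        nn.init.normal_(emb.weight, mean=0.0, std=0.02)
```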
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Exploring the Reversal Curse and Other Deductive Logical Reasoning in + BERT and GPT-Based Large Language Models + + +
+ The term "Reversal Curse" refers to the scenario where auto-regressive +decoder large language models (LLMs), such as ChatGPT, trained on "A is B" fail +to learn "B is A," assuming that B and A are distinct and can be uniquely +identified from each other, demonstrating a basic failure of logical deduction. +This raises a red flag in the use of GPT models for certain general tasks such +as constructing knowledge graphs, considering their adherence to this symmetric +principle. In our study, we examined a bidirectional LLM, BERT, and found that +it is immune to the reversal curse. Driven by ongoing efforts to construct +biomedical knowledge graphs with LLMs, we also embarked on evaluating more +complex but essential deductive reasoning capabilities. This process included +first training encoder and decoder language models to master the intersection +and union operations on two sets and then moving on to assess their capability +to infer different combinations of union and intersection operations on three +newly created sets. The findings showed that while both encoder and decoder +language models, trained for tasks involving two sets (union/intersection), +were proficient in such scenarios, they encountered difficulties when dealing +with operations that included three sets (various combinations of union and +intersection). Our research highlights the distinct characteristics of encoder +and decoder models in simple and complex logical reasoning. In practice, the +choice between BERT and GPT should be guided by the specific requirements and +nature of the task at hand, leveraging their respective strengths in +bidirectional context comprehension and sequence prediction. + +
+
+ comment: Final revision. To appear in Patterns +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 52 + +
+
+
+ + ♻ ☆ Framing image registration as a landmark detection problem for + label-noise-aware task representation (HitR) + + +
+ Accurate image registration is pivotal in biomedical image analysis, where +selecting suitable registration algorithms demands careful consideration. While +numerous algorithms are available, the evaluation metrics to assess their +performance have remained relatively static. This study addresses this +challenge by introducing a novel evaluation metric termed Landmark Hit Rate +(HitR), which focuses on the clinical relevance of image registration accuracy. +Unlike traditional metrics such as Target Registration Error, which emphasize +subresolution differences, HitR considers whether registration algorithms +successfully position landmarks within defined confidence zones. This paradigm +shift acknowledges the inherent annotation noise in medical images, allowing +for more meaningful assessments. To equip HitR with label-noise-awareness, we +propose defining these confidence zones based on an Inter-rater Variance +analysis. Consequently, hit rate curves are computed for varying landmark zone +sizes, enabling performance measurement for a task-specific level of accuracy. +Our approach offers a more realistic and meaningful assessment of image +registration algorithms, reflecting their suitability for clinical and +biomedical applications. + +
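A minimal sketch of a landmark hit-rate computation as described above: the fraction of landmarks placed within a confidence zone of radius tau, evaluated over a range of zone sizes to obtain a hit-rate curve. The radius handling is an assumption; the paper derives zones from an inter-rater variance analysis.

```python
import numpy as np

def landmark_hit_rate(pred_pts, ref_pts, tau):
    """pred_pts, ref_pts: (N, 3) landmark coordinates; tau: zone radius (e.g., mm)."""
    dists = np.linalg.norm(pred_pts - ref_pts, axis=1)
    return float((dists <= tau).mean())

def hit_rate_curve(pred_pts, ref_pts, taus):
    """Hit rate for each candidate confidence-zone size."""
    return [landmark_hit_rate(pred_pts, ref_pts, t) for t in taus]
```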
+
+
+
+
+ + ♻ ☆ Distilling Knowledge from Text-to-Image Generative Models Improves + Visio-Linguistic Reasoning in CLIP + + +
+ Image-text contrastive models like CLIP have wide applications in zero-shot +classification, image-text retrieval, and transfer learning. However, they +often struggle on compositional visio-linguistic tasks (e.g., attribute-binding +or object-relationships) where their performance is no better than random +chance. To address this, we introduce SDS-CLIP, a lightweight and +sample-efficient distillation method to enhance CLIP's compositional +visio-linguistic reasoning. Our approach fine-tunes CLIP using a distillation +objective borrowed from large text-to-image generative models like +Stable-Diffusion, which are known for their strong visio-linguistic reasoning +abilities. On the challenging Winoground benchmark, SDS-CLIP improves the +visio-linguistic performance of various CLIP models by up to 7%, while on the +ARO dataset, it boosts performance by up to 3%. This work underscores the +potential of well-designed distillation objectives from generative models to +enhance contrastive image-text models with improved visio-linguistic reasoning +capabilities. + +
+
+ comment: Short paper +
+
+
+
+
+ + ♻ ☆ Fine-tuning can cripple your foundation model; preserving features may + be the solution + + +
+ Pre-trained foundation models, due to their enormous capacity and exposure to +vast amounts of data during pre-training, are known to have learned plenty of +real-world concepts. An important step in making these pre-trained models +effective on downstream tasks is to fine-tune them on related datasets. While +various fine-tuning methods have been devised and have been shown to be highly +effective, we observe that a fine-tuned model's ability to recognize concepts +on tasks $\textit{different}$ from the downstream one is reduced significantly +compared to its pre-trained counterpart. This is an undesirable effect of +fine-tuning as a substantial amount of resources was used to learn these +pre-trained concepts in the first place. We call this phenomenon ''concept +forgetting'' and via experiments show that most end-to-end fine-tuning +approaches suffer heavily from this side effect. To this end, we propose a +simple fix to this problem by designing a new fine-tuning method called +$\textit{LDIFS}$ (short for $\ell_2$ distance in feature space) that, while +learning new concepts related to the downstream task, allows a model to +preserve its pre-trained knowledge as well. Through extensive experiments on 10 +fine-tuning tasks we show that $\textit{LDIFS}$ significantly reduces concept +forgetting. Additionally, we show that LDIFS is highly effective in performing +continual fine-tuning on a sequence of tasks as well, in comparison with both +fine-tuning as well as continual learning baselines. + +
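A minimal sketch, assuming a single feature layer, of an l2 feature-space regularizer in the spirit of LDIFS: the downstream task loss is augmented with the distance between the fine-tuned and frozen pre-trained features. The weighting `lam` and the choice of layer are assumptions, not the paper's exact configuration.

```python
import torch
import torch.nn.functional as F

def ldifs_style_loss(backbone, frozen_backbone, head, x, y, lam=1.0):
    feats = backbone(x)                     # trainable features
    with torch.no_grad():
        feats_pt = frozen_backbone(x)       # frozen pre-trained features
    task_loss = F.cross_entropy(head(feats), y)
    feat_loss = F.mse_loss(feats, feats_pt) # l2 distance in feature space
    return task_loss + lam * feat_loss

# frozen_backbone would typically be a copy made before fine-tuning, e.g.
# frozen_backbone = copy.deepcopy(backbone).eval().requires_grad_(False)
```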
+
+ comment: Published in TMLR: https://openreview.net/forum?id=kfhoeZCeW7 +
+
+
+
+
+ + ♻ ☆ Towards objective and systematic evaluation of bias in artificial + intelligence for medical imaging + + +
+ Artificial intelligence (AI) models trained using medical images for clinical +tasks often exhibit bias in the form of disparities in performance between +subgroups. Since not all sources of biases in real-world medical imaging data +are easily identifiable, it is challenging to comprehensively assess how those +biases are encoded in models, and how capable bias mitigation methods are at +ameliorating performance disparities. In this article, we introduce a novel +analysis framework for systematically and objectively investigating the impact +of biases in medical images on AI models. We developed and tested this +framework for conducting controlled in silico trials to assess bias in medical +imaging AI using a tool for generating synthetic magnetic resonance images with +known disease effects and sources of bias. The feasibility is showcased by +using three counterfactual bias scenarios to measure the impact of simulated +bias effects on a convolutional neural network (CNN) classifier and the +efficacy of three bias mitigation strategies. The analysis revealed that the +simulated biases resulted in expected subgroup performance disparities when the +CNN was trained on the synthetic datasets. Moreover, reweighing was identified +as the most successful bias mitigation strategy for this setup, and we +demonstrated how explainable AI methods can aid in investigating the +manifestation of bias in the model using this framework. Developing fair AI +models is a considerable challenge given that many and often unknown sources of +biases can be present in medical imaging datasets. In this work, we present a +novel methodology to objectively study the impact of biases and mitigation +strategies on deep learning pipelines, which can support the development of +clinical AI that is robust and responsible. + +
+
+ comment: Published in the Journal of the American Medical Informatics + Association +
+
+
+
+
+ + ♻ ☆ Evaluation of Deep Learning Semantic Segmentation for Land Cover Mapping + on Multispectral, Hyperspectral and High Spatial Aerial Imagery + + +
+ With the rise of climate change, land cover mapping has become an urgent
+need in environmental monitoring. The accuracy of land cover classification
+increasingly depends on the quality of remote sensing data. Land cover
+classification using satellite imagery has been widely explored in recent
+years, but existing methodologies remain subjective and time-consuming. Some
+deep learning techniques have been utilized to overcome these limitations.
+However, most studies evaluated algorithms for land cover mapping on just one
+image type. Therefore, our study conducted deep learning semantic segmentation
+on multispectral, hyperspectral, and high-spatial-resolution aerial image
+datasets for land cover mapping. This research implemented semantic
+segmentation methods, namely UNet, LinkNet, FPN, and PSPNet, for categorizing
+vegetation, water, and other classes (i.e., soil and impervious surfaces). The
+LinkNet model obtained a high IoU (Intersection over Union) of 0.92 across all
+datasets, which is comparable with the other evaluated techniques. Across the
+different image types, the multispectral images showed the highest
+performance, with an IoU and F1-score of 0.993 and 0.997, respectively. Our
+results highlight the efficiency and broad applicability of LinkNet and
+multispectral imagery for land cover classification. This research contributes
+an open-source approach to land cover segmentation for long-term future
+applications.
+
+
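For reference, the IoU figures quoted above correspond to the standard per-class intersection-over-union, which can be computed as in the short sketch below.

```python
import numpy as np

def iou(pred, target, cls):
    """pred, target: integer class maps of equal shape; cls: class index."""
    p, t = (pred == cls), (target == cls)
    union = np.logical_or(p, t).sum()
    return np.logical_and(p, t).sum() / union if union > 0 else float("nan")
```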
+
+ comment: conference, This preprint is based on the following published + conference article: Panuntun, I. A., Chen, Y.-N., Jamaluddin, I., & Tran, T. + L. C., 2023. Evaluation of Deep Learning Semantic Segmentation for Land Cover + Mapping on Multispectral, Hyperspectral and High Spatial Aerial Imagery. 44th + Asian Conference on Remote Sensing, ACRS 2023. Code 198676 +
+
+
+
+
+ + ♻ ☆ Bytes Are All You Need: Transformers Operating Directly On File Bytes + + +
+ Modern deep learning approaches usually utilize modality-specific processing. +For example, the most common deep learning approach to image classification +involves decoding image file bytes into an RGB tensor which is passed into a +neural network. Instead, we investigate modality-independent representation +learning by performing classification directly on file bytes, without the need +for decoding files at inference time. This enables models to operate on various +modalities without any hand-designed, modality-specific processing. Our model, +ByteFormer, improves ImageNet Top-1 classification accuracy by $5\%$ (from +$72.2\%$ to $77.33\%$) relative to DeIT models of similar size. Compared to +Perceiver IO, our model requires absolutely no modality-specific processing at +inference time, and uses an order of magnitude fewer parameters at equivalent +accuracy on ImageNet. We demonstrate that the same ByteFormer architecture can +perform audio classification without modifications or modality-specific +preprocessing. We achieve $95.42\%$ classification accuracy on the Speech +Commands V2 dataset (comparable to the state-of-the-art accuracy of $98.7\%$). +Additionally, we demonstrate that ByteFormer can operate jointly on images and +audio, handling joint classification without explicit knowledge of the input +modality. We release our code at +https://github.com/apple/corenet/tree/main/projects/byteformer. + +
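A rough sketch of the modality-independent idea: treat every byte of a file as a token from a 256-entry vocabulary and classify with a small Transformer encoder. This only illustrates the concept; the real ByteFormer architecture, sequence length, and pooling differ.

```python
import torch
import torch.nn as nn

class ByteClassifier(nn.Module):
    def __init__(self, n_classes, d_model=128, n_layers=4, max_len=4096):
        super().__init__()
        self.embed = nn.Embedding(256, d_model)            # one entry per byte value
        self.pos = nn.Parameter(torch.zeros(1, max_len, d_model))
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, byte_ids):                           # (B, L) byte values as long
        h = self.embed(byte_ids) + self.pos[:, : byte_ids.size(1)]
        return self.head(self.encoder(h).mean(dim=1))      # mean-pool then classify

# raw file bytes, truncated to max_len, e.g.:
# data = torch.tensor(list(open("img.jpg", "rb").read()[:4096]), dtype=torch.long)
```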
+
+
+
+
+ + ♻ ☆ A Geometric Algorithm for Tubular Shape Reconstruction from Skeletal + Representation + + +
+ We introduce a novel approach for the reconstruction of tubular shapes from
+skeletal representations. Our method processes all skeletal points as a whole,
+eliminating the need for splitting the input structure into multiple segments.
+We represent the tubular shape as a truncated signed distance function (TSDF)
+in a voxel hashing manner, in which the signed distance between a voxel center
+and the object is computed through a simple geometric algorithm. Our method
+does not involve any surface sampling scheme or solving large matrix
+equations, and therefore is a faster and more elegant solution for tubular
+shape reconstruction compared to other approaches. Experiments demonstrate the
+efficiency and effectiveness of the proposed method. Code is available at
+https://github.com/wlsdzyzl/Dragon.
+
+
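A hedged sketch of the geometric computation the abstract alludes to: the signed distance from a voxel center to a tube defined by skeleton segments with per-endpoint radii, truncated to form a TSDF. The linear radius interpolation and truncation band are assumptions made for illustration.

```python
import numpy as np

def point_segment_distance(p, a, b):
    """Distance from point p to segment ab, plus the projection parameter t."""
    ab, ap = b - a, p - a
    t = np.clip(np.dot(ap, ab) / (np.dot(ab, ab) + 1e-12), 0.0, 1.0)
    return np.linalg.norm(p - (a + t * ab)), t

def tube_tsdf(voxel_center, segments, radii, trunc=2.0):
    """segments: list of (a, b) endpoints; radii: list of (ra, rb) per segment."""
    best = np.inf
    for (a, b), (ra, rb) in zip(segments, radii):
        d, t = point_segment_distance(voxel_center, a, b)
        r = (1 - t) * ra + t * rb       # interpolate radius along the segment
        best = min(best, d - r)         # negative inside the tube
    return float(np.clip(best, -trunc, trunc))
```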
+
+ comment: 9 pages (without reference), 6 figures +
+
+
+
+
+ + ♻ ☆ Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models UAI 2024 + + +
+ For downstream applications of vision-language pre-trained models, there has +been significant interest in constructing effective prompts. Existing works on +prompt engineering, which either require laborious manual designs or optimize +the prompt tuning as a point estimation problem, may fail to describe diverse +characteristics of categories and limit their applications. We introduce a +Bayesian probabilistic resolution to prompt tuning, where the label-specific +stochastic prompts are generated hierarchically by first sampling a latent +vector from an underlying distribution and then employing a lightweight +generative model. Importantly, we semantically regularize the tuning process by +minimizing the statistical distance between the visual patches and linguistic +prompts, which pushes the stochastic label representations to faithfully +capture diverse visual concepts, instead of overfitting the training +categories. We evaluate the effectiveness of our approach on four tasks: +few-shot image recognition, base-to-new generalization, dataset transfer +learning, and domain shifts. Extensive results over 15 datasets show promising +transferability and generalization performance of our proposed model, both +quantitatively and qualitatively. + +
+
+ comment: Accepted by UAI 2024 +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Meta-tuning for Few-shot Generalization Through + Sparse Interpolated Experts + + +
+ Recent successes suggest that parameter-efficient fine-tuning of foundation
+models has become the state-of-the-art method for transfer learning in vision,
+replacing the rich literature of alternatives such as meta-learning. In trying
+to harness the best of both worlds, meta-tuning introduces a subsequent
+optimization stage of foundation models but has so far only shown limited
+success and crucially tends to underperform on out-of-distribution (OOD) tasks.
+In this paper, we introduce Sparse MetA-Tuning (SMAT), a method inspired by
+sparse mixture-of-experts approaches and trained to isolate subsets of
+pre-trained parameters automatically for meta-tuning on each task. SMAT
+successfully overcomes OOD sensitivity and delivers on the promise of enhancing
+the transfer abilities of vision foundation models beyond parameter-efficient
+fine-tuning. We establish new state-of-the-art results on a challenging
+combination of Meta-Dataset augmented with additional OOD tasks in both
+zero-shot and gradient-based adaptation settings. In addition, we provide a
+thorough analysis of the superiority of learned over hand-designed sparsity
+patterns for sparse expert methods and the pivotal importance of the sparsity
+level in balancing between in-distribution and out-of-distribution
+generalization. Our code is publicly available.
+
+
+
+ comment: The Forty-first International Conference on Machine Learning, 2024 +
+
+
+
+
+ + ♻ ☆ An Efficient Instance Segmentation Framework Based on Oriented Bounding + Boxes + + +
+ Instance segmentation for completely occluded objects and dense objects in +robot vision measurement are two challenging tasks. To uniformly deal with +them, this paper proposes a unified coarse-to-fine instance segmentation +framework, CFNet, which uses box prompt-based segmentation foundation models +(BSMs), e.g., Segment Anything Model. Specifically, CFNet first detects +oriented bounding boxes (OBBs) to distinguish instances and provide coarse +localization information. Then, it predicts OBB prompt-related masks for fine +segmentation. CFNet performs instance segmentation with OBBs that only contain +partial object boundaries on occluders to predict occluded object instances, +which overcomes the difficulty of existing amodal instance segmentation methods +in directly predicting occluded objects. In addition, since OBBs only serve as +prompts, CFNet alleviates the over-dependence on bounding box detection +performance of current instance segmentation methods using OBBs for dense +objects. Moreover, to enable BSMs to handle OBB prompts, we propose a novel OBB +prompt encoder. To make CFNet more lightweight, we perform knowledge +distillation on it and introduce a Gaussian label smoothing method for teacher +model outputs. Experiments demonstrate that CFNet outperforms current instance +segmentation methods on both industrial and public datasets. The code is +available at https://github.com/zhen6618/OBBInstanceSegmentation. + +
+
+
+
+
+ + ♻ ☆ DreamPBR: Text-driven Generation of High-resolution SVBRDF with + Multi-modal Guidance + + +
+ Prior material creation methods had limitations in producing diverse results +mainly because reconstruction-based methods relied on real-world measurements +and generation-based methods were trained on relatively small material +datasets. To address these challenges, we propose DreamPBR, a novel +diffusion-based generative framework designed to create spatially-varying +appearance properties guided by text and multi-modal controls, providing high +controllability and diversity in material generation. Key to achieving diverse +and high-quality PBR material generation lies in integrating the capabilities +of recent large-scale vision-language models trained on billions of text-image +pairs, along with material priors derived from hundreds of PBR material +samples. We utilize a novel material Latent Diffusion Model (LDM) to establish +the mapping between albedo maps and the corresponding latent space. The latent +representation is then decoded into full SVBRDF parameter maps using a +rendering-aware PBR decoder. Our method supports tileable generation through +convolution with circular padding. Furthermore, we introduce a multi-modal +guidance module, which includes pixel-aligned guidance, style image guidance, +and 3D shape guidance, to enhance the control capabilities of the material LDM. +We demonstrate the effectiveness of DreamPBR in material creation, showcasing +its versatility and user-friendliness on a wide range of controllable +generation and editing applications. + +
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt + + +
+ In the realm of large vision language models (LVLMs), jailbreak attacks serve
+as a red-teaming approach to bypass guardrails and uncover safety implications.
+Existing jailbreaks predominantly focus on the visual modality, perturbing
+solely visual inputs in the prompt for attacks. However, they fall short when
+confronted with aligned models that fuse visual and textual features
+simultaneously for generation. To address this limitation, this paper
+introduces the Bi-Modal Adversarial Prompt Attack (BAP), which executes
+jailbreaks by optimizing textual and visual prompts cohesively. Initially, we
+adversarially embed universally harmful perturbations in an image, guided by a
+few-shot query-agnostic corpus (e.g., affirmative prefixes and negative
+inhibitions). This process ensures that the image prompts LVLMs to respond
+positively to any harmful queries. Subsequently, leveraging the adversarial
+image, we optimize textual prompts with specific harmful intent. In particular,
+we utilize a large language model to analyze jailbreak failures and employ
+chain-of-thought reasoning to refine textual prompts in a feedback-iteration
+manner. To validate the efficacy of our approach, we conducted extensive
+evaluations on various datasets and LVLMs, demonstrating that our method
+significantly outperforms other methods by large margins (+29.03% in attack
+success rate on average). Additionally, we showcase the potential of our
+attacks on black-box commercial LVLMs, such as Gemini and ChatGLM.
+
+
+
+
+
+
+ + ♻ ☆ Topo4D: Topology-Preserving Gaussian Splatting for High-Fidelity 4D Head + Capture + + +
+ 4D head capture aims to generate dynamic topological meshes and corresponding +texture maps from videos, which is widely utilized in movies and games for its +ability to simulate facial muscle movements and recover dynamic textures in +pore-squeezing. The industry often adopts the method involving multi-view +stereo and non-rigid alignment. However, this approach is prone to errors and +heavily reliant on time-consuming manual processing by artists. To simplify +this process, we propose Topo4D, a novel framework for automatic geometry and +texture generation, which optimizes densely aligned 4D heads and 8K texture +maps directly from calibrated multi-view time-series images. Specifically, we +first represent the time-series faces as a set of dynamic 3D Gaussians with +fixed topology in which the Gaussian centers are bound to the mesh vertices. +Afterward, we perform alternative geometry and texture optimization +frame-by-frame for high-quality geometry and texture learning while maintaining +temporal topology stability. Finally, we can extract dynamic facial meshes in +regular wiring arrangement and high-fidelity textures with pore-level details +from the learned Gaussians. Extensive experiments show that our method achieves +superior results than the current SOTA face reconstruction methods both in the +quality of meshes and textures. Project page: +https://xuanchenli.github.io/Topo4D/. + +
+
+
+
+
+ + ♻ ☆ Instruction-Guided Scene Text Recognition + + +
+ Multi-modal models have recently shown appealing performance in visual
+recognition tasks, as free-form text-guided training evokes the ability to
+understand fine-grained visual content. However, current models are either
+inefficient or cannot be trivially upgraded to scene text recognition (STR) due
+to the composition difference between natural and text images. We propose a
+novel instruction-guided scene text recognition (IGTR) paradigm that formulates
+STR as an instruction learning problem and understands text images by
+predicting character attributes, e.g., character frequency, position, etc. IGTR
+first devises $\left \langle condition,question,answer\right \rangle$
+instruction triplets, providing rich and diverse descriptions of character
+attributes. To effectively learn these attributes through question-answering,
+IGTR develops a lightweight instruction encoder, a cross-modal feature fusion
+module, and a multi-task answer head, which together guide nuanced text image
+understanding. Furthermore, IGTR realizes different recognition pipelines
+simply by using different instructions, enabling a character-understanding-based
+text reasoning paradigm that considerably differs from current methods.
+Experiments on English and Chinese benchmarks show that IGTR outperforms
+existing models by significant margins, while maintaining a small model size
+and efficient inference speed. Moreover, by adjusting the sampling of
+instructions, IGTR offers an elegant way to tackle the recognition of both
+rarely appearing and morphologically similar characters, which were previously
+challenging. Code at \href{https://github.com/Topdu/OpenOCR}{this http URL}.
+
+
+
+
+
+
+ + ♻ ☆ Local-Aware Global Attention Network for Person Re-Identification Based + on Body and Hand Images + + +
+ Learning representative, robust and discriminative information from images is
+essential for effective person re-identification (Re-Id). In this paper, we
+propose a compound approach for end-to-end discriminative deep feature learning
+for person Re-Id based on both body and hand images. We carefully design the
+Local-Aware Global Attention Network (LAGA-Net), a multi-branch deep network
+architecture consisting of one branch for spatial attention, one branch for
+channel attention, one branch for global feature representations and another
+branch for local feature representations. The attention branches focus on the
+relevant features of the image while suppressing the irrelevant backgrounds. In
+order to overcome the weakness of attention mechanisms, which are equivariant
+to pixel shuffling, we integrate relative positional encodings into the spatial
+attention module to capture the spatial positions of pixels. The global branch
+intends to preserve the global context or structural information. For the
+local branch, which intends to capture the fine-grained information, we perform
+uniform partitioning to generate stripes on the conv-layer horizontally. We
+retrieve the parts by conducting a soft partition without explicitly
+partitioning the images or requiring external cues such as pose estimation. A
+set of ablation studies shows that each component contributes to the increased
+performance of the LAGA-Net. Extensive evaluations on four popular body-based
+person Re-Id benchmarks and two publicly available hand datasets demonstrate
+that our proposed method consistently outperforms existing state-of-the-art
+methods.
+
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2108.02234 +
+
+
+
+
+ + ♻ ☆ CILF-CIAE: CLIP-driven Image-Language Fusion for Correcting Inverse Age + Estimation + + +
+ The age estimation task aims to predict the age of an individual by analyzing
+facial features in an image. The development of age estimation can improve the
+efficiency and accuracy of various applications (e.g., age verification and
+secure access control). In recent years, contrastive language-image
+pre-training (CLIP) has been widely used in various multimodal tasks and has
+made some progress in the field of age estimation. However, existing CLIP-based
+age estimation methods require high memory usage (quadratic complexity) when
+globally modeling images, and lack an error feedback mechanism to prompt the
+model about the quality of age prediction results. To tackle the above issues,
+we propose a novel CLIP-driven Image-Language Fusion for Correcting Inverse Age
+Estimation (CILF-CIAE). Specifically, we first introduce the CLIP model to
+extract image features and text semantic information respectively, and map them
+into a highly semantically aligned high-dimensional feature space. Next, we
+design a new Transformer architecture (i.e., FourierFormer) to achieve
+channel evolution and spatial interaction of images, and to fuse image and text
+semantic information. Compared with the quadratic complexity of the attention
+mechanism, the proposed FourierFormer has linear-logarithmic complexity. To
+further narrow the semantic gap between image and text features, we utilize an
+efficient contrastive multimodal learning module that supervises the multimodal
+fusion process of FourierFormer through contrastive loss for image-text
+matching, thereby improving the interaction effect between different
+modalities. Finally, we introduce reversible age estimation, which uses
+end-to-end error feedback to reduce the error rate of age predictions.
+Extensive experiments on multiple datasets show that CILF-CIAE achieves better
+age prediction results.
+
+
+
+ comment: 14 pages, 14 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ WIA-LD2ND: Wavelet-based Image Alignment for Self-supervised Low-Dose CT + Denoising MICCAI2024 + + +
+ In clinical examinations and diagnoses, low-dose computed tomography (LDCT)
+is crucial for minimizing health risks compared with normal-dose computed
+tomography (NDCT). However, reducing the radiation dose compromises the
+signal-to-noise ratio, leading to degraded quality of CT images. To address
+this, we analyze the LDCT denoising task from the frequency perspective based
+on experimental results, and then introduce a novel self-supervised CT image
+denoising method called WIA-LD2ND, only using NDCT data. The proposed WIA-LD2ND
+comprises two modules: Wavelet-based Image Alignment (WIA) and Frequency-Aware
+Multi-scale Loss (FAM). First, WIA is introduced to align NDCT with LDCT by
+mainly adding noise to the high-frequency components, which is the main
+difference between LDCT and NDCT. Second, to better capture high-frequency
+components and detailed information, Frequency-Aware Multi-scale Loss (FAM) is
+proposed by effectively utilizing multi-scale feature space. Extensive
+experiments on two public LDCT denoising datasets demonstrate that our
+WIA-LD2ND, which uses only NDCT data, outperforms several existing
+state-of-the-art weakly-supervised and self-supervised methods. Source code is
+available at https://github.com/zhaohaoyu376/WI-LD2ND.
+
+
+
+ comment: MICCAI2024 +
+
+
+
+
+ + ♻ ☆ MoreStyle: Relax Low-frequency Constraint of Fourier-based Image + Reconstruction in Generalizable Medical Image Segmentation MICCAI2024 + + +
+ The task of single-source domain generalization (SDG) in medical image +segmentation is crucial due to frequent domain shifts in clinical image +datasets. To address the challenge of poor generalization across different +domains, we introduce a Plug-and-Play module for data augmentation called +MoreStyle. MoreStyle diversifies image styles by relaxing low-frequency +constraints in Fourier space, guiding the image reconstruction network. With +the help of adversarial learning, MoreStyle further expands the style range and +pinpoints the most intricate style combinations within latent features. To +handle significant style variations, we introduce an uncertainty-weighted loss. +This loss emphasizes hard-to-classify pixels resulting only from style shifts +while mitigating true hard-to-classify pixels in both MoreStyle-generated and +original images. Extensive experiments on two widely used benchmarks +demonstrate that the proposed MoreStyle effectively helps to achieve good +domain generalization ability, and has the potential to further boost the +performance of some state-of-the-art SDG methods. Source code is available at +https://github.com/zhaohaoyu376/morestyle. + +
+
+ comment: MICCAI2024 +
+
+
+
+
+ + ♻ ☆ Recovering the Pre-Fine-Tuning Weights of Generative Models ICML 2024 + + +
+ The dominant paradigm in generative modeling consists of two steps: i) +pre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained +model with human values via fine-tuning. This practice is considered safe, as +no current method can recover the unsafe, pre-fine-tuning model weights. In +this paper, we demonstrate that this assumption is often false. Concretely, we +present Spectral DeTuning, a method that can recover the weights of the +pre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In +contrast to previous attacks that attempt to recover pre-fine-tuning +capabilities, our method aims to recover the exact pre-fine-tuning weights. Our +approach exploits this new vulnerability against large-scale models such as a +personalized Stable Diffusion and an aligned Mistral. + +
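The recovery problem can be pictured as follows: each observed fine-tuned matrix is the unknown base plus a low-rank LoRA update, so one can alternate between fitting rank-r residuals and re-estimating the base. The sketch below is a simplified alternating-minimization illustration of that idea, not the actual Spectral DeTuning procedure.

```python
import numpy as np

def recover_w0(ws, rank, iters=50):
    """ws: list of (m, n) fine-tuned matrices assumed to share an unknown base W0."""
    w0 = np.mean(ws, axis=0)
    for _ in range(iters):
        deltas = []
        for w in ws:
            u, s, vt = np.linalg.svd(w - w0, full_matrices=False)
            deltas.append((u[:, :rank] * s[:rank]) @ vt[:rank])  # rank-r residual
        w0 = np.mean([w - d for w, d in zip(ws, deltas)], axis=0)
    return w0
```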
+
+ comment: ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/ +
+
+
+
+
+ + ♻ ☆ Training-Free Deepfake Voice Recognition by Leveraging Large-Scale + Pre-Trained Models + + +
+ Generalization is a main issue for current audio deepfake detectors, which +struggle to provide reliable results on out-of-distribution data. Given the +speed at which more and more accurate synthesis methods are developed, it is +very important to design techniques that work well also on data they were not +trained for. In this paper we study the potential of large-scale pre-trained +models for audio deepfake detection, with special focus on generalization +ability. To this end, the detection problem is reformulated in a speaker +verification framework and fake audios are exposed by the mismatch between the +voice sample under test and the voice of the claimed identity. With this +paradigm, no fake speech sample is necessary in training, cutting off any link +with the generation method at the root, and ensuring full generalization +ability. Features are extracted by general-purpose large pre-trained models, +with no need for training or fine-tuning on specific fake detection or speaker +verification datasets. At detection time only a limited set of voice fragments +of the identity under test is required. Experiments on several datasets +widespread in the community show that detectors based on pre-trained models +achieve excellent performance and show strong generalization ability, rivaling +supervised methods on in-distribution data and largely overcoming them on +out-of-distribution data. + +
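A minimal sketch of the detection-as-verification paradigm described above: embed a few genuine fragments of the claimed identity and the test clip with any pre-trained speaker encoder, then flag the clip when its average cosine similarity to the enrollment set falls below a threshold. The encoder and threshold are placeholders, not the paper's exact setup.

```python
import numpy as np

def is_fake(test_emb, enroll_embs, threshold=0.5):
    """test_emb: (D,) embedding of the clip under test;
    enroll_embs: (K, D) embeddings of genuine fragments of the claimed identity."""
    def _norm(v):
        return v / (np.linalg.norm(v, axis=-1, keepdims=True) + 1e-8)
    sims = _norm(enroll_embs) @ _norm(test_emb)     # cosine similarities, shape (K,)
    return bool(sims.mean() < threshold)            # low similarity -> likely deepfake
```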
+
+
+
+
+ + ♻ ☆ Adaptively Bypassing Vision Transformer Blocks for Efficient Visual + Tracking + + +
+ Empowered by transformer-based models, visual tracking has advanced
+significantly. However, the slow speed of current trackers limits their
+applicability on devices with constrained computational resources. To address
+this challenge, we introduce ABTrack, an adaptive computation framework that
+adaptively bypasses transformer blocks for efficient visual tracking. The
+rationale behind ABTrack is rooted in the observation that semantic features or
+relations do not uniformly impact the tracking task across all abstraction
+levels. Instead, this impact varies based on the characteristics of the target
+and the scene it occupies. Consequently, disregarding insignificant semantic
+features or relations at certain abstraction levels may not significantly
+affect the tracking accuracy. We propose a Bypass Decision Module (BDM) to
+determine if a transformer block should be bypassed, which adaptively
+simplifies the architecture of ViTs and thus speeds up the inference process.
+To counteract the time cost incurred by the BDMs and further enhance the
+efficiency of ViTs, we introduce a novel ViT pruning method to reduce the
+dimension of the latent representation of tokens in each transformer block.
+Extensive experiments on multiple tracking benchmarks validate the
+effectiveness and generality of the proposed method and show that it achieves
+state-of-the-art performance. Code is released at:
+https://github.com/xyyang317/ABTrack.
+
+
+
+
+
+
+ + ♻ ☆ AdaCL:Adaptive Continual Learning + + +
+ Class-Incremental Learning aims to update a deep classifier to learn new
+categories while maintaining or improving its accuracy on previously observed
+classes. Common methods to prevent forgetting previously learned classes
+include regularizing the neural network updates and storing exemplars in
+memory, which come with hyperparameters such as the learning rate,
+regularization strength, or the number of exemplars. However, these
+hyperparameters are usually only tuned at the start and then kept fixed
+throughout the learning sessions, ignoring the fact that newly encountered
+tasks may have varying levels of novelty or difficulty. This study investigates
+the necessity of hyperparameter `adaptivity' in Class-Incremental Learning: the
+ability to dynamically adjust hyperparameters such as the learning rate,
+regularization strength, and memory size according to the properties of the new
+task at hand. We propose AdaCL, a Bayesian Optimization-based approach to
+automatically and efficiently determine the optimal values for those parameters
+with each learning task. We show that adapting hyperparameters for each new
+task leads to improvements in accuracy, forgetting, and memory usage. Code is
+available at https://github.com/ElifCerenGokYildirim/AdaCL.
+
+
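A hypothetical per-task hyperparameter search in the spirit of AdaCL, using Optuna's default (TPE-based) sampler as a stand-in Bayesian optimizer; `train_and_eval_task`, the search ranges, and the trial budget are placeholders, not the paper's configuration.

```python
import optuna

def tune_task(train_and_eval_task, n_trials=20):
    """train_and_eval_task: callable running one continual-learning session and
    returning validation accuracy for the given hyperparameters (a placeholder)."""
    def objective(trial):
        lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
        reg = trial.suggest_float("reg_strength", 1e-3, 10.0, log=True)
        memory = trial.suggest_int("memory_size", 100, 2000, step=100)
        return train_and_eval_task(lr=lr, reg_strength=reg, memory_size=memory)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    return study.best_params   # hyperparameters adopted for this task
```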
+
+ comment: Published in 1st ContinualAI Unconference +
+
+
+
+
+ + ♻ ☆ Woven Fabric Capture with a Reflection-Transmission Photo Pair SIGGRAPH 2024 + + +
+ Digitizing woven fabrics would be valuable for many applications, from +digital humans to interior design. Previous work introduces a lightweight woven +fabric acquisition approach by capturing a single reflection image and +estimating the fabric parameters with a differentiable geometric and shading +model. The renderings of the estimated fabric parameters can closely match the +photo; however, the captured reflection image is insufficient to fully +characterize the fabric sample reflectance. For instance, fabrics with +different thicknesses might have similar reflection images but lead to +significantly different transmission. We propose to recover the woven fabric +parameters from two captured images: reflection and transmission. At the core +of our method is a differentiable bidirectional scattering distribution +function (BSDF) model, handling reflection and transmission, including single +and multiple scattering. We propose a two-layer model, where the single +scattering uses an SGGX phase function as in previous work, and multiple +scattering uses a new azimuthally-invariant microflake definition, which we +term ASGGX. This new fabric BSDF model closely matches real woven fabrics in +both reflection and transmission. We use a simple setup for capturing +reflection and transmission photos with a cell phone camera and two point +lights, and estimate the fabric parameters via a lightweight network, together +with a differentiable optimization. We also model the out-of-focus effects +explicitly with a simple solution to match the thin-lens camera better. As a +result, the renderings of the estimated parameters can agree with the input +images on both reflection and transmission for the first time. The code for +this paper is at https://github.com/lxtyin/FabricBTDF-Recovery. + +
+
+ comment: 10 pages, 16 figures (in the main paper). Accepted by SIGGRAPH 2024 + conference +
+
+
+
+
+ + ♻ ☆ Towards Robust Physical-world Backdoor Attacks on Lane Detection + + +
+ Deep learning-based lane detection (LD) plays a critical role in autonomous +driving systems, such as adaptive cruise control. However, it is vulnerable to +backdoor attacks. Existing backdoor attack methods on LD exhibit limited +effectiveness in dynamic real-world scenarios, primarily because they fail to +consider dynamic scene factors, including changes in driving perspectives +(e.g., viewpoint transformations) and environmental conditions (e.g., weather +or lighting changes). To tackle this issue, this paper introduces BadLANE, a +dynamic scene adaptation backdoor attack for LD designed to withstand changes +in real-world dynamic scene factors. To address the challenges posed by +changing driving perspectives, we propose an amorphous trigger pattern composed +of shapeless pixels. This trigger design allows the backdoor to be activated by +various forms or shapes of mud spots or pollution on the road or lens, enabling +adaptation to changes in vehicle observation viewpoints during driving. To +mitigate the effects of environmental changes, we design a meta-learning +framework to train meta-generators tailored to different environmental +conditions. These generators produce meta-triggers that incorporate diverse +environmental information, such as weather or lighting conditions, as the +initialization of the trigger patterns for backdoor implantation, thus enabling +adaptation to dynamic environments. Extensive experiments on various commonly +used LD models in both digital and physical domains validate the effectiveness +of our attacks, outperforming other baselines significantly (+25.15% on average +in Attack Success Rate). Our codes will be available upon paper publication. + +
+
+
+
+
+ + ♻ ☆ Training-Free Acceleration of ViTs with Delayed Spatial Merging ICML 2024 + + +
+ Token merging has emerged as a new paradigm that can accelerate the inference +of Vision Transformers (ViTs) without any retraining or fine-tuning. To push +the frontier of training-free acceleration in ViTs, we improve token merging by +adding the perspectives of 1) activation outliers and 2) hierarchical +representations. Through a careful analysis of the attention behavior in ViTs, +we characterize a delayed onset of the convergent attention phenomenon, which +makes token merging undesirable in the bottom blocks of ViTs. Moreover, we +augment token merging with a hierarchical processing scheme to capture +multi-scale redundancy between visual tokens. Combining these two insights, we +build a unified inference framework called DSM: Delayed Spatial Merging. We +extensively evaluate DSM on various ViT model scales (Tiny to Huge) and tasks +(ImageNet-1k and transfer learning), achieving up to 1.8$\times$ FLOP reduction +and 1.6$\times$ throughput speedup at a negligible loss while being two orders +of magnitude faster than existing methods. + +
+
+ comment: ICML 2024 ES-FoMo Workshop +
+
+
+
+
+ + ♻ ☆ Multimodal Learning With Intraoperative CBCT & Variably Aligned + Preoperative CT Data To Improve Segmentation MICCAI + + +
+ Cone-beam computed tomography (CBCT) is an important tool facilitating +computer aided interventions, despite often suffering from artifacts that pose +challenges for accurate interpretation. While the degraded image quality can +affect downstream segmentation, the availability of high quality, preoperative +scans represents potential for improvements. Here we consider a setting where +preoperative CT and intraoperative CBCT scans are available, however, the +alignment (registration) between the scans is imperfect. We propose a +multimodal learning method that fuses roughly aligned CBCT and CT scans and +investigate the effect of CBCT quality and misalignment on the final +segmentation performance. For that purpose, we make use of a synthetically +generated data set containing real CT and synthetic CBCT volumes. As an +application scenario, we focus on liver and liver tumor segmentation. We show +that the fusion of preoperative CT and simulated, intraoperative CBCT mostly +improves segmentation performance (compared to using intraoperative CBCT only) +and that even clearly misaligned preoperative data has the potential to improve +segmentation performance. + +
+
+ comment: Submitted to SASHIMI2024 (MICCAI workshop) +
+
+
+
+
+ + ♻ ☆ Fuzzy Attention-based Border Rendering Network for Lung Organ + Segmentation MICCAI 2024 + + +
+ Automatic lung organ segmentation on CT images is crucial for lung disease
+diagnosis. However, the unlimited voxel values and class imbalance of lung
+organs can lead to false-negative/positive and leakage issues in advanced
+methods. Additionally, some slender lung organs, e.g., bronchioles and
+arterioles, are easily lost during the recycled down/up-sample procedure,
+causing severe discontinuity issues. Inspired by these observations, this paper
+introduces an effective lung organ segmentation method called the Fuzzy
+Attention-based Border Rendering (FABR) network. Since fuzzy logic can handle
+the uncertainty in feature extraction, the fusion of deep networks and fuzzy
+sets should be a viable solution for better performance. Meanwhile, unlike
+prior top-tier methods that operate on all regular dense points, our FABR
+depicts lung organ regions as cube-trees, focusing only on recycle-sampled
+border vulnerable points, rendering the severely discontinuous,
+false-negative/positive organ regions with a novel Global-Local Cube-tree
+Fusion (GLCF) module. Experimental results on four challenging airway and
+artery datasets demonstrate that our method achieves favorable performance by a
+significant margin.
+
+
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Exploring the Potential of Multi-Modal AI for Driving Hazard Prediction + + +
+ This paper addresses the problem of predicting hazards that drivers may +encounter while driving a car. We formulate it as a task of anticipating +impending accidents using a single input image captured by car dashcams. Unlike +existing approaches to driving hazard prediction that rely on computational +simulations or anomaly detection from videos, this study focuses on high-level +inference from static images. The problem needs predicting and reasoning about +future events based on uncertain observations, which falls under visual +abductive reasoning. To enable research in this understudied area, a new +dataset named the DHPR (Driving Hazard Prediction and Reasoning) dataset is +created. The dataset consists of 15K dashcam images of street scenes, and each +image is associated with a tuple containing car speed, a hypothesized hazard +description, and visual entities present in the scene. These are annotated by +human annotators, who identify risky scenes and provide descriptions of +potential accidents that could occur a few seconds later. We present several +baseline methods and evaluate their performance on our dataset, identifying +remaining issues and discussing future directions. This study contributes to +the field by introducing a novel problem formulation and dataset, enabling +researchers to explore the potential of multi-modal AI for driving hazard +prediction. + +
+
+ comment: Main Paper: 11 pages, Supplementary Materials: 25 pages +
+
+
+
+
+ + ♻ ☆ PosterLLaVa: Constructing a Unified Multi-modal Layout Generator with + LLM + + +
+ Layout generation is the keystone in achieving automated graphic design,
+requiring arranging the position and size of various multi-modal design
+elements in a visually pleasing and constraint-following manner. Previous
+approaches are either inefficient for large-scale applications or lack
+flexibility for varying design requirements. Our research introduces a unified
+framework for automated graphic layout generation, leveraging the multi-modal
+large language model (MLLM) to accommodate diverse design tasks. In contrast to
+prior approaches, our data-driven method employs structured text (JSON format)
+and visual instruction tuning to generate layouts under specific visual and
+textual constraints, including user-defined natural language specifications. We
+conducted extensive experiments and achieved state-of-the-art (SOTA)
+performance on public multi-modal layout generation benchmarks, demonstrating
+the effectiveness of our method. Moreover, recognizing existing datasets'
+limitations in capturing the complexity of real-world graphic designs, we
+propose two new datasets for much more challenging tasks (user-constrained
+generation and complicated poster), further validating our model's utility in
+real-life settings. Marked by its superior accessibility and adaptability, this
+approach can further automate large-scale graphic design tasks. The code and
+datasets will be publicly available on
+https://github.com/posterllava/PosterLLaVA.
+
+
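For a concrete picture of the structured-text representation mentioned above, a layout can be serialized as JSON along the following lines; the field names and values here are hypothetical illustrations, not the datasets' actual schema.

```python
import json

# Hypothetical structured-text layout an MLLM could emit or consume.
layout = {
    "canvas": {"width": 1024, "height": 768},
    "elements": [
        {"type": "title",  "text": "Summer Sale", "x": 64, "y": 48,  "w": 640, "h": 96},
        {"type": "image",  "src": "hero.png",     "x": 64, "y": 176, "w": 896, "h": 420},
        {"type": "button", "text": "Shop now",    "x": 64, "y": 640, "w": 240, "h": 72},
    ],
}
print(json.dumps(layout, indent=2))
```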
+
+ comment: 10 pages; typos corrected, appendix added +
+
+
+
+
+ + ♻ ☆ DynamicGlue: Epipolar and Time-Informed Data Association in Dynamic + Environments using Graph Neural Networks + + +
+ The assumption of a static environment is common in many geometric computer +vision tasks like SLAM but limits their applicability in highly dynamic scenes. +Since these tasks rely on identifying point correspondences between input +images within the static part of the environment, we propose a graph neural +network-based sparse feature matching network designed to perform robust +matching under challenging conditions while excluding keypoints on moving +objects. We employ a similar scheme of attentional aggregation over graph edges +to enhance keypoint representations as state-of-the-art feature-matching +networks but augment the graph with epipolar and temporal information and +vastly reduce the number of graph edges. Furthermore, we introduce a +self-supervised training scheme to extract pseudo labels for image pairs in +dynamic environments from exclusively unprocessed visual-inertial data. A +series of experiments show the superior performance of our network as it +excludes keypoints on moving objects compared to state-of-the-art feature +matching networks while still achieving similar results regarding conventional +matching metrics. When integrated into a SLAM system, our network significantly +improves performance, especially in highly dynamic scenes. + +
+
+
+
+
+ + ♻ ☆ E-ANT: A Large-Scale Dataset for Efficient Automatic GUI NavigaTion + + +
+ Online GUI navigation on mobile devices has drawn a lot of attention in
+recent years since it contributes to many real-world applications. With the
+rapid development of large language models (LLMs), multimodal large language
+models (MLLMs) have tremendous potential on this task. However, existing MLLMs
+need high-quality data to improve their ability to make correct navigation
+decisions according to human user inputs. In this paper, we develop a novel and
+highly valuable dataset, named \textbf{E-ANT}, the first Chinese GUI navigation
+dataset that contains real human behaviour and high-quality annotated
+screenshots, with nearly 40,000 real human traces over 5,000+ different
+tinyAPPs. Furthermore, we evaluate various powerful MLLMs on E-ANT and show
+their experimental results with sufficient ablations. We believe that our
+proposed dataset will be beneficial for both the evaluation and development of
+GUI navigation and LLM/MLLM decision-making capabilities.
+
+
+
+ comment: 9 pages, 5 figures, Under review +
+
+
+
+
+ + ♻ ☆ VIPriors 4: Visual Inductive Priors for Data-Efficient Deep Learning + Challenges + + +
+ The fourth edition of the "VIPriors: Visual Inductive Priors for +Data-Efficient Deep Learning" workshop features two data-impaired challenges. +These challenges address the problem of training deep learning models for +computer vision tasks with limited data. Participants are limited to training +models from scratch using a low number of training samples and are not allowed +to use any form of transfer learning. We aim to stimulate the development of +novel approaches that incorporate inductive biases to improve the data +efficiency of deep learning models. Significant advancements are made compared +to the provided baselines, where winning solutions surpass the baselines by a +considerable margin in both tasks. As in previous editions, these achievements +are primarily attributed to heavy use of data augmentation policies and large +model ensembles, though novel prior-based methods seem to contribute more to +successful solutions compared to last year. This report highlights the key +aspects of the challenges and their outcomes. + +
+
+
+
+
+ + ♻ ☆ Training morphological neural networks with gradient descent: some + theoretical insights + + +
+ Morphological neural networks, or layers, can be a powerful tool to boost the +progress in mathematical morphology, either on theoretical aspects such as the +representation of complete lattice operators, or in the development of image +processing pipelines. However, these architectures turn out to be difficult to +train when they count more than a few morphological layers, at least within +popular machine learning frameworks which use gradient descent based +optimization algorithms. In this paper we investigate the potential and +limitations of differentiation based approaches and back-propagation applied to +morphological networks, in light of the non-smooth optimization concept of +Bouligand derivative. We provide insights and first theoretical guidelines, in +particular regarding initialization and learning rates. + +
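A minimal sketch of a trainable morphological layer of the kind discussed above: a 1D dilation whose output is a windowed maximum of input-plus-weights. The maximum makes the operation only sub-differentiable, which is exactly where gradient-based training becomes delicate; kernel size and initialization below are illustrative assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Dilation1d(nn.Module):
    """out[i] = max over the window of (x[i + j] + w[j])."""
    def __init__(self, kernel_size=5):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(kernel_size))
        self.kernel_size = kernel_size

    def forward(self, x):                                   # x: (B, L)
        pad = self.kernel_size // 2
        xp = F.pad(x, (pad, pad), value=float("-inf"))      # neutral element for max
        windows = xp.unfold(1, self.kernel_size, 1)          # (B, L, K) sliding windows
        return (windows + self.w).amax(dim=-1)
```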
+
+
+
+
+ + ♻ ☆ YOLOv10 to Its Genesis: A Decadal and Comprehensive Review of The You + Only Look Once Series + + +
+ This review systematically examines the progression of the You Only Look Once
+(YOLO) object detection algorithms from YOLOv1 to the recently unveiled
+YOLOv10. Employing a reverse chronological analysis, this study traces the
+advancements introduced by YOLO algorithms, beginning with YOLOv10 and
+progressing through YOLOv9, YOLOv8, and earlier versions to explore each
+version's contributions to enhancing speed, accuracy, and computational
+efficiency in real-time object detection. The study highlights the
+transformative impact of YOLO across five critical application areas:
+automotive safety, healthcare, industrial manufacturing, surveillance, and
+agriculture. By detailing the incremental technological advancements in
+successive YOLO versions, this review chronicles the evolution of YOLO and
+discusses the challenges and limitations of each earlier version. The
+evolution signifies a path towards integrating YOLO with multimodal,
+context-aware, and Artificial General Intelligence (AGI) systems for the next
+YOLO decade, promising significant implications for future developments in
+AI-driven applications.
+
+
+
+ comment: 11 Figures, 7 Tables +
+
+
+
+
+ + ♻ ☆ A Simple Framework for Open-Vocabulary Zero-Shot Segmentation + + +
+ Zero-shot classification capabilities naturally arise in models trained
+within a vision-language contrastive framework. Despite their classification
+prowess, these models struggle in dense tasks like zero-shot open-vocabulary
+segmentation. This deficiency is often attributed to the absence of
+localization cues in captions and the intertwined nature of the learning
+process, which encompasses both image representation learning and
+cross-modality alignment. To tackle these issues, we propose SimZSS, a Simple
+framework for open-vocabulary Zero-Shot Segmentation. The method is founded on
+two key principles: i) leveraging frozen vision-only models that exhibit
+spatial awareness while exclusively aligning the text encoder and ii)
+exploiting the discrete nature of text and linguistic knowledge to pinpoint
+local concepts within captions. By capitalizing on the quality of the visual
+representations, our method requires only image-caption pair datasets and
+adapts to both small curated and large-scale noisy datasets. When trained on
+COCO Captions across 8 GPUs, SimZSS achieves state-of-the-art results on 7 out
+of 8 benchmark datasets in less than 15 minutes.
+

+
+
+
+
+ + ♻ ☆ VTG-LLM: Integrating Timestamp Knowledge into Video LLMs for Enhanced + Video Temporal Grounding + + +
+ Video Temporal Grounding (VTG) focuses on accurately identifying event +timestamps within a particular video based on a linguistic query, playing a +vital role in downstream tasks such as video browsing and editing. While Video +Large Language Models (video LLMs) have made significant progress in +understanding video content, they often face challenges in accurately +pinpointing timestamps within videos, which limits their performance on VTG +tasks. Therefore, to improve video LLMs' ability to effectively locate +timestamps, we argue that two critical aspects need to be enhanced. First, it +is essential to have high-quality instructional tuning datasets that encompass +mainstream VTG tasks. Second, directly incorporating timestamp knowledge into +video LLMs is crucial, as it enables models to efficiently comprehend timestamp +information. To address these needs, we first introduce VTG-IT-120K, a +high-quality and comprehensive instruction tuning dataset that covers VTG tasks +such as moment retrieval, dense video captioning, video summarization, and +video highlight detection. Furthermore, we propose a specially designed video +LLM model for VTG tasks, VTG-LLM, which (1) effectively integrates timestamp +knowledge into visual tokens; (2) incorporates absolute-time tokens that +specifically handle timestamp knowledge, thereby avoiding concept shifts; and +(3) introduces a lightweight, high-performance slot-based token compression +method to facilitate the sampling of more video frames. Comprehensive +experiments showcase the superior performance of VTG-LLM in comparison to other +video LLM methods across various VTG tasks. Our code and datasets are available +at \url{https://github.com/gyxxyg/VTG-LLM}. + +
+
+
+
+
+ + ♻ ☆ RoadFormer: Duplex Transformer for RGB-Normal Semantic Road Scene + Parsing + + +
+ The recent advancements in deep convolutional neural networks have shown +significant promise in the domain of road scene parsing. Nevertheless, the +existing works focus primarily on freespace detection, with little attention +given to hazardous road defects that could compromise both driving safety and +comfort. In this paper, we introduce RoadFormer, a novel Transformer-based +data-fusion network developed for road scene parsing. RoadFormer utilizes a +duplex encoder architecture to extract heterogeneous features from both RGB +images and surface normal information. The encoded features are subsequently +fed into a novel heterogeneous feature synergy block for effective feature +fusion and recalibration. The pixel decoder then learns multi-scale long-range +dependencies from the fused and recalibrated heterogeneous features, which are +subsequently processed by a Transformer decoder to produce the final semantic +prediction. Additionally, we release SYN-UDTIRI, the first large-scale road +scene parsing dataset that contains over 10,407 RGB images, dense depth images, +and the corresponding pixel-level annotations for both freespace and road +defects of different shapes and sizes. Extensive experimental evaluations +conducted on our SYN-UDTIRI dataset, as well as on three public datasets, +including KITTI road, CityScapes, and ORFD, demonstrate that RoadFormer +outperforms all other state-of-the-art networks for road scene parsing. +Specifically, RoadFormer ranks first on the KITTI road benchmark. Our source +code, created dataset, and demo video are publicly available at +mias.group/RoadFormer. + +
+
+ comment: 10 pages 7 figures. Accepted by Transactions on Intelligent Vehicles +
+
+
+
+
+ + ♻ ☆ 3D Human Mesh Estimation from Virtual Markers CVPR 2023 + + +
+ Inspired by the success of volumetric 3D pose estimation, some recent human
+mesh estimators propose to estimate 3D skeletons as intermediate
+representations, from which the dense 3D meshes are regressed by exploiting
+the mesh topology. However, body shape information is lost in extracting
+skeletons, leading to mediocre performance. Advanced motion capture systems
+solve the problem by placing dense physical markers on the body surface, which
+allows realistic meshes to be extracted from their non-rigid motions. However,
+they cannot be applied to wild images without markers. In this work, we present
+an intermediate representation, named virtual markers, which learns 64 landmark
+keypoints on the body surface based on the large-scale mocap data in a
+generative style, mimicking the effects of physical markers. The virtual
+markers can be accurately detected from wild images and can reconstruct the
+intact meshes with realistic shapes by simple interpolation. Our approach
+outperforms the state-of-the-art methods on three datasets. In particular, it
+surpasses the existing methods by a notable margin on the SURREAL dataset,
+which has diverse body shapes. Code is available at
+https://github.com/ShirleyMaxx/VirtualMarker
+

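A rough sketch of the "reconstruct the mesh by simple interpolation" idea: mesh vertices are written as fixed linear combinations of the 64 detected virtual markers. The interpolation matrix W below is random for illustration only; in the paper it would be learned from mocap data, and the 6890-vertex count is an assumption borrowed from the SMPL body model.

```python
import numpy as np

NUM_MARKERS, NUM_VERTICES = 64, 6890  # 6890 = SMPL vertex count (assumption)

rng = np.random.default_rng(0)
W = rng.random((NUM_VERTICES, NUM_MARKERS))
W /= W.sum(axis=1, keepdims=True)          # rows sum to 1: convex combinations

markers_3d = rng.normal(size=(NUM_MARKERS, 3))   # detected 3D virtual markers
vertices = W @ markers_3d                        # (6890, 3) reconstructed mesh
print(vertices.shape)
```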
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ SemanticFormer: Holistic and Semantic Traffic Scene Representation for + Trajectory Prediction using Knowledge Graphs + + +
+ Trajectory prediction in autonomous driving relies on accurate representation
+of all relevant contexts of the driving scene, including traffic participants,
+road topology, traffic signs, as well as their semantic relations to each
+other. Despite increased attention to this issue, most approaches in trajectory
+prediction do not consider all of these factors sufficiently. We present
+SemanticFormer, an approach for predicting multimodal trajectories by reasoning
+over a semantic traffic scene graph using a hybrid approach. It utilizes
+high-level information in the form of meta-paths, i.e. trajectories on which an
+agent is allowed to drive, obtained from a knowledge graph, which are then
+processed by a novel pipeline based on multiple attention mechanisms to predict
+accurate trajectories. SemanticFormer comprises a hierarchical heterogeneous
+graph encoder to capture spatio-temporal and relational information across
+agents as well as between agents and road elements. Further, it includes a
+predictor to fuse different encodings and decode trajectories with
+probabilities. Finally, a refinement module assesses permitted meta-paths of
+trajectories and speed profiles to obtain final predicted trajectories.
+Evaluation on the nuScenes benchmark demonstrates improved performance compared
+to several SOTA methods. In addition, we demonstrate that our knowledge graph
+can be easily added to two existing graph-based SOTA methods, namely VectorNet
+and Laformer, replacing their original homogeneous graphs. The evaluation
+results suggest that by adding our knowledge graph the performance of the
+original methods is enhanced by 5% and 4%, respectively.
+

+
+ comment: 8 pages, 7 figures, has been accepted for publication in the IEEE + Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ♻ ☆ DifAttack++: Query-Efficient Black-Box Adversarial Attack via + Hierarchical Disentangled Feature Space in Cross-Domain AAAI24 + + +
+ This work investigates efficient score-based black-box adversarial attacks
+with a high Attack Success Rate (\textbf{ASR}) and good generalizability. We
+design a novel attack method based on a hierarchical DIsentangled Feature
+space, called \textbf{DifAttack++}, which differs significantly from the
+existing ones operating over the entire feature space. Specifically,
+DifAttack++ firstly disentangles an image's latent feature into an Adversarial
+Feature (\textbf{AF}) and a Visual Feature (\textbf{VF}) via an autoencoder
+equipped with our specially designed Hierarchical Decouple-Fusion
+(\textbf{HDF}) module, where the AF dominates the adversarial capability of an
+image, while the VF largely determines its visual appearance. We train two such
+autoencoders for the clean and adversarial image domains (i.e., cross-domain)
+respectively to achieve image reconstructions and feature disentanglement, by
+using pairs of clean images and their Adversarial Examples (\textbf{AE}s)
+generated from available surrogate models via white-box attack methods.
+Eventually, in the black-box attack stage, DifAttack++ iteratively optimizes
+the AF according to the query feedback from the victim model until a successful
+AE is generated, while keeping the VF unaltered. Extensive experimental results
+demonstrate that our DifAttack++ achieves superior ASR and query efficiency
+compared to state-of-the-art methods, while exhibiting much better visual
+quality of AEs. The code is available at
+https://github.com/csjunjun/DifAttack.git.
+

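The query-feedback loop described above can be illustrated with a toy sketch (not the authors' implementation): only the disentangled adversarial feature is perturbed, the visual feature is held fixed, and each candidate costs one query to the victim model. The encode/decode/victim_score functions are placeholders standing in for the trained autoencoder and the black-box model.

```python
import numpy as np

rng = np.random.default_rng(0)

def encode(img):                 # placeholder autoencoder encoder
    return img[:64].copy(), img[64:].copy()      # (af, vf)

def decode(af, vf):              # placeholder decoder
    return np.concatenate([af, vf])

def victim_score(img, target):   # placeholder: queried black-box score
    return -np.linalg.norm(img - target)

def attack(img, target, steps=200, sigma=0.05):
    af, vf = encode(img)
    best = decode(af, vf)
    best_score = victim_score(best, target)
    for _ in range(steps):                       # each step = one query
        cand_af = af + sigma * rng.normal(size=af.shape)
        cand = decode(cand_af, vf)               # vf stays untouched
        s = victim_score(cand, target)
        if s > best_score:                       # keep improvements only
            af, best, best_score = cand_af, cand, s
    return best

adv = attack(rng.normal(size=128), rng.normal(size=128))
```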
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2309.14585 An + extension of the AAAI24 paper "DifAttack: Query-Efficient Black-Box Attack + via Disentangled Feature Space." +
+
+
+
+
+ + ♻ ☆ ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation + + +
+ Image-to-video (I2V) generation aims to use the initial frame (alongside a +text prompt) to create a video sequence. A grand challenge in I2V generation is +to maintain visual consistency throughout the video: existing methods often +struggle to preserve the integrity of the subject, background, and style from +the first frame, as well as ensure a fluid and logical progression within the +video narrative. To mitigate these issues, we propose ConsistI2V, a +diffusion-based method to enhance visual consistency for I2V generation. +Specifically, we introduce (1) spatiotemporal attention over the first frame to +maintain spatial and motion consistency, (2) noise initialization from the +low-frequency band of the first frame to enhance layout consistency. These two +approaches enable ConsistI2V to generate highly consistent videos. We also +extend the proposed approaches to show their potential to improve consistency +in auto-regressive long video generation and camera motion control. To verify +the effectiveness of our method, we propose I2V-Bench, a comprehensive +evaluation benchmark for I2V generation. Our automatic and human evaluation +results demonstrate the superiority of ConsistI2V over existing methods. + +
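A minimal sketch of the "noise initialization from the low-frequency band of the first frame" idea: the low frequencies of the first frame's FFT are mixed into the initial Gaussian noise so the global layout is preserved. The cutoff radius and the hard low-pass mask are illustrative assumptions, not the paper's exact recipe.

```python
import torch

def lowfreq_noise_init(first_frame, cutoff=0.25):
    # first_frame: (C, H, W), values roughly in [-1, 1]
    noise = torch.randn_like(first_frame)
    f_frame = torch.fft.fftshift(torch.fft.fft2(first_frame), dim=(-2, -1))
    f_noise = torch.fft.fftshift(torch.fft.fft2(noise), dim=(-2, -1))

    c, h, w = first_frame.shape
    yy, xx = torch.meshgrid(torch.linspace(-1, 1, h), torch.linspace(-1, 1, w),
                            indexing="ij")
    low = ((yy ** 2 + xx ** 2).sqrt() < cutoff)       # centered low-pass mask
    mask = low.to(f_frame.real.dtype)                 # 1 inside the band

    f_mixed = mask * f_frame + (1 - mask) * f_noise   # low freqs from frame
    return torch.fft.ifft2(torch.fft.ifftshift(f_mixed, dim=(-2, -1))).real

init = lowfreq_noise_init(torch.rand(3, 64, 64) * 2 - 1)
```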
+
+ comment: Project Page: https://tiger-ai-lab.github.io/ConsistI2V/ +
+
+
+
+
+ + ♻ ☆ Deep Active Audio Feature Learning in Resource-Constrained Environments + + +
+ The scarcity of labelled data makes training Deep Neural Network (DNN) models +in bioacoustic applications challenging. In typical bioacoustics applications, +manually labelling the required amount of data can be prohibitively expensive. +To effectively identify both new and current classes, DNN models must continue +to learn new features from a modest amount of fresh data. Active Learning (AL) +is an approach that can help with this learning while requiring little +labelling effort. Nevertheless, the use of fixed feature extraction approaches +limits feature quality, resulting in underutilization of the benefits of AL. We +describe an AL framework that addresses this issue by incorporating feature +extraction into the AL loop and refining the feature extractor after each round +of manual annotation. In addition, we use raw audio processing rather than +spectrograms, which is a novel approach. Experiments reveal that the proposed +AL framework requires 14.3%, 66.7%, and 47.4% less labelling effort on +benchmark audio datasets ESC-50, UrbanSound8k, and InsectWingBeat, +respectively, for a large DNN model and similar savings on a +microcontroller-based counterpart. Furthermore, we showcase the practical +relevance of our study by incorporating data from conservation biology +projects. All codes are publicly available on GitHub. + +
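A schematic sketch of the described loop, where the feature extractor itself is refit on the labelled pool after every round of manual annotation instead of staying fixed. All components (extractor, classifier, uncertainty scoring, the oracle labels) are trivial stand-ins used only to show the control flow.

```python
import numpy as np

rng = np.random.default_rng(0)

def train_feature_extractor(x, y):            # placeholder: refit extractor
    mean = x[y == 1].mean(axis=0) if (y == 1).any() else x.mean(axis=0)
    return lambda z: z - mean                  # trivially "adapted" features

def train_classifier(feats, y):                # placeholder linear classifier
    w = np.linalg.lstsq(feats, y, rcond=None)[0]
    return lambda f: f @ w

def uncertainty(scores):                       # closest to decision boundary
    return -np.abs(scores - 0.5)

x_pool = rng.normal(size=(1000, 16))
y_pool = (x_pool[:, 0] > 0).astype(float)      # hidden labels ("oracle")
labelled = list(rng.choice(len(x_pool), 20, replace=False))

for _ in range(5):                             # one iteration per AL round
    extractor = train_feature_extractor(x_pool[labelled], y_pool[labelled])
    feats = extractor(x_pool)
    predict = train_classifier(feats[labelled], y_pool[labelled])
    scores = predict(feats)
    # query the most uncertain unlabelled samples for the next labelling round
    ranked = [i for i in np.argsort(-uncertainty(scores)) if i not in labelled]
    labelled.extend(ranked[:20])
```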
+
+
+
+
+ + ♻ ☆ Scene Graph Generation in Large-Size VHR Satellite Imagery: A + Large-Scale Dataset and A Context-Aware Approach + + +
+ Scene graph generation (SGG) in satellite imagery (SAI) helps promote
+intelligent understanding of geospatial scenarios from perception to cognition.
+In SAI, objects exhibit great variations in scales and aspect ratios, and there
+exist rich relationships between objects (even between spatially disjoint
+objects), which makes it necessary to holistically conduct SGG in large-size
+very-high-resolution (VHR) SAI. However, the lack of SGG datasets with
+large-size VHR SAI has constrained the advancement of SGG in SAI. Due to the
+complexity of large-size VHR SAI, mining relational triplets heavily relies
+on long-range contextual reasoning. Consequently, SGG models designed for
+small-size natural imagery are not directly applicable to large-size VHR SAI.
+To address the scarcity of datasets, this paper constructs a large-scale
+dataset for SGG in large-size VHR SAI with image sizes ranging from 512 x 768
+to 27,860 x 31,096 pixels, named RSG, encompassing over 210,000 objects and
+more than 400,000 triplets. To realize SGG in large-size VHR SAI, we propose a
+context-aware cascade cognition (CAC) framework to understand SAI at three
+levels: object detection (OBD), pair pruning and relationship prediction. As a
+fundamental prerequisite for SGG in large-size SAI, a holistic multi-class
+object detection network (HOD-Net) that can flexibly integrate multi-scale
+contexts is proposed. Considering that there exists a huge number of object
+pairs in large-size SAI but only a minority of object pairs contain meaningful
+relationships, we design a pair proposal generation (PPG) network via
+adversarial reconstruction to select high-value pairs. Furthermore, a
+relationship prediction network with context-aware messaging (RPCM) is proposed
+to predict the relationship types of these pairs.
+

+
+ comment: This paper releases a SAI-oriented SGG toolkit with about 30 OBD + methods and 10 SGG methods, and develops a benchmark based on RSG where our + HOD-Net and RPCM significantly outperform the state-of-the-art methods in + both OBD and SGG tasks. The RSG dataset and SAI-oriented toolkit will be made + publicly available at https://linlin-dev.github.io/project/RSG +
+
+
+
+
+ + ♻ ☆ Long Context Transfer from Language to Vision + + +
+ Video sequences offer valuable temporal information, but existing large +multimodal models (LMMs) fall short in understanding extremely long videos. +Many works address this by reducing the number of visual tokens using visual +resamplers. Alternatively, in this paper, we approach this problem from the +perspective of the language model. By simply extrapolating the context length +of the language backbone, we enable LMMs to comprehend orders of magnitude more +visual tokens without any video training. We call this phenomenon long context +transfer and carefully ablate its properties. To effectively measure LMMs' +ability to generalize to long contexts in the vision modality, we develop +V-NIAH (Visual Needle-In-A-Haystack), a purely synthetic long vision benchmark +inspired by the language model's NIAH test. Our proposed Long Video Assistant +(LongVA) can process 2000 frames or over 200K visual tokens without additional +complexities. With its extended context length, LongVA achieves +state-of-the-art performance on Video-MME among 7B-scale models by densely +sampling more input frames. Our work is open-sourced at +https://github.com/EvolvingLMMs-Lab/LongVA. + +
+
+ comment: Code, demo, and models are available at + https://github.com/EvolvingLMMs-Lab/LongVA +
+
+
+
+
+ + ♻ ☆ EgoVideo: Exploring Egocentric Foundation Model and Downstream + Adaptation CVPR 2024 + + +
+ In this report, we present our solutions to the EgoVis Challenges in CVPR +2024, including five tracks in the Ego4D challenge and three tracks in the +EPIC-Kitchens challenge. Building upon the video-language two-tower model and +leveraging our meticulously organized egocentric video data, we introduce a +novel foundation model called EgoVideo. This model is specifically designed to +cater to the unique characteristics of egocentric videos and provides strong +support for our competition submissions. In the Ego4D challenges, we tackle +various tasks including Natural Language Queries, Step Grounding, Moment +Queries, Short-term Object Interaction Anticipation, and Long-term Action +Anticipation. In addition, we also participate in the EPIC-Kitchens challenge, +where we engage in the Action Recognition, Multiple Instance Retrieval, and +Domain Adaptation for Action Recognition tracks. By adapting EgoVideo to these +diverse tasks, we showcase its versatility and effectiveness in different +egocentric video analysis scenarios, demonstrating the powerful representation +ability of EgoVideo as an egocentric foundation model. Our codebase and +pretrained models are publicly available at +https://github.com/OpenGVLab/EgoVideo. + +
+
+ comment: Champion solutions in the EgoVis CVPR 2024 workshop +
+
+
+
+
+ + ♻ ☆ Video Anomaly Detection in 10 Years: A Survey and Outlook + + +
+ Video anomaly detection (VAD) holds immense importance across diverse domains
+such as surveillance, healthcare, and environmental monitoring. While numerous
+surveys focus on conventional VAD methods, they often lack depth in exploring
+specific approaches and emerging trends. This survey explores deep
+learning-based VAD, expanding beyond traditional supervised training paradigms
+to encompass emerging weakly supervised, self-supervised, and unsupervised
+approaches. A prominent feature of this review is the investigation of core
+challenges within the VAD paradigms including large-scale datasets, feature
+extraction, learning methods, loss functions, regularization, and anomaly score
+prediction. Moreover, this review also investigates the vision language models
+(VLMs) as potent feature extractors for VAD. VLMs integrate visual data with
+textual descriptions or spoken language from videos, enabling a nuanced
+understanding of scenes crucial for anomaly detection. By addressing these
+challenges and proposing future research directions, this review aims to foster
+the development of robust and efficient VAD systems leveraging the capabilities
+of VLMs for enhanced anomaly detection in complex real-world scenarios. This
+comprehensive analysis seeks to bridge existing knowledge gaps, provide
+researchers with valuable insights, and contribute to shaping the future of VAD
+research.
+

+
+
+
+
+ + ♻ ☆ Is Synthetic Data all We Need? Benchmarking the Robustness of Models + Trained with Synthetic Images CVPR 2024 + + +
+ A long-standing challenge in developing machine learning approaches has been +the lack of high-quality labeled data. Recently, models trained with purely +synthetic data, here termed synthetic clones, generated using large-scale +pre-trained diffusion models have shown promising results in overcoming this +annotation bottleneck. As these synthetic clone models progress, they are +likely to be deployed in challenging real-world settings, yet their suitability +remains understudied. Our work addresses this gap by providing the first +benchmark for three classes of synthetic clone models, namely supervised, +self-supervised, and multi-modal ones, across a range of robustness measures. +We show that existing synthetic self-supervised and multi-modal clones are +comparable to or outperform state-of-the-art real-image baselines for a range +of robustness metrics - shape bias, background bias, calibration, etc. However, +we also find that synthetic clones are much more susceptible to adversarial and +real-world noise than models trained with real data. To address this, we find +that combining both real and synthetic data further increases the robustness, +and that the choice of prompt used for generating synthetic images plays an +important part in the robustness of synthetic clones. + +
+
+ comment: Accepted at CVPR 2024 Workshop: SyntaGen-Harnessing Generative Models + for Synthetic Visual Datasets. Project page at + https://synbenchmark.github.io/SynCloneBenchmark Comments: Fix typo in Fig. 1 +
+
+
+
+
+ + ♻ ☆ SketchQL Demonstration: Zero-shot Video Moment Querying with Sketches + + +
+ In this paper, we will present SketchQL, a video database management system +(VDBMS) for retrieving video moments with a sketch-based query interface. This +novel interface allows users to specify object trajectory events with simple +mouse drag-and-drop operations. Users can use trajectories of single objects as +building blocks to compose complex events. Using a pre-trained model that +encodes trajectory similarity, SketchQL achieves zero-shot video moments +retrieval by performing similarity searches over the video to identify clips +that are the most similar to the visual query. In this demonstration, we +introduce the graphic user interface of SketchQL and detail its functionalities +and interaction mechanisms. We also demonstrate the end-to-end usage of +SketchQL from query composition to video moments retrieval using real-world +scenarios. + +
+
+
+
+
+ + ♻ ☆ A Survey on Deep Clustering: From the Prior Perspective + + +
+ Facilitated by the powerful feature extraction ability of neural networks, +deep clustering has achieved great success in analyzing high-dimensional and +complex real-world data. The performance of deep clustering methods is affected +by various factors such as network structures and learning objectives. However, +as pointed out in this survey, the essence of deep clustering lies in the +incorporation and utilization of prior knowledge, which is largely ignored by +existing works. From pioneering deep clustering methods based on data structure +assumptions to recent contrastive clustering methods based on data augmentation +invariances, the development of deep clustering intrinsically corresponds to +the evolution of prior knowledge. In this survey, we provide a comprehensive +review of deep clustering methods by categorizing them into six types of prior +knowledge. We find that in general the prior innovation follows two trends, +namely, i) from mining to constructing, and ii) from internal to external. +Besides, we provide a benchmark on five widely-used datasets and analyze the +performance of methods with diverse priors. By providing a novel prior +knowledge perspective, we hope this survey could provide some novel insights +and inspire future research in the deep clustering community. + +
+
+
+
+
+ + ♻ ☆ Harnessing the Power of MLLMs for Transferable Text-to-Image Person ReID CVPR 2024 + + +
+ Text-to-image person re-identification (ReID) retrieves pedestrian images +according to textual descriptions. Manually annotating textual descriptions is +time-consuming, restricting the scale of existing datasets and therefore the +generalization ability of ReID models. As a result, we study the transferable +text-to-image ReID problem, where we train a model on our proposed large-scale +database and directly deploy it to various datasets for evaluation. We obtain +substantial training data via Multi-modal Large Language Models (MLLMs). +Moreover, we identify and address two key challenges in utilizing the obtained +textual descriptions. First, an MLLM tends to generate descriptions with +similar structures, causing the model to overfit specific sentence patterns. +Thus, we propose a novel method that uses MLLMs to caption images according to +various templates. These templates are obtained using a multi-turn dialogue +with a Large Language Model (LLM). Therefore, we can build a large-scale +dataset with diverse textual descriptions. Second, an MLLM may produce +incorrect descriptions. Hence, we introduce a novel method that automatically +identifies words in a description that do not correspond with the image. This +method is based on the similarity between one text and all patch token +embeddings in the image. Then, we mask these words with a larger probability in +the subsequent training epoch, alleviating the impact of noisy textual +descriptions. The experimental results demonstrate that our methods +significantly boost the direct transfer text-to-image ReID performance. +Benefiting from the pre-trained model weights, we also achieve state-of-the-art +performance in the traditional evaluation settings. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Geometry-Aware Score Distillation via 3D Consistent Noising and Gradient + Consistency Modeling + + +
+ Score distillation sampling (SDS), the methodology in which the score from +pretrained 2D diffusion models is distilled into 3D representation, has +recently brought significant advancements in text-to-3D generation task. +However, this approach is still confronted with critical geometric +inconsistency problems such as the Janus problem. Starting from a hypothesis +that such inconsistency problems may be induced by multiview inconsistencies +between 2D scores predicted from various viewpoints, we introduce GSD, a simple +and general plug-and-play framework for incorporating 3D consistency and +therefore geometry awareness into the SDS process. Our methodology is composed +of three components: 3D consistent noising, designed to produce 3D consistent +noise maps that perfectly follow the standard Gaussian distribution, +geometry-based gradient warping for identifying correspondences between +predicted gradients of different viewpoints, and novel gradient consistency +loss to optimize the scene geometry toward producing more consistent gradients. +We demonstrate that our method significantly improves performance, successfully +addressing the geometric inconsistency problems in text-to-3D generation task +with minimal computation cost and being compatible with existing score +distillation-based models. Our project page is available at +https://ku-cvlab.github.io/GSD/. + +
+
+
+
+
+ + ♻ ☆ GRACE: Graph-Regularized Attentive Convolutional Entanglement with + Laplacian Smoothing for Robust DeepFake Video Detection + + +
+ As DeepFake video manipulation techniques escalate, posing profound threats,
+the urgent need to develop efficient detection strategies is underscored.
+However, one particular issue lies with facial images being mis-detected, often
+originating from degraded videos or adversarial attacks, leading to unexpected
+temporal artifacts that can undermine the efficacy of DeepFake video detection
+techniques. This paper introduces a novel method for robust DeepFake video
+detection, harnessing the power of the proposed Graph-Regularized Attentive
+Convolutional Entanglement (GRACE) based on the graph convolutional network
+with graph Laplacian to address the aforementioned challenges. First,
+conventional Convolutional Neural Networks are deployed to extract
+spatiotemporal features for the entire video. Then, the spatial and temporal
+features are mutually entangled by constructing a graph with a sparsity
+constraint, ensuring that the essential features of valid face images in noisy
+face sequences are retained, thus augmenting stability and performance for
+DeepFake video detection. Furthermore, a Graph Laplacian prior is introduced in
+the graph convolutional network to remove the noise pattern in the feature
+space and further improve the performance. Comprehensive experiments are
+conducted to illustrate that our proposed method delivers state-of-the-art
+performance in DeepFake video detection under noisy face sequences. The source
+code is available at https://github.com/ming053l/GRACE.
+

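To give a concrete feel for the graph-Laplacian regularization mentioned above, here is a toy smoothing step (not the authors' network): per-frame features are propagated over a sparse frame-similarity graph so that noisy frames are pulled toward their neighbours. The implicit one-step formulation and the chain-graph adjacency are illustrative choices.

```python
import numpy as np

def laplacian_smooth(feats, adj, alpha=0.5):
    # feats: (T, D) per-frame features, adj: (T, T) symmetric affinity graph
    deg = np.diag(adj.sum(axis=1))
    lap = deg - adj                               # unnormalized graph Laplacian
    # one implicit smoothing step: (I + alpha * L)^-1 X
    return np.linalg.solve(np.eye(len(feats)) + alpha * lap, feats)

rng = np.random.default_rng(0)
T, D = 16, 8
feats = rng.normal(size=(T, D))
# chain graph: each frame connected to its temporal neighbours
adj = (np.abs(np.subtract.outer(np.arange(T), np.arange(T))) == 1).astype(float)
smoothed = laplacian_smooth(feats, adj)
```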
+
+ comment: Submitted to TPAMI 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ Investigating Nudges toward Related Sellers on E-commerce Marketplaces: + A Case Study on Amazon SC + + +
+ E-commerce marketplaces provide business opportunities to millions of sellers +worldwide. Some of these sellers have special relationships with the +marketplace by virtue of using their subsidiary services (e.g., fulfillment +and/or shipping services provided by the marketplace) -- we refer to such +sellers collectively as Related Sellers. When multiple sellers offer to sell +the same product, the marketplace helps a customer in selecting an offer (by a +seller) through (a) a default offer selection algorithm, (b) showing features +about each of the offers and the corresponding sellers (price, seller +performance metrics, seller's number of ratings etc.), and (c) finally +evaluating the sellers along these features. In this paper, we perform an +end-to-end investigation into how the above apparatus can nudge customers +toward the Related Sellers on Amazon's four different marketplaces in India, +USA, Germany and France. We find that given explicit choices, customers' +preferred offers and algorithmically selected offers can be significantly +different. We highlight that Amazon is adopting different performance metric +evaluation policies for different sellers, potentially benefiting Related +Sellers. For instance, such policies result in notable discrepancy between the +actual performance metric and the presented performance metric of Related +Sellers. We further observe that among the seller-centric features visible to +customers, sellers' number of ratings influences their decisions the most, yet +it may not reflect the true quality of service by the seller, rather reflecting +the scale at which the seller operates, thereby implicitly steering customers +toward larger Related Sellers. Moreover, when customers are shown the rectified +metrics for the different sellers, their preference toward Related Sellers is +almost halved. + +
+
+ comment: This work has been accepted for presentation at the ACM Conference on + Computer-Supported Cooperative Work and Social Computing (CSCW) 2024. It will + appear in Proceedings of the ACM on Human-Computer Interaction +
+
+
+
+
+ + ☆ POST: Email Archival, Processing and Flagging Stack for Incident + Responders + + +
+ Phishing is one of the main points of compromise, with email security and
+awareness estimated at \$50-100B in 2022. There is a great need for email
+forensics capabilities to quickly search for malicious content. A novel
+solution, POST, is proposed. POST is an API-driven serverless email archival,
+processing, and flagging workflow for both large and small organizations that
+collects and parses all email, flags emails using state-of-the-art Natural
+Language Processing and Machine Learning, allows full email searching on every
+aspect of an email, and provides a cost savings of up to 68.6%.
+

+
+ comment: This work was performed under the auspices of the U.S. Department of + Energy by Lawrence Livermore National Laboratory under Contract + DE-AC52-07NA27344. For further information or questions please reach out to + fairbanks6@llnl.gov +
+
+
+
+
+ + ☆ A Global-Local Attention Mechanism for Relation Classification + + +
+ Relation classification, a crucial component of relation extraction, involves +identifying connections between two entities. Previous studies have +predominantly focused on integrating the attention mechanism into relation +classification at a global scale, overlooking the importance of the local +context. To address this gap, this paper introduces a novel global-local +attention mechanism for relation classification, which enhances global +attention with a localized focus. Additionally, we propose innovative hard and +soft localization mechanisms to identify potential keywords for local +attention. By incorporating both hard and soft localization strategies, our +approach offers a more nuanced and comprehensive understanding of the +contextual cues that contribute to effective relation classification. Our +experimental results on the SemEval-2010 Task 8 dataset highlight the superior +performance of our method compared to previous attention-based approaches in +relation classification. + +
+
+ comment: This paper has been accepted by the 2024 20th International + Conference on Natural Computation, Fuzzy Systems and Knowledge Discovery + (ICNC-FSKD) +
+
+
+
+
+ + ☆ Optimization of Retrieval-Augmented Generation Context with Outlier + Detection + + +
+ In this paper, we focus on methods to reduce the size and improve the quality +of the prompt context required for question-answering systems. Attempts to +increase the number of retrieved chunked documents and thereby enlarge the +context related to the query can significantly complicate the processing and +decrease the performance of a Large Language Model (LLM) when generating +responses to queries. It is well known that a large set of documents retrieved +from a database in response to a query may contain irrelevant information, +which often leads to hallucinations in the resulting answers. Our goal is to +select the most semantically relevant documents, treating the discarded ones as +outliers. We propose and evaluate several methods for identifying outliers by +creating features that utilize the distances of embedding vectors, retrieved +from the vector database, to both the centroid and the query vectors. The +methods were evaluated by comparing the similarities of the retrieved LLM +responses to ground-truth answers obtained using the OpenAI GPT-4o model. It +was found that the greatest improvements were achieved with increasing +complexity of the questions and answers. + +
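A minimal sketch of the feature construction described above: each retrieved chunk is scored by its embedding distance to (a) the centroid of all retrieved chunks and (b) the query embedding, and chunks whose combined score looks like an outlier are discarded before prompting the LLM. The combined score and the z-score threshold are illustrative choices, not the paper's exact detector.

```python
import numpy as np

def filter_outliers(chunk_embs, query_emb, z_thresh=1.5):
    centroid = chunk_embs.mean(axis=0)
    d_centroid = np.linalg.norm(chunk_embs - centroid, axis=1)
    d_query = np.linalg.norm(chunk_embs - query_emb, axis=1)
    score = d_centroid + d_query                      # simple combined feature
    z = (score - score.mean()) / (score.std() + 1e-8)
    return z < z_thresh                               # keep non-outlier chunks

rng = np.random.default_rng(0)
embs = rng.normal(size=(20, 384))                     # 20 retrieved chunks
query = rng.normal(size=384)
mask = filter_outliers(embs, query)
print(mask.sum(), "of", len(embs), "chunks kept")
```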
+
+
+
+
+ + ☆ Evaluation of Temporal Change in IR Test Collections + + +
+ Information retrieval systems have been evaluated using the Cranfield
+paradigm for many years. This paradigm allows a systematic, fair, and
+reproducible evaluation of different retrieval methods in fixed experimental
+environments. However, real-world retrieval systems must cope with dynamic
+environments and temporal changes that affect the document collection, topical
+trends, and the individual user's perception of what is considered relevant.
+Yet, the temporal dimension in IR evaluations is still understudied.
+  To this end, this work investigates how the temporal generalizability of
+effectiveness evaluations can be assessed. As a conceptual model, we generalize
+Cranfield-type experiments to the temporal context by classifying the change in
+the essential components according to the create, update, and delete operations
+of persistent storage known from CRUD. From the different types of change,
+different evaluation scenarios are derived, and their implications are
+outlined. Based on these scenarios, renowned state-of-the-art retrieval systems
+are tested, and we investigate how retrieval effectiveness changes at different
+levels of granularity.
+  We show that the proposed measures can be well adapted to describe the
+changes in the retrieval results. The experiments conducted confirm that the
+retrieval effectiveness strongly depends on the evaluation scenario
+investigated. We find that not only the average retrieval performance of single
+systems but also the relative system performance are strongly affected by which
+components change and to what extent these components changed.
+

+
+
+
+
+ + ☆ BERGEN: A Benchmarking Library for Retrieval-Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) allows Large Language Models to be
+enhanced with external knowledge. In response to the recent popularity of
+generative LLMs, many RAG approaches have been proposed, which involve an
+intricate set of different configurations such as evaluation datasets,
+collections, metrics, retrievers, and LLMs. Inconsistent benchmarking poses a
+major challenge in comparing approaches and understanding the impact of each
+component in the pipeline. In this work, we study best practices that lay the
+groundwork for a systematic evaluation of RAG and present BERGEN, an
+end-to-end library for reproducible research standardizing RAG experiments. In
+an extensive study focusing on QA, we benchmark different state-of-the-art
+retrievers, rerankers, and LLMs. Additionally, we analyze existing RAG metrics
+and datasets. Our open-source library BERGEN is available under
+\url{https://github.com/naver/bergen}.
+

+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Deep Domain Specialisation for single-model multi-domain learning to + rank + + +
+ Information Retrieval (IR) practitioners often train separate ranking models
+for different domains (geographic regions, languages, stores, websites,...) as
+it is believed that exclusively training on in-domain data yields the best
+performance when sufficient data is available. Despite their performance gains,
+training multiple models comes at a higher cost to train, maintain and update
+compared to having only a single model responsible for all domains. Our work
+explores consolidated ranking models that serve multiple domains. Specifically,
+we propose a novel architecture of Deep Domain Specialisation (DDS) to
+consolidate multiple domains into a single model. We compare our proposal
+against Deep Domain Adaptation (DDA) and a set of baselines for multi-domain
+models. In our experiments, DDS performed the best overall while requiring
+fewer parameters per domain than other baselines. We show the efficacy of our
+method both with offline experimentation and on a large-scale online experiment
+on Amazon customer traffic.
+

+
+
+
+
+ + ☆ ProductAgent: Benchmarking Conversational Product Search Agent with + Asking Clarification Questions + + +
+ This paper introduces the task of product demand clarification within an +e-commercial scenario, where the user commences the conversation with ambiguous +queries and the task-oriented agent is designed to achieve more accurate and +tailored product searching by asking clarification questions. To address this +task, we propose ProductAgent, a conversational information seeking agent +equipped with abilities of strategic clarification question generation and +dynamic product retrieval. Specifically, we develop the agent with strategies +for product feature summarization, query generation, and product retrieval. +Furthermore, we propose the benchmark called PROCLARE to evaluate the agent's +performance both automatically and qualitatively with the aid of a LLM-driven +user simulator. Experiments show that ProductAgent interacts positively with +the user and enhances retrieval performance with increasing dialogue turns, +where user demands become gradually more explicit and detailed. All the source +codes will be released after the review anonymity period. + +
+
+ comment: 17 pages, 13 tables, 6 figures. Under review +
+
+
+
+
+ + ☆ Unified Dual-Intent Translation for Joint Modeling of Search and + Recommendation + + +
+ Recommendation systems, which assist users in discovering their preferred
+items among numerous options, have served billions of users across various
+online platforms. Intuitively, users' interactions with items are highly driven
+by their unchanging inherent intents (e.g., always preferring high-quality
+items) and changing demand intents (e.g., wanting a T-shirt in summer but a
+down jacket in winter). However, both types of intents are implicitly expressed
+in the recommendation scenario, posing challenges in leveraging them for
+accurate intent-aware recommendations. Fortunately, in the search scenario,
+often found alongside recommendation on the same online platform, users express
+their demand intents explicitly through their query words. Intuitively, in both
+scenarios, a user shares the same inherent intent and the interactions may be
+influenced by the same demand intent. It is therefore feasible to utilize the
+interaction data from both scenarios to reinforce the dual intents for joint
+intent-aware modeling. But the joint modeling should deal with two problems: 1)
+accurately modeling users' implicit demand intents in recommendation; 2)
+modeling the relation between the dual intents and the interactive items. To
+address these problems, we propose a novel model named Unified Dual-Intents
+Translation for joint modeling of Search and Recommendation (UDITSR). To
+accurately simulate users' demand intents in recommendation, we utilize real
+queries from search data as supervision information to guide its generation. To
+explicitly model the relation among the dual intents and the interacted item,
+we propose a dual-intent translation propagation mechanism to learn this
+triplet in the same semantic space via embedding translations. Extensive
+experiments demonstrate that UDITSR outperforms SOTA baselines both in search
+and recommendation tasks.
+

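The translation idea can be sketched with a TransE-style stand-in (an assumption, not the paper's exact formulation): the inherent-intent embedding translated by the demand-intent embedding should land near the embedding of the interacted item, and all three live in one semantic space.

```python
import torch
import torch.nn as nn

class DualIntentTranslation(nn.Module):
    def __init__(self, n_users, n_items, n_queries, dim=64):
        super().__init__()
        self.user = nn.Embedding(n_users, dim)     # inherent intent per user
        self.query = nn.Embedding(n_queries, dim)  # demand intent per query
        self.item = nn.Embedding(n_items, dim)

    def forward(self, u, q, i):
        translated = self.user(u) + self.query(q)  # translate inherent intent
        return ((translated - self.item(i)) ** 2).sum(dim=-1)  # distance score

model = DualIntentTranslation(n_users=100, n_items=500, n_queries=50)
u, q, i = torch.tensor([3]), torch.tensor([7]), torch.tensor([42])
loss = model(u, q, i).mean()   # minimize distance for observed interactions
loss.backward()
```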
+
+
+
+
+ + ☆ Heterogeneous Graph-based Framework with Disentangled Representations + Learning for Multi-target Cross Domain Recommendation + + +
+ CDR (Cross-Domain Recommendation), i.e., leveraging information from multiple
+domains, is a critical solution to the data sparsity problem in recommendation
+systems. The majority of previous research either focused on single-target CDR
+(STCDR) by utilizing data from the source domains to improve the model's
+performance on the target domain, or applied dual-target CDR (DTCDR) by
+integrating data from the source and target domains. In addition, multi-target
+CDR (MTCDR) is a generalization of DTCDR, which is able to capture the links
+among different domains. In this paper we present HGDR (Heterogeneous
+Graph-based Framework with Disentangled Representations Learning), an
+end-to-end heterogeneous network architecture where graph convolutional layers
+are applied to model relations among different domains, while utilizing the
+idea of disentangled representations for domain-shared and domain-specific
+information. First, a shared heterogeneous graph is generated by gathering
+users and items from several domains without any further side information.
+Second, we use HGDR to compute disentangled representations for users and items
+in all domains. Experiments on real-world datasets and online A/B tests prove
+that our proposed model can transmit information among domains effectively and
+reach SOTA performance.
+

+
+
+
+
+ + ☆ Reducing False Discoveries in Statistically-Significant + Regional-Colocation Mining: A Summary of Results + + +
+ Given a set \emph{S} of spatial feature types, its feature instances, a study +area, and a neighbor relationship, the goal is to find pairs $<$a region +($r_{g}$), a subset \emph{C} of \emph{S}$>$ such that \emph{C} is a +statistically significant regional-colocation pattern in $r_{g}$. This problem +is important for applications in various domains including ecology, economics, +and sociology. The problem is computationally challenging due to the +exponential number of regional colocation patterns and candidate regions. +Previously, we proposed a miner \cite{10.1145/3557989.3566158} that finds +statistically significant regional colocation patterns. However, the numerous +simultaneous statistical inferences raise the risk of false discoveries (also +known as the multiple comparisons problem) and carry a high computational cost. +We propose a novel algorithm, namely, multiple comparisons regional colocation +miner (MultComp-RCM) which uses a Bonferroni correction. Theoretical analysis, +experimental evaluation, and case study results show that the proposed method +reduces both the false discovery rate and computational cost. + +
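The Bonferroni correction mentioned above is simple to state: with m simultaneous significance tests, each candidate pattern's p-value is compared against alpha / m, which bounds the family-wise false-discovery risk. A minimal illustration:

```python
def bonferroni_significant(p_values, alpha=0.05):
    m = len(p_values)
    return [p <= alpha / m for p in p_values]

p_values = [0.0004, 0.02, 0.0009, 0.3, 0.001]   # e.g. one per candidate pattern
print(bonferroni_significant(p_values))          # [True, False, True, False, True]
```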
+
+
+
+
+ + ♻ ☆ Amplify Graph Learning for Recommendation via Sparsity Completion + + +
+ Graph learning models have been widely deployed in collaborative filtering +(CF) based recommendation systems. Due to the issue of data sparsity, the graph +structure of the original input lacks potential positive preference edges, +which significantly reduces the performance of recommendations. In this paper, +we study how to enhance the graph structure for CF more effectively, thereby +optimizing the representation of graph nodes. Previous works introduced matrix +completion techniques into CF, proposing the use of either stochastic +completion methods or superficial structure completion to address this issue. +However, most of these approaches employ random numerical filling that lack +control over noise perturbations and limit the in-depth exploration of +higher-order interaction features of nodes, resulting in biased graph +representations. + In this paper, we propose an Amplify Graph Learning framework based on +Sparsity Completion (called AGL-SC). First, we utilize graph neural network to +mine direct interaction features between user and item nodes, which are used as +the inputs of the encoder. Second, we design a factorization-based method to +mine higher-order interaction features. These features serve as perturbation +factors in the latent space of the hidden layer to facilitate generative +enhancement. Finally, by employing the variational inference, the above +multi-order features are integrated to implement the completion and enhancement +of missing graph structures. We conducted benchmark and strategy experiments on +four real-world datasets related to recommendation tasks. The experimental +results demonstrate that AGL-SC significantly outperforms the state-of-the-art +methods. + +
+
+
+
+
+ + ♻ ☆ Autumn: A Scalable Read Optimized LSM-tree based Key-Value Stores with + Fast Point and Range Read Speed + + +
+ Log Structured Merge Tree (LSM-tree) based key-value stores are widely used
+in many storage systems to support a variety of operations such as updates,
+point reads, and range reads. Traditionally, LSM-tree's merge policy organizes
+data into multiple levels of exponentially increasing capacity to support
+high-speed writes. However, we contend that the traditional merge policies are
+not optimized for reads. In this work, we present Autumn, a scalable and
+read-optimized LSM-tree based key-value store with minimal point and range read
+cost. The key idea in improving the read performance is to dynamically adjust
+the capacity ratio between two adjacent levels as more data are stored. As a
+result, smaller levels gradually increase their capacities and merge more
+often. In particular, the point and range read cost improves from the previous
+best known $O(logN)$ complexity to $O(\sqrt{logN})$ in Autumn by applying the
+novel Garnering merge policy. While the Garnering merge policy optimizes for
+both point reads and range reads, it maintains high performance for updates.
+Moreover, to further improve the update costs, Autumn uses a small amount of
+bounded DRAM space to pin/keep the first level of the LSM-tree. We implemented
+Autumn on top of LevelDB and experimentally showcase the gain in performance
+for real-world workloads.
+

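A back-of-the-envelope sketch of why a growing capacity ratio reduces the number of levels (and hence read cost): with a fixed ratio T the level count grows like log_T N, whereas letting the ratio of level i itself grow with i makes the total capacity compound much faster. The growth schedule below is an illustrative assumption, not Autumn's actual Garnering policy.

```python
def num_levels(total_entries, growing=False, base=10, first_level=1024):
    cap, level, stored = first_level, 0, 0
    while stored < total_entries:
        level += 1
        stored += cap
        ratio = base ** level if growing else base   # growing vs fixed ratio
        cap *= ratio
    return level

N = 10 ** 9
print("fixed ratio 10      :", num_levels(N))                # 7 levels
print("ratio grows per level:", num_levels(N, growing=True)) # 4 levels
```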
+
+
+
+
+ + ♻ ☆ Bioptic -- A Target-Agnostic Potency-Based Small Molecules Search Engine + + +
+ Recent successes in virtual screening have been made possible by large models
+and extensive chemical libraries. However, combining these elements is
+challenging: the larger the model, the more expensive it is to run, making
+ultra-large libraries unfeasible. To address this, we developed a
+target-agnostic, efficacy-based molecule search model, which allows us to find
+structurally dissimilar molecules with similar biological activities. We used
+best practices to design a fast retrieval system, based on processor-optimized
+SIMD instructions, enabling us to screen the ultra-large 40B Enamine REAL
+library with a 100\% recall rate. We extensively benchmarked our model and
+several state-of-the-art models for both speed performance and retrieval
+quality of novel molecules.
+

+
+
+
+
+
+
+
+ + Machine Learning 85 + +
+
+
+ + ☆ Adaptive RKHS Fourier Features for Compositional Gaussian Process Models + + +
+ Deep Gaussian Processes (DGPs) leverage a compositional structure to model +non-stationary processes. DGPs typically rely on local inducing point +approximations across intermediate GP layers. Recent advances in DGP inference +have shown that incorporating global Fourier features from Reproducing Kernel +Hilbert Space (RKHS) can enhance the DGPs' capability to capture complex +non-stationary patterns. This paper extends the use of these features to +compositional GPs involving linear transformations. In particular, we introduce +Ordinary Differential Equation (ODE) -based RKHS Fourier features that allow +for adaptive amplitude and phase modulation through convolution operations. +This convolutional formulation relates our work to recently proposed deep +latent force models, a multi-layer structure designed for modelling nonlinear +dynamical systems. By embedding these adjustable RKHS Fourier features within a +doubly stochastic variational inference framework, our model exhibits improved +predictive performance across various regression tasks. + +
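For readers unfamiliar with RKHS Fourier features, a generic random-Fourier-feature construction for an RBF kernel gives the flavor of what "global Fourier features" means; the paper's ODE-based, adaptively modulated construction is more elaborate than this sketch, and the lengthscale and feature count below are arbitrary.

```python
import numpy as np

def rff(x, n_features=256, lengthscale=1.0, seed=0):
    # Random Fourier features approximating k(x, y) = exp(-||x-y||^2 / (2 l^2))
    rng = np.random.default_rng(seed)
    d = x.shape[1]
    w = rng.normal(scale=1.0 / lengthscale, size=(d, n_features))
    b = rng.uniform(0, 2 * np.pi, size=n_features)
    return np.sqrt(2.0 / n_features) * np.cos(x @ w + b)

x = np.random.default_rng(1).normal(size=(5, 3))
phi = rff(x)
approx_kernel = phi @ phi.T        # close to the exact RBF Gram matrix
```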
+
+
+
+
+ + ☆ Improving Multilingual Instruction Finetuning via Linguistically Natural + and Diverse Datasets + + +
+ Advancements in Large Language Models (LLMs) have significantly enhanced +instruction-following capabilities. However, most Instruction Fine-Tuning (IFT) +datasets are predominantly in English, limiting model performance in other +languages. Traditional methods for creating multilingual IFT datasets such as +translating existing English IFT datasets or converting existing NLP datasets +into IFT datasets by templating, struggle to capture linguistic nuances and +ensure prompt (instruction) diversity. To address this issue, we propose a +novel method for collecting multilingual IFT datasets that preserves linguistic +naturalness and ensures prompt diversity. This approach leverages +English-focused LLMs, monolingual corpora, and a scoring function to create +high-quality, diversified IFT datasets in multiple languages. Experiments +demonstrate that LLMs finetuned using these IFT datasets show notable +improvements in both generative and discriminative tasks, indicating enhanced +language comprehension by LLMs in non-English contexts. Specifically, on the +multilingual summarization task, LLMs using our IFT dataset achieved 17.57% and +15.23% improvements over LLMs fine-tuned with translation-based and +template-based datasets, respectively. + +
+
+
+
+
+ + ☆ Meerkat: Audio-Visual Large Language Model for Grounding in Space and + Time ECCV 2024 + + +
+ Leveraging Large Language Models' remarkable proficiency in text-based tasks, +recent works on Multi-modal LLMs (MLLMs) extend them to other modalities like +vision and audio. However, the progress in these directions has been mostly +focused on tasks that only require a coarse-grained understanding of the +audio-visual semantics. We present Meerkat, an audio-visual LLM equipped with a +fine-grained understanding of image and audio both spatially and temporally. +With a new modality alignment module based on optimal transport and a +cross-attention module that enforces audio-visual consistency, Meerkat can +tackle challenging tasks such as audio referred image grounding, image guided +audio temporal localization, and audio-visual fact-checking. Moreover, we +carefully curate a large dataset AVFIT that comprises 3M instruction tuning +samples collected from open-source datasets, and introduce MeerkatBench that +unifies five challenging audio-visual tasks. We achieve state-of-the-art +performance on all these downstream tasks with a relative improvement of up to +37.12%. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Adam-mini: Use Fewer Learning Rates To Gain More + + +
+ We propose Adam-mini, an optimizer that achieves on-par or better performance +than AdamW with 45% to 50% less memory footprint. Adam-mini reduces memory by +cutting down the learning rate resources in Adam (i.e., $1/\sqrt{v}$). We find +that $\geq$ 90% of these learning rates in $v$ could be harmlessly removed if +we (1) carefully partition the parameters into blocks following our proposed +principle on Hessian structure; (2) assign a single but good learning rate to +each parameter block. We further find that, for each of these parameter blocks, +there exists a single high-quality learning rate that can outperform Adam, +provided that sufficient resources are available to search it out. We then +provide one cost-effective way to find good learning rates and propose +Adam-mini. Empirically, we verify that Adam-mini performs on par or better than +AdamW on various language models sized from 125M to 7B for pre-training, +supervised fine-tuning, and RLHF. The reduced memory footprint of Adam-mini +also alleviates communication overheads among GPUs and CPUs, thereby increasing +throughput. For instance, Adam-mini achieves 49.6% higher throughput than AdamW +when pre-training Llama2-7B on $2\times$ A800-80GB GPUs, which saves 33% +wall-clock time for pre-training. + +
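A conceptual sketch of the "one learning rate per parameter block" idea: the Adam second-moment term v is averaged within each block so every weight in the block shares a single adaptive step size. This toy step treats each tensor as a block, omits bias correction, and is not the released Adam-mini optimizer.

```python
import torch

@torch.no_grad()
def blockwise_adam_step(params, state, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
    for i, p in enumerate(params):            # each tensor = one "block"
        g = p.grad
        m, v = state.setdefault(i, (torch.zeros_like(g), torch.tensor(0.0)))
        m = betas[0] * m + (1 - betas[0]) * g
        v = betas[1] * v + (1 - betas[1]) * (g * g).mean()  # scalar per block
        state[i] = (m, v)
        p -= lr * m / (v.sqrt() + eps)        # one adaptive scale per block

w1 = torch.randn(10, 10, requires_grad=True)
w2 = torch.randn(10, requires_grad=True)
loss = (w1.sum() + w2.sum()) ** 2
loss.backward()
opt_state = {}
blockwise_adam_step([w1, w2], opt_state)
```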
+
+
+
+
+ + ♻ ☆ Large Language Models Assume People are More Rational than We Really are + + +
+ In order for AI systems to communicate effectively with people, they must +understand how we make decisions. However, people's decisions are not always +rational, so the implicit internal models of human decision-making in Large +Language Models (LLMs) must account for this. Previous empirical evidence seems +to suggest that these implicit models are accurate -- LLMs offer believable +proxies of human behavior, acting how we expect humans would in everyday +interactions. However, by comparing LLM behavior and predictions to a large +dataset of human decisions, we find that this is actually not the case: when +both simulating and predicting people's choices, a suite of cutting-edge LLMs +(GPT-4o & 4-Turbo, Llama-3-8B & 70B, Claude 3 Opus) assume that people are more +rational than we really are. Specifically, these models deviate from human +behavior and align more closely with a classic model of rational choice -- +expected value theory. Interestingly, people also tend to assume that other +people are rational when interpreting their behavior. As a consequence, when we +compare the inferences that LLMs and people draw from the decisions of others +using another psychological dataset, we find that these inferences are highly +correlated. Thus, the implicit decision-making models of LLMs appear to be +aligned with the human expectation that other people will act rationally, +rather than with how people actually act. + +
+
+
+
+
+ + ♻ ☆ Unmasking Bias in AI: A Systematic Review of Bias Detection and + Mitigation Strategies in Electronic Health Record-based Models + + +
+ Objectives: Leveraging artificial intelligence (AI) in conjunction with +electronic health records (EHRs) holds transformative potential to improve +healthcare. Yet, addressing bias in AI, which risks worsening healthcare +disparities, cannot be overlooked. This study reviews methods to detect and +mitigate diverse forms of bias in AI models developed using EHR data. Methods: +We conducted a systematic review following the Preferred Reporting Items for +Systematic Reviews and Meta-analyses (PRISMA) guidelines, analyzing articles +from PubMed, Web of Science, and IEEE published between January 1, 2010, and +Dec 17, 2023. The review identified key biases, outlined strategies for +detecting and mitigating bias throughout the AI model development process, and +analyzed metrics for bias assessment. Results: Of the 450 articles retrieved, +20 met our criteria, revealing six major bias types: algorithmic, confounding, +implicit, measurement, selection, and temporal. The AI models were primarily +developed for predictive tasks in healthcare settings. Four studies +concentrated on the detection of implicit and algorithmic biases employing +fairness metrics like statistical parity, equal opportunity, and predictive +equity. Sixty proposed various strategies for mitigating biases, especially +targeting implicit and selection biases. These strategies, evaluated through +both performance (e.g., accuracy, AUROC) and fairness metrics, predominantly +involved data collection and preprocessing techniques like resampling, +reweighting, and transformation. Discussion: This review highlights the varied +and evolving nature of strategies to address bias in EHR-based AI models, +emphasizing the urgent needs for the establishment of standardized, +generalizable, and interpretable methodologies to foster the creation of +ethical AI systems that promote fairness and equity in healthcare. + +
+
+ comment: Published in JAMIA Volume 31, Issue 5, May 2024 +
+
+
+
+
+ + ♻ ☆ Fine-tuning can cripple your foundation model; preserving features may + be the solution + + +
+ Pre-trained foundation models, due to their enormous capacity and exposure to vast amounts of data during pre-training, are known to have learned plenty of real-world concepts. An important step in making these pre-trained models effective on downstream tasks is to fine-tune them on related datasets. While various fine-tuning methods have been devised and have been shown to be highly effective, we observe that a fine-tuned model's ability to recognize concepts on tasks $\textit{different}$ from the downstream one is reduced significantly compared to its pre-trained counterpart. This is an undesirable effect of fine-tuning, as a substantial amount of resources was used to learn these pre-trained concepts in the first place. We call this phenomenon "concept forgetting" and show via experiments that most end-to-end fine-tuning approaches suffer heavily from this side effect. To address this, we propose a simple fix by designing a new fine-tuning method called $\textit{LDIFS}$ (short for $\ell_2$ distance in feature space) that, while learning new concepts related to the downstream task, allows a model to preserve its pre-trained knowledge as well. Through extensive experiments on 10 fine-tuning tasks, we show that $\textit{LDIFS}$ significantly reduces concept forgetting. Additionally, we show that LDIFS is highly effective in performing continual fine-tuning on a sequence of tasks, in comparison with both fine-tuning and continual learning baselines. + +
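As a loose illustration of the regularizer described above (not the authors' implementation), a fine-tuning objective of this shape could look like the sketch below; the `features`/`head` split of the model and the weight `lam` are assumptions made for the example:

```python
import torch
import torch.nn.functional as F

def feature_distance_loss(model, pretrained, x, y, lam=1.0):
    """Task loss plus an l2 penalty in feature space that keeps the
    fine-tuned feature extractor close to the frozen pre-trained one.
    `model.features` / `model.head` is an assumed interface, not the paper's API."""
    feats = model.features(x)                     # fine-tuned features
    with torch.no_grad():
        feats_ref = pretrained.features(x)        # frozen reference features
    task_loss = F.cross_entropy(model.head(feats), y)
    reg = ((feats - feats_ref) ** 2).mean()       # l2 distance in feature space
    return task_loss + lam * reg
```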
+
+ comment: Published in TMLR: https://openreview.net/forum?id=kfhoeZCeW7 +
+
+
+
+
+ + ♻ ☆ Does Writing with Language Models Reduce Content Diversity? ICLR 2024 + + +
+ Large language models (LLMs) have led to a surge in collaborative writing +with model assistance. As different users incorporate suggestions from the same +model, there is a risk of decreased diversity in the produced content, +potentially limiting diverse perspectives in public discourse. In this work, we +measure the impact of co-writing on diversity via a controlled experiment, +where users write argumentative essays in three setups -- using a base LLM +(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We +develop a set of diversity metrics and find that writing with InstructGPT (but +not the GPT3) results in a statistically significant reduction in diversity. +Specifically, it increases the similarity between the writings of different +authors and reduces the overall lexical and content diversity. We additionally +find that this effect is mainly attributable to InstructGPT contributing less +diverse text to co-written essays. In contrast, the user-contributed text +remains unaffected by model collaboration. This suggests that the recent +improvement in generation quality from adapting models to human feedback might +come at the cost of more homogeneous and less diverse content. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards objective and systematic evaluation of bias in artificial + intelligence for medical imaging + + +
+ Artificial intelligence (AI) models trained using medical images for clinical +tasks often exhibit bias in the form of disparities in performance between +subgroups. Since not all sources of biases in real-world medical imaging data +are easily identifiable, it is challenging to comprehensively assess how those +biases are encoded in models, and how capable bias mitigation methods are at +ameliorating performance disparities. In this article, we introduce a novel +analysis framework for systematically and objectively investigating the impact +of biases in medical images on AI models. We developed and tested this +framework for conducting controlled in silico trials to assess bias in medical +imaging AI using a tool for generating synthetic magnetic resonance images with +known disease effects and sources of bias. The feasibility is showcased by +using three counterfactual bias scenarios to measure the impact of simulated +bias effects on a convolutional neural network (CNN) classifier and the +efficacy of three bias mitigation strategies. The analysis revealed that the +simulated biases resulted in expected subgroup performance disparities when the +CNN was trained on the synthetic datasets. Moreover, reweighing was identified +as the most successful bias mitigation strategy for this setup, and we +demonstrated how explainable AI methods can aid in investigating the +manifestation of bias in the model using this framework. Developing fair AI +models is a considerable challenge given that many and often unknown sources of +biases can be present in medical imaging datasets. In this work, we present a +novel methodology to objectively study the impact of biases and mitigation +strategies on deep learning pipelines, which can support the development of +clinical AI that is robust and responsible. + +
+
+ comment: Published in the Journal of the American Medical Informatics + Association +
+
+
+
+
+ + ♻ ☆ Evaluation of Deep Learning Semantic Segmentation for Land Cover Mapping + on Multispectral, Hyperspectral and High Spatial Aerial Imagery + + +
+ With the rise of climate change, land cover mapping has become an urgent need in environmental monitoring. The accuracy of land cover classification increasingly depends on improvements in remote sensing data. Land cover classification using satellite imagery has been widely explored in recent years, but existing methodologies retain the drawbacks of being subjective and time-consuming. Some deep learning techniques have been utilized to overcome these limitations. However, most studies implemented just one image type to evaluate algorithms for land cover mapping. Therefore, our study conducted deep learning semantic segmentation on multispectral, hyperspectral, and high spatial resolution aerial image datasets for land cover mapping. This research implemented semantic segmentation methods, namely Unet, Linknet, FPN, and PSPnet, for categorizing vegetation, water, and others (i.e., soil and impervious surface). The LinkNet model obtained high accuracy, with an IoU (Intersection over Union) of 0.92 on all datasets, which is comparable with the other techniques mentioned. In the evaluation across image types, the multispectral images showed higher performance, with an IoU and F1-score of 0.993 and 0.997, respectively. Our outcome highlights the efficiency and broad applicability of LinkNet and multispectral imagery for land cover classification. This research contributes to establishing an open-source approach to land cover segmentation for long-term future applications. + +
+
+ comment: conference, This preprint is based on the following published + conference article: Panuntun, I. A., Chen, Y.-N., Jamaluddin, I., & Tran, T. + L. C., 2023. Evaluation of Deep Learning Semantic Segmentation for Land Cover + Mapping on Multispectral, Hyperspectral and High Spatial Aerial Imagery. 44th + Asian Conference on Remote Sensing, ACRS 2023. Code 198676 +
+
+
+
+
+ + ♻ ☆ Predicting Fairness of ML Software Configurations + + +
+ This paper investigates the relationships between the hyperparameters of machine learning and fairness. Data-driven solutions are increasingly used in critical socio-technical applications where ensuring fairness is important. Rather than explicitly encoding decision logic via control and data structures, ML developers provide input data, perform some pre-processing, choose ML algorithms, and tune hyperparameters (HPs) to infer a program that encodes the decision logic. Prior works report that the selection of HPs can significantly influence fairness. However, tuning HPs to find an ideal trade-off between accuracy, precision, and fairness has remained an expensive and tedious task. Can we predict the fairness of an HP configuration for a given dataset? Are the predictions robust to distribution shifts? + We focus on group fairness notions and investigate the HP space of 5 training algorithms. We first find that tree regressors and XGBoost significantly outperformed deep neural networks and support vector machines in accurately predicting the fairness of HPs. When predicting the fairness of ML hyperparameters under temporal distribution shift, the tree regressors outperform the other algorithms with reasonable accuracy. However, the precision depends on the ML training algorithm, dataset, and protected attributes. For example, the tree regressor model was robust to training data shift from 2014 to 2018 on logistic regression and discriminant analysis HPs with sex as the protected attribute, but not for race and other training algorithms. Our method provides a sound framework to efficiently perform fine-tuning of ML training algorithms and understand the relationships between HPs and fairness. + +
+
+ comment: To Appear in the 20th International Conference on Predictive Models + and Data Analytics in Software Engineering (PROMISE'24) +
+
+
+
+
+ + ♻ ☆ Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning + in Particle Detector Readout + + +
+ Embedded field programmable gate array (eFPGA) technology allows the +implementation of reconfigurable logic within the design of an +application-specific integrated circuit (ASIC). This approach offers the low +power and efficiency of an ASIC along with the ease of FPGA configuration, +particularly beneficial for the use case of machine learning in the data +pipeline of next-generation collider experiments. An open-source framework +called "FABulous" was used to design eFPGAs using 130 nm and 28 nm CMOS +technology nodes, which were subsequently fabricated and verified through +testing. The capability of an eFPGA to act as a front-end readout chip was +assessed using simulation of high energy particles passing through a silicon +pixel sensor. A machine learning-based classifier, designed for reduction of +sensor data at the source, was synthesized and configured onto the eFPGA. A +successful proof-of-concept was demonstrated through reproduction of the +expected algorithm result on the eFPGA with perfect accuracy. Further +development of the eFPGA technology and its application to collider detector +readout is discussed. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Cutting through buggy adversarial example defenses: fixing 1 line of + code breaks Sabre + + +
+ Sabre is a defense to adversarial examples that was accepted at IEEE S&P +2024. We first reveal significant flaws in the evaluation that point to clear +signs of gradient masking. We then show the cause of this gradient masking: a +bug in the original evaluation code. By fixing a single line of code in the +original repository, we reduce Sabre's robust accuracy to 0%. In response to +this, the authors modify the defense and introduce a new defense component not +described in the original paper. But this fix contains a second bug; modifying +one more line of code reduces robust accuracy to below baseline levels. After +we released the first version of our paper online, the authors introduced +another change to the defense; by commenting out one line of code during attack +we reduce the robust accuracy to 0% again. + +
+
+
+
+
+ + ♻ ☆ Affine Invariant Ensemble Transform Methods to Improve Predictive + Uncertainty in Neural Networks + + +
+ We consider the problem of performing Bayesian inference for logistic regression using appropriate extensions of the ensemble Kalman filter. We propose two interacting particle systems that sample from an approximate posterior, and we prove quantitative convergence rates of these interacting particle systems to their mean-field limit as the number of particles tends to infinity. Furthermore, we apply these techniques and examine their effectiveness as methods of Bayesian approximation for quantifying predictive uncertainty in neural networks. + +
+
+
+
+
+ + ♻ ☆ Patch-Prompt Aligned Bayesian Prompt Tuning for Vision-Language Models UAI 2024 + + +
+ For downstream applications of vision-language pre-trained models, there has +been significant interest in constructing effective prompts. Existing works on +prompt engineering, which either require laborious manual designs or optimize +the prompt tuning as a point estimation problem, may fail to describe diverse +characteristics of categories and limit their applications. We introduce a +Bayesian probabilistic resolution to prompt tuning, where the label-specific +stochastic prompts are generated hierarchically by first sampling a latent +vector from an underlying distribution and then employing a lightweight +generative model. Importantly, we semantically regularize the tuning process by +minimizing the statistical distance between the visual patches and linguistic +prompts, which pushes the stochastic label representations to faithfully +capture diverse visual concepts, instead of overfitting the training +categories. We evaluate the effectiveness of our approach on four tasks: +few-shot image recognition, base-to-new generalization, dataset transfer +learning, and domain shifts. Extensive results over 15 datasets show promising +transferability and generalization performance of our proposed model, both +quantitatively and qualitatively. + +
+
+ comment: Accepted by UAI 2024 +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Meta-tuning for Few-shot Generalization Through + Sparse Interpolated Experts + + +
+ Recent successes suggest that parameter-efficient fine-tuning of foundation models has emerged as the state-of-the-art method for transfer learning in vision, replacing the rich literature of alternatives such as meta-learning. In trying to harness the best of both worlds, meta-tuning introduces a subsequent optimization stage of foundation models but has so far only shown limited success and crucially tends to underperform on out-of-distribution (OOD) tasks. In this paper, we introduce Sparse MetA-Tuning (SMAT), a method inspired by sparse mixture-of-experts approaches and trained to isolate subsets of pre-trained parameters automatically for meta-tuning on each task. SMAT successfully overcomes OOD sensitivity and delivers on the promise of enhancing the transfer abilities of vision foundation models beyond parameter-efficient fine-tuning. We establish new state-of-the-art results on a challenging combination of Meta-Dataset augmented with additional OOD tasks in both zero-shot and gradient-based adaptation settings. In addition, we provide a thorough analysis of the superiority of learned over hand-designed sparsity patterns for sparse expert methods and the pivotal importance of the sparsity level in balancing between in-distribution and out-of-distribution generalization. Our code is publicly available. + +
+
+ comment: The Forty-first International Conference on Machine Learning, 2024 +
+
+
+
+
+ + ♻ ☆ Learning the boundary-to-domain mapping using Lifting Product Fourier + Neural Operators for partial differential equations ICML 2024 + + +
+ Neural operators such as the Fourier Neural Operator (FNO) have been shown to +provide resolution-independent deep learning models that can learn mappings +between function spaces. For example, an initial condition can be mapped to the +solution of a partial differential equation (PDE) at a future time-step using a +neural operator. Despite the popularity of neural operators, their use to +predict solution functions over a domain given only data over the boundary +(such as a spatially varying Dirichlet boundary condition) remains unexplored. +In this paper, we refer to such problems as boundary-to-domain problems; they +have a wide range of applications in areas such as fluid mechanics, solid +mechanics, heat transfer etc. We present a novel FNO-based architecture, named +Lifting Product FNO (or LP-FNO) which can map arbitrary boundary functions +defined on the lower-dimensional boundary to a solution in the entire domain. +Specifically, two FNOs defined on the lower-dimensional boundary are lifted +into the higher dimensional domain using our proposed lifting product layer. We +demonstrate the efficacy and resolution independence of the proposed LP-FNO for +the 2D Poisson equation. + +
+
+ comment: Accepted by ICML 2024 AI for Science Workshop +
+
+
+
+
+ + ♻ ☆ Safe Linear Bandits over Unknown Polytopes COLT 2024 + + +
+ The safe linear bandit problem (SLB) is an online approach to linear programming with unknown objective and unknown roundwise constraints, under stochastic bandit feedback of rewards and safety risks of actions. We study the tradeoffs between efficacy and smooth safety costs of SLBs over polytopes, and the role of aggressive doubly-optimistic play in avoiding the strong assumptions made by extant pessimistic-optimistic approaches. + We first elucidate an inherent hardness in SLBs due to the lack of knowledge of constraints: there exist `easy' instances, for which suboptimal extreme points have large `gaps', but on which SLB methods must still incur $\Omega(\sqrt{T})$ regret or safety violations, due to an inability to resolve unknown optima to arbitrary precision. We then analyse a natural doubly-optimistic strategy for the safe linear bandit problem, DOSS, which uses optimistic estimates of both reward and safety risks to select actions, and show that despite the lack of knowledge of constraints or feasible points, DOSS simultaneously obtains tight instance-dependent $O(\log^2 T)$ bounds on efficacy regret, and $\tilde O(\sqrt{T})$ bounds on safety violations. Further, when safety is demanded to a finite precision, violations improve to $O(\log^2 T)$. These results rely on a novel dual analysis of linear bandits: we argue that DOSS proceeds by activating noisy versions of at least $d$ constraints in each round, which allows us to separately analyse rounds where a `poor' set of constraints is activated, and rounds where `good' sets of constraints are activated. The costs in the former are controlled to $O(\log^2 T)$ by developing new dual notions of gaps, based on global sensitivity analyses of linear programs, that quantify the suboptimality of each such set of constraints. The latter costs are controlled to $O(1)$ by explicitly analysing the solutions of optimistic play. + +
+
+ comment: v3: Presented at COLT 2024 +
+
+
+
+
+ + ♻ ☆ Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated + Text + + +
+ Detecting text generated by modern large language models is thought to be +hard, as both LLMs and humans can exhibit a wide range of complex behaviors. +However, we find that a score based on contrasting two closely related language +models is highly accurate at separating human-generated and machine-generated +text. Based on this mechanism, we propose a novel LLM detector that only +requires simple calculations using a pair of pre-trained LLMs. The method, +called Binoculars, achieves state-of-the-art accuracy without any training +data. It is capable of spotting machine text from a range of modern LLMs +without any model-specific modifications. We comprehensively evaluate +Binoculars on a number of text sources and in varied situations. Over a wide +range of document types, Binoculars detects over 90% of generated samples from +ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being +trained on any ChatGPT data. + +
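A rough sketch of a two-model contrast score in this spirit is shown below; the actual Binoculars score additionally normalizes by a cross-perplexity term, so treat this purely as an illustration of the mechanism (the model names are placeholders):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def contrast_score(text, name_a="gpt2", name_b="distilgpt2"):
    """Ratio of average negative log-likelihoods of `text` under two closely
    related causal LMs; machine-generated text tends to score differently
    from human text under such a contrast (illustrative sketch only)."""
    tok = AutoTokenizer.from_pretrained(name_a)
    model_a = AutoModelForCausalLM.from_pretrained(name_a).eval()
    model_b = AutoModelForCausalLM.from_pretrained(name_b).eval()
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        nll_a = model_a(ids, labels=ids).loss.item()   # avg NLL under model A
        nll_b = model_b(ids, labels=ids).loss.item()   # avg NLL under model B
    return nll_a / max(nll_b, 1e-8)
```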
+
+ comment: 20 pages, code available at https://github.com/ahans30/Binoculars +
+
+
+
+
+ + ♻ ☆ Stationary Kernels and Gaussian Processes on Lie Groups and their + Homogeneous Spaces II: non-compact symmetric spaces + + +
+ Gaussian processes are arguably the most important class of spatiotemporal +models within machine learning. They encode prior information about the modeled +function and can be used for exact or approximate Bayesian learning. In many +applications, particularly in physical sciences and engineering, but also in +areas such as geostatistics and neuroscience, invariance to symmetries is one +of the most fundamental forms of prior information one can consider. The +invariance of a Gaussian process' covariance to such symmetries gives rise to +the most natural generalization of the concept of stationarity to such spaces. +In this work, we develop constructive and practical techniques for building +stationary Gaussian processes on a very large class of non-Euclidean spaces +arising in the context of symmetries. Our techniques make it possible to (i) +calculate covariance kernels and (ii) sample from prior and posterior Gaussian +processes defined on such spaces, both in a practical manner. This work is +split into two parts, each involving different technical considerations: part I +studies compact spaces, while part II studies non-compact spaces possessing +certain structure. Our contributions make the non-Euclidean Gaussian process +models we study compatible with well-understood computational techniques +available in standard Gaussian process software packages, thereby making them +accessible to practitioners. + +
+
+
+
+
+ + ♻ ☆ Inverse Evolution Layers: Physics-informed Regularizers for Deep Neural + Networks + + +
+ Traditional image processing methods employing partial differential equations (PDEs) offer a multitude of meaningful regularizers, along with valuable theoretical foundations for a wide range of image-related tasks. This makes their integration into neural networks a promising avenue. In this paper, we introduce a novel regularization approach inspired by the reverse process of PDE-based evolution models. Specifically, we propose inverse evolution layers (IELs), which serve as amplifiers of undesirable properties, penalizing neural networks whose outputs have such characteristics. Using IELs, one can achieve specific regularization objectives and endow neural networks' outputs with corresponding properties of the PDE models. Our experiments, focusing on semantic segmentation tasks using heat-diffusion IELs, demonstrate their effectiveness in mitigating the effects of noisy labels. Additionally, we develop curve-motion IELs to enforce convex shape regularization in neural network-based segmentation models and prevent the generation of concave outputs. Theoretical analysis confirms the efficacy of IELs as an effective regularization mechanism, particularly in handling training with label issues. + +
+
+
+
+
+ + ♻ ☆ On the Convergence of Multi-objective Optimization under Generalized + Smoothness + + +
+ Multi-objective optimization (MOO) is receiving increasing attention in various fields such as multi-task learning. Recent works provide some effective algorithms with theoretical analysis, but they are limited by the standard $L$-smooth or bounded-gradient assumptions, which typically do not hold for neural networks such as recurrent neural networks (RNNs) and transformers. In this paper, we study a more general and realistic class of $\ell$-smooth loss functions, where $\ell$ is a general non-decreasing function of the gradient norm. We develop two novel single-loop algorithms for $\ell$-smooth MOO problems, Generalized Smooth Multi-objective Gradient descent (GSMGrad) and its stochastic variant, Stochastic Generalized Smooth Multi-objective Gradient descent (SGSMGrad), which approximate the conflict-avoidant (CA) direction that maximizes the minimum improvement among objectives. We provide a comprehensive convergence analysis of both algorithms and show that they converge to an $\epsilon$-accurate Pareto stationary point with a guaranteed $\epsilon$-level average CA distance (i.e., the gap between the updating direction and the CA direction) over all iterations, where a total of $\mathcal{O}(\epsilon^{-2})$ and $\mathcal{O}(\epsilon^{-4})$ samples are needed for the deterministic and stochastic settings, respectively. Our algorithms can also guarantee a tighter $\epsilon$-level CA distance in each iteration using more samples. Moreover, we propose a practical variant of GSMGrad named GSMGrad-FA using only constant-level time and space, while achieving the same performance guarantee as GSMGrad. Our experiments validate our theory and demonstrate the effectiveness of the proposed methods. + +
+
+
+
+
+ + ♻ ☆ Rethinking LLM Memorization through the Lens of Adversarial Compression + + +
+ Large language models (LLMs) trained on web-scale datasets raise substantial concerns regarding permissible data usage. One major question is whether these models "memorize" all their training data or whether they integrate many data sources in some way more akin to how a human would learn and synthesize information. The answer hinges, to a large degree, on how we define memorization. In this work, we propose the Adversarial Compression Ratio (ACR) as a metric for assessing memorization in LLMs. A given string from the training data is considered memorized if it can be elicited by a prompt (much) shorter than the string itself -- in other words, if these strings can be "compressed" with the model by computing adversarial prompts of fewer tokens. The ACR overcomes the limitations of existing notions of memorization by (i) offering an adversarial view of measuring memorization, especially for monitoring unlearning and compliance; and (ii) allowing for the flexibility to measure memorization for arbitrary strings at reasonably low compute. Our definition serves as a practical tool for determining when model owners may be violating terms around data usage, providing a potential legal tool and a critical lens through which to address such scenarios. + +
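Given the definition above, the metric itself reduces to a token-length ratio once a separate adversarial prompt search (e.g. a GCG-style optimizer, assumed to exist elsewhere) has produced the shortest eliciting prompt; a minimal sketch:

```python
def adversarial_compression_ratio(target_ids, prompt_ids):
    """ACR = |target tokens| / |shortest adversarial prompt tokens|.
    A ratio above 1 means the prompt 'compresses' the target string."""
    return len(target_ids) / max(len(prompt_ids), 1)

def is_memorized(target_ids, prompt_ids, threshold=1.0):
    """Per the abstract's definition, the string counts as memorized when the
    eliciting prompt is shorter than the string itself."""
    return adversarial_compression_ratio(target_ids, prompt_ids) > threshold
```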
+
+ comment: https://locuslab.github.io/acr-memorization +
+
+
+
+
+ + ♻ ☆ From Alexnet to Transformers: Measuring the Non-linearity of Deep Neural + Networks with Affine Optimal Transport + + +
+ In the last decade, we have witnessed the introduction of several novel deep neural network (DNN) architectures exhibiting ever-increasing performance across diverse tasks. Explaining the upward trend of their performance, however, remains difficult, as different DNN architectures of comparable depth and width -- common factors associated with their expressive power -- may exhibit drastically different performance even when trained on the same dataset. In this paper, we introduce the concept of the non-linearity signature of a DNN, the first theoretically sound solution for approximately measuring the non-linearity of deep neural networks. Built upon a score derived from closed-form optimal transport mappings, this signature provides a better understanding of the inner workings of a wide range of DNN architectures and learning paradigms, with a particular emphasis on computer vision tasks. We provide extensive experimental results that highlight the practical usefulness of the proposed non-linearity signature and its potential for far-reaching implications. The code for our work is available at https://github.com/qbouniot/AffScoreDeep + +
+
+ comment: Code available at https://github.com/qbouniot/AffScoreDeep +
+
+
+
+
+ + ♻ ☆ Efficient Prompt Tuning by Multi-Space Projection and Prompt Fusion + + +
+ Prompt tuning is a promising method to fine-tune a pre-trained language model without retraining its large-scale parameters. Instead, it attaches a soft prompt to the input text, whereby downstream tasks can be well adapted by merely learning the embeddings of prompt tokens. Nevertheless, existing methods still suffer from two challenges: (i) it is hard to balance accuracy and efficiency, as a longer (shorter) soft prompt generally leads to better (worse) accuracy but at the cost of more (less) training time; (ii) the performance may not be consistent when adapting to different downstream tasks, which we attribute to the same embedding space being responsible for the different requirements of downstream tasks. To address these issues, we propose an Efficient Prompt Tuning method (EPT) based on multi-space projection and prompt fusion. Specifically, it decomposes a given soft prompt into a shorter prompt and two low-rank matrices, significantly reducing the training time. Accuracy is also enhanced by leveraging the low-rank matrices and the short prompt as additional knowledge sources to enrich the semantics of the original short prompt. In addition, we project the soft prompt into multiple subspaces to improve performance consistency, and then adaptively learn the combination weights of different spaces through a gating network. Experiments on 13 natural language processing downstream tasks show that our method significantly and consistently outperforms 11 comparison methods, with relative improvements of up to 12.9% and training time decreased by 14%. + +
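One way to picture the decomposition described above is sketched below; the sizes, initialization, and concatenation scheme are illustrative assumptions rather than the authors' code:

```python
import torch
import torch.nn as nn

class DecomposedPrompt(nn.Module):
    """A short trainable prompt plus a low-rank product A @ B standing in for
    the rest of a longer soft prompt, cutting the number of trained entries."""
    def __init__(self, short_len=4, extra_len=16, dim=768, rank=8):
        super().__init__()
        self.short = nn.Parameter(torch.randn(short_len, dim) * 0.02)
        self.A = nn.Parameter(torch.randn(extra_len, rank) * 0.02)
        self.B = nn.Parameter(torch.randn(rank, dim) * 0.02)

    def forward(self):
        # Expand the low-rank factors and prepend the short prompt; the result
        # would be concatenated with the input token embeddings downstream.
        return torch.cat([self.short, self.A @ self.B], dim=0)
```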
+
+
+
+
+ + ♻ ☆ Decomposing Global Feature Effects Based on Feature Interactions + + +
+ Global feature effect methods, such as partial dependence plots, provide an +intelligible visualization of the expected marginal feature effect. However, +such global feature effect methods can be misleading, as they do not represent +local feature effects of single observations well when feature interactions are +present. We formally introduce generalized additive decomposition of global +effects (GADGET), which is a new framework based on recursive partitioning to +find interpretable regions in the feature space such that the +interaction-related heterogeneity of local feature effects is minimized. We +provide a mathematical foundation of the framework and show that it is +applicable to the most popular methods to visualize marginal feature effects, +namely partial dependence, accumulated local effects, and Shapley additive +explanations (SHAP) dependence. Furthermore, we introduce and validate a new +permutation-based interaction test to detect significant feature interactions +that is applicable to any feature effect method that fits into our proposed +framework. We empirically evaluate the theoretical characteristics of the +proposed methods based on various feature effect methods in different +experimental settings. Moreover, we apply our introduced methodology to three +real-world examples to showcase their usefulness. + +
+
+
+
+
+ + ♻ ☆ Federated Temporal Difference Learning with Linear Function + Approximation under Environmental Heterogeneity + + +
+ We initiate the study of federated reinforcement learning under environmental +heterogeneity by considering a policy evaluation problem. Our setup involves +$N$ agents interacting with environments that share the same state and action +space but differ in their reward functions and state transition kernels. +Assuming agents can communicate via a central server, we ask: Does exchanging +information expedite the process of evaluating a common policy? To answer this +question, we provide the first comprehensive finite-time analysis of a +federated temporal difference (TD) learning algorithm with linear function +approximation, while accounting for Markovian sampling, heterogeneity in the +agents' environments, and multiple local updates to save communication. Our +analysis crucially relies on several novel ingredients: (i) deriving +perturbation bounds on TD fixed points as a function of the heterogeneity in +the agents' underlying Markov decision processes (MDPs); (ii) introducing a +virtual MDP to closely approximate the dynamics of the federated TD algorithm; +and (iii) using the virtual MDP to make explicit connections to federated +optimization. Putting these pieces together, we rigorously prove that in a +low-heterogeneity regime, exchanging model estimates leads to linear +convergence speedups in the number of agents. + +
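A bare-bones sketch of this setting, with linear value features, local TD(0) steps, and periodic server averaging, is given below; the step sizes and communication schedule are illustrative only:

```python
import numpy as np

def td0_step(theta, phi_s, phi_next, reward, alpha=0.05, gamma=0.99):
    """One TD(0) update with linear function approximation V(s) = phi(s) @ theta."""
    td_error = reward + gamma * phi_next @ theta - phi_s @ theta
    return theta + alpha * td_error * phi_s

def server_average(local_thetas):
    """Federated averaging: the server replaces every agent's parameters with
    the mean of the agents' locally updated parameters."""
    mean_theta = np.mean(local_thetas, axis=0)
    return [mean_theta.copy() for _ in local_thetas]
```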
+
+
+
+
+ + ♻ ☆ Text2Robot: Evolutionary Robot Design from Text Descriptions + + +
+ Robot design has traditionally been costly and labor-intensive. Despite +advancements in automated processes, it remains challenging to navigate a vast +design space while producing physically manufacturable robots. We introduce +Text2Robot, a framework that converts user text specifications and performance +preferences into physical quadrupedal robots. Within minutes, Text2Robot can +use text-to-3D models to provide strong initializations of diverse +morphologies. Within a day, our geometric processing algorithms and +body-control co-optimization produce a walking robot by explicitly considering +real-world electronics and manufacturability. Text2Robot enables rapid +prototyping and opens new opportunities for robot design with generative +models. + +
+
+ comment: Our project website is at: http://generalroboticslab.com/Text2Robot +
+
+
+
+
+ + ♻ ☆ DCSI -- An improved measure of cluster separability based on separation + and connectedness + + +
+ Whether class labels in a given data set correspond to meaningful clusters is +crucial for the evaluation of clustering algorithms using real-world data sets. +This property can be quantified by separability measures. The central aspects +of separability for density-based clustering are between-class separation and +within-class connectedness, and neither classification-based complexity +measures nor cluster validity indices (CVIs) adequately incorporate them. A +newly developed measure (density cluster separability index, DCSI) aims to +quantify these two characteristics and can also be used as a CVI. Extensive +experiments on synthetic data indicate that DCSI correlates strongly with the +performance of DBSCAN measured via the adjusted Rand index (ARI) but lacks +robustness when it comes to multi-class data sets with overlapping classes that +are ill-suited for density-based hard clustering. Detailed evaluation on +frequently used real-world data sets shows that DCSI can correctly identify +touching or overlapping classes that do not correspond to meaningful +density-based clusters. + +
+
+
+
+
+ + ♻ ☆ Biology-inspired joint distribution neurons based on Hierarchical + Correlation Reconstruction allowing for multidirectional neural networks + + +
+ Biological neural networks seem qualitatively superior (e.g. in learning, flexibility, robustness) to current artificial networks like the Multi-Layer Perceptron (MLP) or Kolmogorov-Arnold Network (KAN). Simultaneously, in contrast to them, they have fundamentally multidirectional signal propagation~\cite{axon}, also of probability distributions, e.g. for uncertainty estimation, and are believed not to be able to use standard backpropagation training~\cite{backprop}. We propose novel artificial neurons based on HCR (Hierarchical Correlation Reconstruction) that remove the above low-level differences: each neuron contains a local joint distribution model (of its connections), representing the joint density on normalized variables as a linear combination of $(f_\mathbf{j})$ orthonormal polynomials: $\rho(\mathbf{x})=\sum_{\mathbf{j}\in B} a_\mathbf{j} f_\mathbf{j}(\mathbf{x})$ for $\mathbf{x} \in [0,1]^d$ and $B$ some chosen basis, with basis growth approaching a complete description of the joint distribution. By various index summations of such an $(a_\mathbf{j})$ tensor used as neuron parameters, we get simple formulas for e.g. conditional expected values for propagation in any direction, like $E[x|y,z]$, $E[y|x]$, which degenerate to a KAN-like parametrization if restricted to pairwise dependencies. Such an HCR network can also propagate probability distributions (also joint) like $\rho(y,z|x)$. It also allows for additional training approaches, like direct $(a_\mathbf{j})$ estimation, through tensor decomposition, or more biologically plausible information bottleneck training: layers directly influence only their neighbors, optimizing content to maximize information about the next layer and to minimize it about the previous one, reducing noise. + +
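For intuition, a one-dimensional sketch of the density model a single such neuron stores is given below, using rescaled Legendre polynomials as the orthonormal basis on [0,1]; the multivariate case uses products of these, and estimating the coefficients as sample means follows from the orthonormality of the basis:

```python
import numpy as np

# First few orthonormal polynomials on [0,1] (rescaled Legendre basis).
BASIS = [
    lambda x: np.ones_like(x),
    lambda x: np.sqrt(3.0) * (2.0 * x - 1.0),
    lambda x: np.sqrt(5.0) * (6.0 * x**2 - 6.0 * x + 1.0),
]

def fit_coefficients(samples):
    """By orthonormality, each a_j is estimated as the sample mean of f_j(x)."""
    return np.array([f(samples).mean() for f in BASIS])

def density(x, a):
    """rho(x) = sum_j a_j f_j(x): the joint-density model of one neuron,
    restricted here to a single normalized variable for brevity."""
    return sum(a_j * f(x) for a_j, f in zip(a, BASIS))
```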
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Robust Model-Based Reinforcement Learning with an Adversarial Auxiliary + Model + + +
+ Reinforcement learning has demonstrated impressive performance in various +challenging problems such as robotics, board games, and classical arcade games. +However, its real-world applications can be hindered by the absence of +robustness and safety in the learned policies. More specifically, an RL agent +that trains in a certain Markov decision process (MDP) often struggles to +perform well in nearly identical MDPs. To address this issue, we employ the +framework of Robust MDPs (RMDPs) in a model-based setting and introduce a novel +learned transition model. Our method specifically incorporates an auxiliary +pessimistic model, updated adversarially, to estimate the worst-case MDP within +a Kullback-Leibler uncertainty set. In comparison to several existing works, +our work does not impose any additional conditions on the training environment, +such as the need for a parametric simulator. To test the effectiveness of the +proposed pessimistic model in enhancing policy robustness, we integrate it into +a practical RL algorithm, called Robust Model-Based Policy Optimization +(RMBPO). Our experimental results indicate a notable improvement in policy +robustness on high-dimensional MuJoCo control tasks, with the auxiliary model +enhancing the performance of the learned policy in distorted MDPs. We further +explore the learned deviation between the proposed auxiliary world model and +the nominal model, to examine how pessimism is achieved. By learning a +pessimistic world model and demonstrating its role in improving policy +robustness, our research contributes towards making (model-based) RL more +robust. + +
+
+ comment: Will be presented at the RL Safety Workshop at RLC 2024 +
+
+
+
+
+ + ♻ ☆ Connectivity Oracles for Predictable Vertex Failures + + +
+ The problem of designing connectivity oracles supporting vertex failures is +one of the basic data structures problems for undirected graphs. It is already +well understood: previous works [Duan--Pettie STOC'10; Long--Saranurak FOCS'22] +achieve query time linear in the number of failed vertices, and it is +conditionally optimal as long as we require preprocessing time polynomial in +the size of the graph and update time polynomial in the number of failed +vertices. + We revisit this problem in the paradigm of algorithms with predictions: we +ask if the query time can be improved if the set of failed vertices can be +predicted beforehand up to a small number of errors. More specifically, we +design a data structure that, given a graph $G=(V,E)$ and a set of vertices +predicted to fail $\widehat{D} \subseteq V$ of size $d=|\widehat{D}|$, +preprocesses it in time $\tilde{O}(d|E|)$ and then can receive an update given +as the symmetric difference between the predicted and the actual set of failed +vertices $\widehat{D} \triangle D = (\widehat{D} \setminus D) \cup (D \setminus +\widehat{D})$ of size $\eta = |\widehat{D} \triangle D|$, process it in time +$\tilde{O}(\eta^4)$, and after that answer connectivity queries in $G \setminus +D$ in time $O(\eta)$. Viewed from another perspective, our data structure +provides an improvement over the state of the art for the \emph{fully dynamic +subgraph connectivity problem} in the \emph{sensitivity setting} +[Henzinger--Neumann ESA'16]. + We argue that the preprocessing time and query time of our data structure are +conditionally optimal under standard fine-grained complexity assumptions. + +
+
+
+
+
+ + ♻ ☆ Efficient Estimation for Longitudinal Networks via Adaptive Merging + + +
+ A longitudinal network consists of a sequence of temporal edges among multiple nodes, where the temporal edges are observed in real time. Such networks have become ubiquitous with the rise of online social platforms and e-commerce, but remain largely under-investigated in the literature. In this paper, we propose an efficient estimation framework for longitudinal networks, leveraging the strengths of adaptive network merging, tensor decomposition, and point processes. It merges neighboring sparse networks so as to enlarge the number of observed edges and reduce estimation variance, whereas the estimation bias introduced by network merging is controlled by exploiting local temporal structures for adaptive network neighborhoods. A projected gradient descent algorithm is proposed to facilitate estimation, where the upper bound of the estimation error in each iteration is established. A thorough analysis is conducted to quantify the asymptotic behavior of the proposed method, which shows that it can significantly reduce the estimation error and also provides guidelines for network merging under various scenarios. We further demonstrate the advantage of the proposed method through extensive numerical experiments on synthetic datasets and a militarized interstate dispute dataset. + +
+
+ comment: 30 pages and 4 figures; appendix including technical proof will be + uploaded later +
+
+
+
+
+ + ♻ ☆ Model Generation with LLMs: From Requirements to UML Sequence Diagrams + + +
+ Complementing natural language (NL) requirements with graphical models can +improve stakeholders' communication and provide directions for system design. +However, creating models from requirements involves manual effort. The advent +of generative large language models (LLMs), ChatGPT being a notable example, +offers promising avenues for automated assistance in model generation. This +paper investigates the capability of ChatGPT to generate a specific type of +model, i.e., UML sequence diagrams, from NL requirements. We conduct a +qualitative study in which we examine the sequence diagrams generated by +ChatGPT for 28 requirements documents of various types and from different +domains. Observations from the analysis of the generated diagrams have +systematically been captured through evaluation logs, and categorized through +thematic analysis. Our results indicate that, although the models generally +conform to the standard and exhibit a reasonable level of understandability, +their completeness and correctness with respect to the specified requirements +often present challenges. This issue is particularly pronounced in the presence +of requirements smells, such as ambiguity and inconsistency. The insights +derived from this study can influence the practical utilization of LLMs in the +RE process, and open the door to novel RE-specific prompting strategies +targeting effective model generation. + +
+
+
+
+
+ + ♻ ☆ Recovering the Pre-Fine-Tuning Weights of Generative Models ICML 2024 + + +
+ The dominant paradigm in generative modeling consists of two steps: i) +pre-training on a large-scale but unsafe dataset, ii) aligning the pre-trained +model with human values via fine-tuning. This practice is considered safe, as +no current method can recover the unsafe, pre-fine-tuning model weights. In +this paper, we demonstrate that this assumption is often false. Concretely, we +present Spectral DeTuning, a method that can recover the weights of the +pre-fine-tuning model using a few low-rank (LoRA) fine-tuned models. In +contrast to previous attacks that attempt to recover pre-fine-tuning +capabilities, our method aims to recover the exact pre-fine-tuning weights. Our +approach exploits this new vulnerability against large-scale models such as a +personalized Stable Diffusion and an aligned Mistral. + +
+
+ comment: ICML 2024. Project page: https://vision.huji.ac.il/spectral_detuning/ +
+
+
+
+
+ + ♻ ☆ Probabilistic Test-Time Generalization by Variational Neighbor-Labeling + + +
+ This paper strives for domain generalization, where models are trained +exclusively on source domains before being deployed on unseen target domains. +We follow the strict separation of source training and target testing, but +exploit the value of the unlabeled target data itself during inference. We make +three contributions. First, we propose probabilistic pseudo-labeling of target +samples to generalize the source-trained model to the target domain at test +time. We formulate the generalization at test time as a variational inference +problem, by modeling pseudo labels as distributions, to consider the +uncertainty during generalization and alleviate the misleading signal of +inaccurate pseudo labels. Second, we learn variational neighbor labels that +incorporate the information of neighboring target samples to generate more +robust pseudo labels. Third, to learn the ability to incorporate more +representative target information and generate more precise and robust +variational neighbor labels, we introduce a meta-generalization stage during +training to simulate the generalization procedure. Experiments on seven +widely-used datasets demonstrate the benefits, abilities, and effectiveness of +our proposal. + +
+
+ comment: Accepted by CoLLAs 2024 +
+
+
+
+
+ + ♻ ☆ Model Internals-based Answer Attribution for Trustworthy + Retrieval-Augmented Generation + + +
+ Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sources, and fail to faithfully reflect LLMs' context usage throughout the generation. In this work, we present MIRAGE -- Model Internals-based RAG Explanations -- a plug-and-play approach using model internals for faithful answer attribution in RAG applications. MIRAGE detects context-sensitive answer tokens and pairs them with retrieved documents contributing to their prediction via saliency methods. We evaluate our proposed approach on a multilingual extractive QA dataset, finding high agreement with human answer attribution. On open-ended QA, MIRAGE achieves citation quality and efficiency comparable to self-citation while also allowing for a finer-grained control of attribution parameters. Our qualitative evaluation highlights the faithfulness of MIRAGE's attributions and underscores the promising application of model internals for RAG answer attribution. + +
+
+ comment: Under review. Code and data released at + https://github.com/Betswish/MIRAGE +
+
+
+
+
+ + ♻ ☆ Bayesian Regression Markets + + +
+ Although machine learning tasks are highly sensitive to the quality of input +data, relevant datasets can often be challenging for firms to acquire, +especially when held privately by a variety of owners. For instance, if these +owners are competitors in a downstream market, they may be reluctant to share +information. Focusing on supervised learning for regression tasks, we develop a +regression market to provide a monetary incentive for data sharing. Our +mechanism adopts a Bayesian framework, allowing us to consider a more general +class of regression tasks. We present a thorough exploration of the market +properties, and show that similar proposals in literature expose the market +agents to sizeable financial risks, which can be mitigated in our setup. + +
+
+ comment: 35 pages, 11 figures, 3 tables. Published in Journal of Machine + Learning Research (2024) +
+
+
+
+
+ + ♻ ☆ In-Context Reinforcement Learning for Variable Action Spaces ICML 2024 + + +
+ Recently, it has been shown that transformers pre-trained on diverse datasets +with multi-episode contexts can generalize to new reinforcement learning tasks +in-context. A key limitation of previously proposed models is their reliance on +a predefined action space size and structure. The introduction of a new action +space often requires data re-collection and model re-training, which can be +costly for some applications. In our work, we show that it is possible to +mitigate this issue by proposing the Headless-AD model that, despite being +trained only once, is capable of generalizing to discrete action spaces of +variable size, semantic content and order. By experimenting with Bernoulli and +contextual bandits, as well as a gridworld environment, we show that +Headless-AD exhibits significant capability to generalize to action spaces it +has never encountered, even outperforming specialized models trained for a +specific set of actions on several environment configurations. Implementation +is available at: https://github.com/corl-team/headless-ad. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with + Uniform PAC Guarantees + + +
+ We study a primal-dual (PD) reinforcement learning (RL) algorithm for online constrained Markov decision processes (CMDPs). Despite its widespread practical use, the existing theoretical literature on PD-RL algorithms for this problem only provides sublinear regret guarantees and fails to ensure convergence to optimal policies. In this paper, we introduce a novel policy gradient PD algorithm with uniform probably approximately correct (Uniform-PAC) guarantees, simultaneously ensuring convergence to optimal policies, sublinear regret, and polynomial sample complexity for any target accuracy. Notably, this represents the first Uniform-PAC algorithm for the online CMDP problem. In addition to the theoretical guarantees, we empirically demonstrate in a simple CMDP that our algorithm converges to optimal policies, while baseline algorithms exhibit oscillatory performance and constraint violation. + +
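The generic primal-dual scaffolding this builds on alternates a policy-gradient step on a Lagrangian with dual ascent on the constraint; a minimal sketch of that scaffolding (the paper's Uniform-PAC machinery is not reproduced here) is:

```python
def dual_ascent(lmbda, avg_constraint_cost, cost_limit, eta=0.01):
    """Projected subgradient ascent on the Lagrange multiplier: grow lambda
    while the average constraint cost exceeds its limit, shrink it otherwise."""
    return max(0.0, lmbda + eta * (avg_constraint_cost - cost_limit))

def penalized_reward(reward, cost, lmbda):
    """The primal (policy-gradient) step maximizes this Lagrangian reward."""
    return reward - lmbda * cost
```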
+
+
+
+
+ + ♻ ☆ Robust Low-Cost Drone Detection and Classification in Low SNR + Environments + + +
+ The proliferation of drones, or unmanned aerial vehicles (UAVs), has raised significant safety concerns due to their potential misuse in activities such as espionage, smuggling, and infrastructure disruption. This paper addresses the critical need for effective drone detection and classification systems that operate independently of UAV cooperation. We evaluate various convolutional neural networks (CNNs) for their ability to detect and classify drones using spectrogram data derived from consecutive Fourier transforms of signal components. The focus is on model robustness in low signal-to-noise ratio (SNR) environments, which is critical for real-world applications. A comprehensive dataset is provided to support future model development. In addition, we demonstrate a low-cost drone detection system using a standard computer, software-defined radio (SDR) and antenna, validated through real-world field testing. On our development dataset, all models consistently achieved an average balanced classification accuracy of >= 85% at SNR > -12dB. In the field test, these models achieved an average balanced accuracy of > 80%, depending on transmitter distance and antenna direction. Our contributions include: a publicly available dataset for model development, a comparative analysis of CNNs for drone detection under low SNR conditions, and the deployment and field evaluation of a practical, low-cost detection system. + +
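The spectrogram front end described above (consecutive Fourier transforms of the received signal) can be sketched as follows; the sampling rate and FFT size are placeholders, not the paper's settings:

```python
import numpy as np
from scipy.signal import spectrogram

def iq_to_db_spectrogram(iq_samples, fs=2_000_000, nperseg=1024):
    """Consecutive FFTs over complex baseband samples, converted to dB,
    producing the image-like input a CNN classifier could operate on."""
    _, _, sxx = spectrogram(iq_samples, fs=fs, nperseg=nperseg,
                            return_onesided=False, mode="complex")
    return 10.0 * np.log10(np.abs(sxx) + 1e-12)
```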
+
+ comment: 10 pages, submitted to IEEE Journal of Radio Frequency Identification +
+
+
+
+
+ + ♻ ☆ AdaCL:Adaptive Continual Learning + + +
+ Class-Incremental Learning aims to update a deep classifier to learn new categories while maintaining or improving its accuracy on previously observed classes. Common methods to prevent forgetting previously learned classes include regularizing the neural network updates and storing exemplars in memory, which come with hyperparameters such as the learning rate, regularization strength, or the number of exemplars. However, these hyperparameters are usually only tuned at the start and then kept fixed throughout the learning sessions, ignoring the fact that newly encountered tasks may have varying levels of novelty or difficulty. This study investigates the necessity of hyperparameter `adaptivity' in Class-Incremental Learning: the ability to dynamically adjust hyperparameters such as the learning rate, regularization strength, and memory size according to the properties of the new task at hand. We propose AdaCL, a Bayesian Optimization-based approach to automatically and efficiently determine the optimal values for those parameters with each learning task. We show that adapting hyperparameters for each new task leads to improvements in accuracy, forgetting, and memory usage. Code is available at https://github.com/ElifCerenGokYildirim/AdaCL. + +
+
+ comment: Published in 1st ContinualAI Unconference +
+
+
+
+
+ + ♻ ☆ Energy-based Epistemic Uncertainty for Graph Neural Networks + + +
+ In domains with interdependent data, such as graphs, quantifying the +epistemic uncertainty of a Graph Neural Network (GNN) is challenging as +uncertainty can arise at different structural scales. Existing techniques +neglect this issue or only distinguish between structure-aware and +structure-agnostic uncertainty without combining them into a single measure. We +propose GEBM, an energy-based model (EBM) that provides high-quality +uncertainty estimates by aggregating energy at different structural levels that +naturally arise from graph diffusion. In contrast to logit-based EBMs, we +provably induce an integrable density in the data space by regularizing the +energy function. We introduce an evidential interpretation of our EBM that +significantly improves the predictive robustness of the GNN. Our framework is a +simple and effective post hoc method applicable to any pre-trained GNN that is +sensitive to various distribution shifts. It consistently achieves the best +separation of in-distribution and out-of-distribution data on 6 out of 7 +anomaly types while having the best average rank over shifts on \emph{all} +datasets. + +
+
+
+
+
+ + ♻ ☆ Minimax Excess Risk of First-Order Methods for Statistical Learning with + Data-Dependent Oracles + + +
+ In this paper, our aim is to analyse the generalization capabilities of +first-order methods for statistical learning in multiple, different yet +related, scenarios including supervised learning, transfer learning, robust +learning and federated learning. To do so, we provide sharp upper and lower +bounds for the minimax excess risk of strongly convex and smooth statistical +learning when the gradient is accessed through partial observations given by a +data-dependent oracle. This novel class of oracles can query the gradient with +any given data distribution, and is thus well suited to scenarios in which the +training data distribution does not match the target (or test) distribution. In +particular, our upper and lower bounds are proportional to the smallest mean +square error achievable by gradient estimators, thus allowing us to easily +derive multiple sharp bounds in the aforementioned scenarios using the +extensive literature on parameter estimation. + +
+
+ comment: 22 pages, 0 figures +
+
+
+
+
+ + ♻ ☆ Fast and Efficient 2-bit LLM Inference on GPU: 2/4/16-bit in a Weight + Matrix with Asynchronous Dequantization + + +
+ Large language models (LLMs) have demonstrated impressive abilities in various domains while the inference cost is expensive. Many previous studies exploit quantization methods to reduce LLM inference cost by reducing latency and memory consumption. Applying 2-bit single-precision weight quantization brings >3% accuracy loss, so the state-of-the-art methods use mixed-precision methods for LLMs (e.g. Llama2-7b, etc.) to improve the accuracy. However, challenges still exist: (1) uneven distribution in the weight matrix, (2) large speed degradation from adding sparse outliers, and (3) time-consuming dequantization operations on GPUs. To tackle these challenges and enable fast and efficient LLM inference on GPUs, we propose the following techniques in this paper: (1) intra-weight mixed-precision quantization, (2) exclusive 2-bit sparse outliers with minimal speed degradation, and (3) asynchronous dequantization. We conduct extensive experiments on different model families (e.g. Llama3, etc.) and model sizes. We achieve 2.91 bits for each weight considering all scales/zeros for different models with negligible loss. As a result, with our 2/4/16 mixed-precision quantization for each weight matrix and asynchronous dequantization during inference, our design achieves an end-to-end speedup of 1.74x for Llama2-7b over the original model, and we reduce both runtime cost and total cost by up to 2.53x and 2.29x with lower GPU requirements. + +
+
+
+
+
+ + ♻ ☆ Remote sensing framework for geological mapping via stacked autoencoders + and clustering + + +
+ Supervised machine learning methods for geological mapping via remote sensing face limitations due to the scarcity of accurately labelled training data, a limitation that can be addressed by unsupervised learning, such as dimensionality reduction and clustering. Dimensionality reduction methods have the potential to play a crucial role in improving the accuracy of geological maps. Although conventional dimensionality reduction methods may struggle with nonlinear data, unsupervised deep learning models such as autoencoders can model non-linear relationships. Stacked autoencoders feature multiple interconnected layers to capture hierarchical data representations useful for remote sensing data. This study presents an unsupervised machine learning-based framework for processing remote sensing data using stacked autoencoders for dimensionality reduction and k-means clustering for mapping geological units. We use Landsat 8, ASTER, and Sentinel-2 datasets to evaluate the framework for geological mapping of the Mutawintji region in Western New South Wales, Australia. We also compare stacked autoencoders with principal component analysis and canonical autoencoders. Our results reveal that the framework produces accurate and interpretable geological maps, efficiently discriminating rock units. We find that the accuracy of stacked autoencoders ranges from 86.6% to 90%, depending on the remote sensing data type, which is superior to that of their counterparts. We also find that the generated maps align with prior geological knowledge of the study area while providing novel insights into geological structures. + +
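A compact sketch of the overall pipeline (stacked autoencoder for dimensionality reduction, then k-means over the latent codes) is given below; the band count, layer widths, and training schedule are placeholders rather than the paper's configuration:

```python
import torch
import torch.nn as nn
from sklearn.cluster import KMeans

class StackedAE(nn.Module):
    """Small stacked autoencoder over per-pixel spectral vectors."""
    def __init__(self, n_bands=10, latent_dim=3):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_bands, 32), nn.ReLU(),
            nn.Linear(32, 8), nn.ReLU(),
            nn.Linear(8, latent_dim))
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 8), nn.ReLU(),
            nn.Linear(8, 32), nn.ReLU(),
            nn.Linear(32, n_bands))

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z), z

def map_units(pixels, n_units=6, epochs=100, lr=1e-3):
    """Train on reconstruction, then cluster the latent codes with k-means."""
    x = torch.tensor(pixels, dtype=torch.float32)
    model = StackedAE(n_bands=x.shape[1])
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    for _ in range(epochs):
        recon, _ = model(x)
        loss = nn.functional.mse_loss(recon, x)
        opt.zero_grad()
        loss.backward()
        opt.step()
    with torch.no_grad():
        _, z = model(x)
    return KMeans(n_clusters=n_units, n_init=10).fit_predict(z.numpy())
```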
+
+
+
+
+ + ♻ ☆ A General Verification Framework for Dynamical and Control Models via + Certificate Synthesis + + +
+ An emerging branch of control theory specialises in certificate learning, +concerning the specification of a desired (possibly complex) system behaviour +for an autonomous or control model, which is then analytically verified by +means of a function-based proof. However, the synthesis of controllers abiding +by these complex requirements is in general a non-trivial task and may elude +the most expert control engineers. This results in a need for automatic +techniques that are able to design controllers and to analyse a wide range of +elaborate specifications. In this paper, we provide a general framework to +encode system specifications and define corresponding certificates, and we +present an automated approach to formally synthesise controllers and +certificates. Our approach contributes to the broad field of safe learning for +control, exploiting the flexibility of neural networks to provide candidate +control and certificate functions, whilst using SMT-solvers to offer a formal +guarantee of correctness. We test our framework by developing a prototype +software tool, and assess its efficacy at verification via control and +certificate synthesis over a large and varied suite of benchmarks. + +
+
+
+
+
+ + ♻ ☆ Contextualized Hybrid Ensemble Q-learning: Learning Fast with Control + Priors + + +
+ Combining Reinforcement Learning (RL) with a prior controller can yield the +best of both worlds: RL can solve complex nonlinear problems, while the +control prior ensures safer exploration and speeds up training. Prior work +largely blends both components with a fixed weight, neglecting that the RL +agent's performance varies with the training progress and across regions in the +state space. Therefore, we advocate for an adaptive strategy that dynamically +adjusts the weighting based on the RL agent's current capabilities. We propose +a new adaptive hybrid RL algorithm, Contextualized Hybrid Ensemble Q-learning +(CHEQ). CHEQ combines three key ingredients: (i) a time-invariant formulation +of the adaptive hybrid RL problem treating the adaptive weight as a context +variable, (ii) a weight adaptation mechanism based on the parametric uncertainty +of a critic ensemble, and (iii) ensemble-based acceleration for data-efficient +RL. Evaluating CHEQ on a car racing task reveals substantially stronger data +efficiency, exploration safety, and transferability to unknown scenarios than +state-of-the-art adaptive hybrid RL methods. + 
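As a rough illustration of uncertainty-adaptive blending, the sketch below mixes a control-prior action with an RL action using a weight derived from the disagreement of a critic ensemble. The linear mapping from ensemble standard deviation to the mixing weight and all thresholds are assumptions made here for illustration; CHEQ's actual adaptation mechanism and training loop differ in detail.

```python
# Blend an RL action with a control prior, weighting the RL action less when
# the critic ensemble disagrees (i.e., when the agent is uncertain).
import numpy as np

def adaptive_weight(q_ensemble, sigma_min=0.05, sigma_max=0.5,
                    w_min=0.2, w_max=1.0):
    """Map critic-ensemble disagreement to an RL weight in [w_min, w_max]."""
    sigma = np.std(q_ensemble)
    t = np.clip((sigma - sigma_min) / (sigma_max - sigma_min), 0.0, 1.0)
    return w_max - t * (w_max - w_min)       # high uncertainty -> trust prior

def blended_action(a_rl, a_prior, q_ensemble):
    w = adaptive_weight(q_ensemble)
    return w * a_rl + (1.0 - w) * a_prior

action = blended_action(a_rl=np.array([0.8]), a_prior=np.array([0.1]),
                        q_ensemble=np.array([1.2, 0.4, 2.0, 0.9, 1.5]))
print(action)
```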
+
+
+
+
+ + ♻ ☆ Explaining the Explainers in Graph Neural Networks: a Comparative Study + + +
+ Following a fast initial breakthrough in graph-based learning, Graph Neural +Networks (GNNs) have reached widespread application in many science and +engineering fields, prompting the need for methods to understand their decision +process. + GNN explainers have started to emerge in recent years, with a multitude of +methods both novel and adapted from other domains. To sort out this plethora of +alternative approaches, several studies have benchmarked the performance of +different explainers in terms of various explainability metrics. However, these +earlier works make no attempt at providing insights into why different GNN +architectures are more or less explainable, or which explainer should be +preferred in a given setting. + In this survey, we fill these gaps by devising a systematic experimental +study, which tests ten explainers on eight representative architectures trained +on six carefully designed graph and node classification datasets. With our +results, we provide key insights on the choice and applicability of GNN +explainers, isolate key components that make them usable and successful, and +provide recommendations on how to avoid common interpretation pitfalls. We +conclude by highlighting open questions and directions for possible future +research. + 
+
+
+
+
+ + ♻ ☆ Mimicking User Data: On Mitigating Fine-Tuning Risks in Closed Large + Language Models + + +
+ Fine-tuning large language models on small, high-quality datasets can enhance +their performance on specific downstream tasks. Recent research shows that +fine-tuning on benign, instruction-following data can inadvertently undo the +safety alignment process and increase a model's propensity to comply with +harmful queries. Although critical, understanding and mitigating safety risks +in well-defined tasks remains distinct from the instruction-following context +due to structural differences in the data. Our work addresses the gap in our +understanding of these risks across diverse types of data in closed models - +where providers control how user data is utilized in the fine-tuning process. +We demonstrate how malicious actors can subtly manipulate the structure of +almost any task-specific dataset to foster significantly more dangerous model +behaviors, while maintaining an appearance of innocuity and reasonable +downstream task performance. To address this issue, we propose a novel +mitigation strategy that mixes in safety data which mimics the task format and +prompting style of the user data, showing this is more effective than existing +baselines at re-establishing safety alignment while maintaining similar task +performance. + +
+
+
+
+
+ + ♻ ☆ Training-Free Acceleration of ViTs with Delayed Spatial Merging ICML 2024 + + +
+ Token merging has emerged as a new paradigm that can accelerate the inference +of Vision Transformers (ViTs) without any retraining or fine-tuning. To push +the frontier of training-free acceleration in ViTs, we improve token merging by +adding the perspectives of 1) activation outliers and 2) hierarchical +representations. Through a careful analysis of the attention behavior in ViTs, +we characterize a delayed onset of the convergent attention phenomenon, which +makes token merging undesirable in the bottom blocks of ViTs. Moreover, we +augment token merging with a hierarchical processing scheme to capture +multi-scale redundancy between visual tokens. Combining these two insights, we +build a unified inference framework called DSM: Delayed Spatial Merging. We +extensively evaluate DSM on various ViT model scales (Tiny to Huge) and tasks +(ImageNet-1k and transfer learning), achieving up to 1.8$\times$ FLOP reduction +and 1.6$\times$ throughput speedup at a negligible loss while being two orders +of magnitude faster than existing methods. + +
+
+ comment: ICML 2024 ES-FoMo Workshop +
+
+
+
+
+ + ♻ ☆ CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay ICML'24 + + +
+ Large language models are increasingly solving tasks that are commonly +believed to require human-level reasoning ability. However, these models still +perform very poorly on benchmarks of general intelligence such as the +Abstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a +programming-by-examples problem, and introduce a novel and scalable method for +language model self-improvement called Code Iteration (CodeIt). Our method +iterates between 1) program sampling and hindsight relabeling, and 2) learning +from prioritized experience replay. By relabeling the goal of an episode (i.e., +the target program output given input) to the realized output produced by the +sampled program, our method effectively deals with the extreme sparsity of +rewards in program synthesis. Applying CodeIt to the ARC dataset, we +demonstrate that prioritized hindsight replay, along with pre-training and +data-augmentation, leads to successful inter-task generalization. CodeIt is the +first neuro-symbolic approach that scales to the full ARC evaluation dataset. +Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art +performance and outperforming existing neural and symbolic baselines. Our code +is available at https://github.com/Qualcomm-AI-research/codeit . + +
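The core relabeling step can be illustrated with a small sketch: whatever output the sampled program actually produces becomes the new target for that episode, so even a "failed" program yields a valid training example. The toy programs here are plain Python callables, which is an assumption for illustration; CodeIt's DSL, prioritization, and replay details are not modeled.

```python
# Hindsight relabeling for programming-by-examples: replace the episode's
# target output with the realized output of the sampled program.
from dataclasses import dataclass

@dataclass
class Episode:
    inputs: list
    target_outputs: list
    program: str

def run_program(program_src, x):
    env = {}
    exec(program_src, env)                 # defines f(x); toy stand-in for a DSL
    return env["f"](x)

def hindsight_relabel(inputs, sampled_program):
    realized = [run_program(sampled_program, x) for x in inputs]
    return Episode(inputs=inputs, target_outputs=realized,
                   program=sampled_program)

replay_buffer = []
sampled = "def f(x):\n    return [v + 1 for v in x]"
replay_buffer.append(hindsight_relabel([[1, 2, 3]], sampled))
print(replay_buffer[0].target_outputs)     # [[2, 3, 4]] becomes the new goal
```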
+
+ comment: ICML'24 camera-ready version +
+
+
+
+
+ + ♻ ☆ CoCoST: Automatic Complex Code Generation with Online Searching and + Correctness Testing + + +
+ Large Language Models have revolutionized code generation by +converting natural language descriptions into executable code. However, +generating complex code within real-world scenarios remains challenging due to +intricate structures, subtle bugs, understanding of advanced data types, and +a lack of supplementary content. To address these challenges, we introduce the +CoCoST framework, which enhances complex code generation by online searching +for more information with planned queries and correctness testing for code +refinement. Moreover, CoCoST serializes the complex inputs and outputs to +improve comprehension and generates test cases to ensure adaptability to +real-world applications. CoCoST is validated through rigorous experiments on +the DS-1000 and ClassEval datasets. Experimental results show that CoCoST +substantially improves the quality of complex code generation, highlighting its +potential to enhance the practicality of LLMs in generating complex code. + 
+
+
+
+
+ + ♻ ☆ Multimodal Learning With Intraoperative CBCT & Variably Aligned + Preoperative CT Data To Improve Segmentation MICCAI + + +
+ Cone-beam computed tomography (CBCT) is an important tool facilitating +computer aided interventions, despite often suffering from artifacts that pose +challenges for accurate interpretation. While the degraded image quality can +affect downstream segmentation, the availability of high quality, preoperative +scans represents potential for improvements. Here we consider a setting where +preoperative CT and intraoperative CBCT scans are available, however, the +alignment (registration) between the scans is imperfect. We propose a +multimodal learning method that fuses roughly aligned CBCT and CT scans and +investigate the effect of CBCT quality and misalignment on the final +segmentation performance. For that purpose, we make use of a synthetically +generated data set containing real CT and synthetic CBCT volumes. As an +application scenario, we focus on liver and liver tumor segmentation. We show +that the fusion of preoperative CT and simulated, intraoperative CBCT mostly +improves segmentation performance (compared to using intraoperative CBCT only) +and that even clearly misaligned preoperative data has the potential to improve +segmentation performance. + +
+
+ comment: Submitted to SASHIMI2024 (MICCAI workshop) +
+
+
+
+
+ + ♻ ☆ $σ$-PCA: a building block for neural learning of identifiable + linear transformations + + +
+ Linear principal component analysis (PCA) learns (semi-)orthogonal +transformations by orienting the axes to maximize variance. Consequently, it +can only identify orthogonal axes whose variances are clearly distinct, but it +cannot identify the subsets of axes whose variances are roughly equal. It +cannot eliminate the subspace rotational indeterminacy: it fails to disentangle +components with equal variances (eigenvalues), resulting, in each eigen +subspace, in randomly rotated axes. In this paper, we propose $\sigma$-PCA, a +method that (1) formulates a unified model for linear and nonlinear PCA, the +latter being a special case of linear independent component analysis (ICA), and +(2) introduces a missing piece into nonlinear PCA that allows it to eliminate, +from the canonical linear PCA solution, the subspace rotational indeterminacy +-- without whitening the inputs. Whitening, a preprocessing step which converts +the inputs into unit-variance inputs, has generally been a prerequisite step +for linear ICA methods, which meant that conventional nonlinear PCA could not +necessarily preserve the orthogonality of the overall transformation, could not +directly reduce dimensionality, and could not intrinsically order by variances. +We offer insights on the relationship between linear PCA, nonlinear PCA, and +linear ICA -- three methods with autoencoder formulations for learning special +linear transformations from data, transformations that are (semi-)orthogonal +for PCA, and arbitrary unit-variance for ICA. As part of our formulation, +nonlinear PCA can be seen as a method that maximizes both variance and +statistical independence, lying in the middle between linear PCA and linear +ICA, serving as a building block for learning linear transformations that are +identifiable. + +
+
+ comment: Update with published version +
+
+
+
+
+ + ♻ ☆ Generalization Error of Graph Neural Networks in the Mean-field Regime ICML 2024 + + +
+ This work provides a theoretical framework for assessing the generalization +error of graph neural networks in the over-parameterized regime, where the +number of parameters surpasses the quantity of data points. We explore two +widely utilized types of graph neural networks: graph convolutional neural +networks and message passing graph neural networks. Prior to this study, +existing bounds on the generalization error in the over-parametrized regime +were uninformative, limiting our understanding of over-parameterized network +performance. Our novel approach involves deriving upper bounds within the +mean-field regime for evaluating the generalization error of these graph neural +networks. We establish upper bounds with a convergence rate of $O(1/n)$, where +$n$ is the number of graph samples. These upper bounds offer a theoretical +assurance of the networks' performance on unseen data in the challenging +over-parameterized regime and overall contribute to our understanding of their +performance. + +
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ Uni-Mol2: Exploring Molecular Pretraining Model at Scale + + +
+ In recent years, pretraining models have made significant advancements in the +fields of natural language processing (NLP), computer vision (CV), and life +sciences. The significant advancements in NLP and CV are predominantly driven +by the expansion of model parameters and data size, a phenomenon now recognized +as scaling laws. However, scaling laws in molecular +pretraining models remain unexplored. In this work, we present Uni-Mol2, an +innovative molecular pretraining model that leverages a two-track transformer +to effectively integrate features at the atomic level, graph level, and +geometry structure level. Along with this, we systematically investigate the +scaling law within molecular pretraining models, characterizing the power-law +correlations between validation loss and model size, dataset size, and +computational resources. Consequently, we successfully scale Uni-Mol2 to 1.1 +billion parameters through pretraining on 800 million conformations, making it +the largest molecular pretraining model to date. Extensive experiments show +consistent improvement in the downstream tasks as the model size grows. The +Uni-Mol2 with 1.1B parameters also outperforms existing methods, achieving an +average improvement of 27% on the QM9 dataset and 14% on the COMPAS-1D dataset. + 
+
+
+
+
+ + ♻ ☆ Long-term drought prediction using deep neural networks based on + geospatial weather data + + +
+ The problem of high-quality drought forecasting up to a year in advance is +critical for agricultural planning and insurance. Yet, it remains unsolved with +reasonable accuracy due to data complexity and the stochastic nature of aridity. +We tackle drought data by introducing an end-to-end approach that adopts a +spatio-temporal neural network model with accessible open monthly climate data +as the input. + Our systematic research employs diverse proposed models and five distinct +environmental regions as a testbed to evaluate the efficacy of Palmer +Drought Severity Index (PDSI) prediction. A key aggregated finding is the +exceptional performance of a Transformer model, EarthFormer, in making accurate +short-term (up to six months) forecasts. At the same time, the Convolutional +LSTM excels in longer-term forecasting. Both models achieved high ROC AUC +scores: 0.948 for one-month-ahead and 0.617 for twelve-month-ahead forecasts, +moving $54\%$ and $16\%$ closer to a perfect ROC AUC, respectively, compared to +classic approaches. + 
+
+
+
+
+ + ♻ ☆ Total Variation Distance Meets Probabilistic Inference ICML + + +
+ In this paper, we establish a novel connection between total variation (TV) +distance estimation and probabilistic inference. In particular, we present an +efficient, structure-preserving reduction from relative approximation of TV +distance to probabilistic inference over directed graphical models. This +reduction leads to a fully polynomial randomized approximation scheme (FPRAS) +for estimating TV distances between same-structure distributions over any class +of Bayes nets for which there is an efficient probabilistic inference +algorithm. In particular, it leads to an FPRAS for estimating TV distances +between distributions that are defined over a common Bayes net of small +treewidth. Prior to this work, such approximation schemes only existed for +estimating TV distances between product distributions. Our approach employs a +new notion of \emph{partial} couplings of high-dimensional distributions, which +might be of independent interest. + 
+
+ comment: 25 pages. This work has been accepted for presentation at the + International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ A Deep Learning Approach for Overall Survival Prediction in Lung Cancer + with Missing Values + + +
+ In the field of lung cancer research, particularly in the analysis of overall +survival (OS), artificial intelligence (AI) serves crucial roles with specific +aims. Given the prevalent issue of missing data in the medical domain, our +primary objective is to develop an AI model capable of dynamically handling +this missing data. Additionally, we aim to leverage all accessible data, +effectively analyzing both uncensored patients who have experienced the event +of interest and censored patients who have not, by embedding a specialized +technique within our AI model, not commonly utilized in other AI tasks. Through +the realization of these objectives, our model aims to provide precise OS +predictions for non-small cell lung cancer (NSCLC) patients, thus overcoming +these significant challenges. We present a novel approach to survival analysis +with missing values in the context of NSCLC, which exploits the strengths of +the transformer architecture to account only for available features without +requiring any imputation strategy. More specifically, this model tailors the +transformer architecture to tabular data by adapting its feature embedding and +masked self-attention to mask missing data and fully exploit the available +ones. By making use of ad-hoc designed losses for OS, it is able to account for +both censored and uncensored patients, as well as changes in risks over time. +We compared our method with state-of-the-art models for survival analysis +coupled with different imputation strategies. We evaluated the results obtained +over a period of 6 years using different time granularities obtaining a +Ct-index, a time-dependent variant of the C-index, of 71.97, 77.58 and 80.72 +for time units of 1 month, 1 year and 2 years, respectively, outperforming all +state-of-the-art methods regardless of the imputation method used. + +
+
+ comment: 24 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Model Compression Method for S4 with Diagonal State Space Layers using + Balanced Truncation + + +
+ To implement deep learning models on edge devices, model compression methods +have been widely recognized as useful. However, it remains unclear which model +compression methods are effective for Structured State Space Sequence (S4) +models incorporating Diagonal State Space (DSS) layers, tailored for processing +long-sequence data. In this paper, we propose using balanced truncation, a +prevalent model reduction technique in control theory, applied specifically to +the DSS layers of a pre-trained S4 model, as a novel model compression method. +Moreover, we propose using the reduced model parameters obtained by +balanced truncation as initial parameters of S4 models with DSS layers during +the main training process. Numerical experiments demonstrate that our trained +models combined with balanced truncation surpass conventionally trained +models with Skew-HiPPO initialization in accuracy, even with fewer parameters. +Furthermore, our observations reveal a positive correlation: higher accuracy in +the original model consistently leads to increased accuracy in models trained +using our model compression method, suggesting that our approach effectively +leverages the strengths of the original model. + 
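For readers unfamiliar with the reduction step, the following is a standard square-root balanced truncation of a stable LTI system (A, B, C), which is the textbook form of the technique named above. It is shown only to illustrate the reduction itself; the toy system and the rank are arbitrary, and applying this to the DSS layers of a pre-trained S4 model requires the paper's specific mapping between layer parameters and (A, B, C).

```python
# Square-root balanced truncation of a stable LTI system.
import numpy as np
from scipy.linalg import solve_continuous_lyapunov, svd, cholesky

def balanced_truncation(A, B, C, r):
    P = solve_continuous_lyapunov(A, -B @ B.T)     # controllability Gramian
    Q = solve_continuous_lyapunov(A.T, -C.T @ C)   # observability Gramian
    Lp = cholesky(P, lower=True)
    Lq = cholesky(Q, lower=True)
    U, s, Vt = svd(Lq.T @ Lp)                      # s = Hankel singular values
    S_inv_sqrt = np.diag(1.0 / np.sqrt(s[:r]))
    T = Lp @ Vt[:r].T @ S_inv_sqrt                 # balancing projections
    Ti = S_inv_sqrt @ U[:, :r].T @ Lq.T
    return Ti @ A @ T, Ti @ B, C @ T, s

A = np.diag([-1.0, -2.0, -5.0, -50.0])             # stable toy system
B = np.ones((4, 1))
C = np.ones((1, 4))
Ar, Br, Cr, hsv = balanced_truncation(A, B, C, r=2)
print("Hankel singular values:", np.round(hsv, 4))
print("reduced A:\n", np.round(Ar, 3))
```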
+
+
+
+
+ + ♻ ☆ Training morphological neural networks with gradient descent: some + theoretical insights + + +
+ Morphological neural networks, or layers, can be a powerful tool to boost the +progress in mathematical morphology, either on theoretical aspects such as the +representation of complete lattice operators, or in the development of image +processing pipelines. However, these architectures turn out to be difficult to +train when they count more than a few morphological layers, at least within +popular machine learning frameworks which use gradient descent based +optimization algorithms. In this paper we investigate the potential and +limitations of differentiation based approaches and back-propagation applied to +morphological networks, in light of the non-smooth optimization concept of +Bouligand derivative. We provide insights and first theoretical guidelines, in +particular regarding initialization and learning rates. + +
+
+
+
+
+ + ♻ ☆ Revitalizing Multivariate Time Series Forecasting: Learnable + Decomposition with Inter-Series Dependencies and Intra-Series Variations + Modeling + + +
+ Predicting multivariate time series is crucial, demanding precise modeling of +intricate patterns, including inter-series dependencies and intra-series +variations. Distinctive trend characteristics in each time series pose +challenges, and existing methods, relying on basic moving average kernels, may +struggle with the non-linear structure and complex trends in real-world data. +Given this, we introduce a learnable decomposition strategy to capture dynamic +trend information more reasonably. Additionally, we propose a dual attention +module tailored to capture inter-series dependencies and intra-series +variations simultaneously for better time series forecasting, which is +implemented by channel-wise self-attention and autoregressive self-attention. +To evaluate the effectiveness of our method, we conducted experiments across +eight open-source datasets and compared it with the state-of-the-art methods. +The comparison shows that our Leddam (LEarnable Decomposition and Dual +Attention Module) not only demonstrates significant advances in predictive +performance, but its decomposition strategy can also be plugged into +other methods for a large performance boost, reducing MSE error by 11.87% to +48.56%. + 
+
+
+
+
+ + ♻ ☆ KAGNNs: Kolmogorov-Arnold Networks meet Graph Learning + + +
+ In recent years, Graph Neural Networks (GNNs) have become the de facto tool +for learning node and graph representations. Most GNNs typically consist of a +sequence of neighborhood aggregation (a.k.a., message passing) layers. Within +each of these layers, the representation of each node is updated from an +aggregation and transformation of its neighbours' representations at the +previous layer. The upper bound for the expressive power of message passing +GNNs was reached through the use of MLPs as a transformation, due to their +universal approximation capabilities. However, MLPs suffer from well-known +limitations, which recently motivated the introduction of Kolmogorov-Arnold +Networks (KANs). KANs rely on the Kolmogorov-Arnold representation theorem, +rendering them a promising alternative to MLPs. In this work, we compare the +performance of KANs against that of MLPs in graph learning tasks. We perform +extensive experiments on node classification, graph classification and graph +regression datasets. Our preliminary results indicate that while KANs are +on par with MLPs in classification tasks, they seem to have a clear advantage +in the graph regression tasks. Code is available at +https://github.com/RomanBresson/KAGNN. + 
+
+
+
+
+ + ♻ ☆ Evaluating Copyright Takedown Methods for Language Models + + +
+ Language models (LMs) derive their capabilities from extensive training on +diverse data, including potentially copyrighted material. These models can +memorize and generate content similar to their training data, posing potential +concerns. Therefore, model creators are motivated to develop mitigation methods +that prevent generating protected content. We term this procedure copyright +takedowns for LMs, noting the conceptual similarity to (but legal distinction +from) the DMCA takedown. This paper introduces the first evaluation of the +feasibility and side effects of copyright takedowns for LMs. We propose +CoTaEval, an evaluation framework to assess the effectiveness of copyright +takedown methods, the impact on the model's ability to retain uncopyrightable +factual knowledge from the training data whose recitation is embargoed, and how +well the model maintains its general utility and efficiency. We examine several +strategies, including adding system prompts, decoding-time filtering +interventions, and unlearning approaches. Our findings indicate that no tested +method excels across all metrics, showing significant room for research in this +unique problem setting and indicating potential unresolved challenges for live +policy proposals. + 
+
+ comment: 31 pages, 9 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank + Modifications + + +
+ Large language models (LLMs) show inherent brittleness in their safety +mechanisms, as evidenced by their susceptibility to jailbreaking and even +non-malicious fine-tuning. This study explores this brittleness of safety +alignment by leveraging pruning and low-rank modifications. We develop methods +to identify critical regions that are vital for safety guardrails, and that are +disentangled from utility-relevant regions at both the neuron and rank levels. +Surprisingly, the isolated regions we find are sparse, comprising about $3\%$ +at the parameter level and $2.5\%$ at the rank level. Removing these regions +compromises safety without significantly impacting utility, corroborating the +inherent brittleness of the model's safety mechanisms. Moreover, we show that +LLMs remain vulnerable to low-cost fine-tuning attacks even when modifications +to the safety-critical regions are restricted. These findings underscore the +urgent need for more robust safety strategies in LLMs. + +
+
+ comment: 22 pages, 9 figures. Project page is available at + https://boyiwei.com/alignment-attribution/ +
+
+
+
+
+ + ♻ ☆ σ-GPTs: A New Approach to Autoregressive Models ECML + + +
+ Autoregressive models, such as the GPT family, use a fixed order, usually +left-to-right, to generate sequences. However, this is not a necessity. In this +paper, we challenge this assumption and show that by simply adding a positional +encoding for the output, this order can be modulated on the fly, per sample, +which offers key advantageous properties. It allows for the sampling of and +conditioning on arbitrary subsets of tokens, and it also allows dynamically +sampling multiple tokens in one shot according to a rejection strategy, leading to +a sub-linear number of model evaluations. We evaluate our method across various +domains, including language modeling, path-solving, and aircraft vertical rate +prediction, decreasing the number of steps required for generation by an order +of magnitude. + 
+
+ comment: 23 pages, 7 figures, accepted at ECML/PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Classification under Nuisance Parameters and Generalized Label Shift in + Likelihood-Free Inference + + +
+ An open scientific challenge is how to classify events with reliable measures +of uncertainty, when we have a mechanistic model of the data-generating process +but the distribution over both labels and latent nuisance parameters is +different between train and target data. We refer to this type of +distributional shift as generalized label shift (GLS). Direct classification +using observed data $\mathbf{X}$ as covariates leads to biased predictions and +invalid uncertainty estimates of labels $Y$. We overcome these biases by +proposing a new method for robust uncertainty quantification that casts +classification as a hypothesis testing problem under nuisance parameters. The +key idea is to estimate the classifier's receiver operating characteristic +(ROC) across the entire nuisance parameter space, which allows us to devise +cutoffs that are invariant under GLS. Our method effectively endows a +pre-trained classifier with domain adaptation capabilities and returns valid +prediction sets while maintaining high power. We demonstrate its performance on +two challenging scientific problems in biology and astroparticle physics with +data from realistic mechanistic models. + +
+
+ comment: 26 pages, 19 figures, code available at + https://github.com/lee-group-cmu/lf2i +
+
+
+
+
+ + ♻ ☆ RouteLLM: Learning to Route LLMs with Preference Data + + +
+ Large language models (LLMs) exhibit impressive capabilities across a wide +range of tasks, yet the choice of which model to use often involves a trade-off +between performance and cost. More powerful models, though effective, come with +higher expenses, while less capable models are more cost-effective. To address +this dilemma, we propose several efficient router models that dynamically +select between a stronger and a weaker LLM during inference, aiming to optimize +the balance between cost and response quality. We develop a training framework +for these routers leveraging human preference data and data augmentation +techniques to enhance performance. Our evaluation on widely recognized +benchmarks shows that our approach significantly reduces costs, by over 2 times +in certain cases, without compromising the quality of responses. Interestingly, +our router models also demonstrate significant transfer learning capabilities, +maintaining their performance even when the strong and weak models are changed +at test time. This highlights the potential of these routers to provide a +cost-effective yet high-performance solution for deploying LLMs. + 
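A toy sketch of the routing idea follows: a lightweight classifier trained on pairwise preference labels predicts the probability that the weak model's answer would be acceptable, and only queries below a threshold are sent to the strong (expensive) model. The TF-IDF features, the logistic-regression router, the tiny training set, and the model names are placeholders, not RouteLLM's actual router or data.

```python
# Route queries to a weak or strong LLM based on a predicted win probability.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# (query, 1 if the weak model's answer was preferred in a pairwise comparison)
train_queries = ["what is 2+2", "say hello", "capital of France",
                 "prove the Riemann hypothesis",
                 "derive the Navier-Stokes equations",
                 "write a formal verification of a compiler"]
weak_preferred = [1, 1, 1, 0, 0, 0]

router = make_pipeline(TfidfVectorizer(), LogisticRegression())
router.fit(train_queries, weak_preferred)

def route(query, threshold=0.5):
    p_weak_ok = router.predict_proba([query])[0][1]
    return "weak-llm" if p_weak_ok >= threshold else "strong-llm"

print(route("say hi in French"))
print(route("prove a novel theorem in algebraic topology"))
```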
+
+
+
+
+ + ♻ ☆ Mutual Information Assisted Ensemble Recommender System for Identifying + Critical Risk Factors in Healthcare Prognosis + + +
+ Purpose: Health recommenders act as important decision support systems, +aiding patients and medical professionals in taking actions that lead to +patients' well-being. These systems extract the information which may be of +particular relevance to the end-user, helping them in making appropriate +decisions. The present study proposes a feature recommender, as a part of a +disease management system, that identifies and recommends the most important +risk factors for an illness. + Methods: A novel mutual information and ensemble-based feature ranking +approach for identifying critical risk factors in healthcare prognosis is +proposed. + Results: To establish the effectiveness of the proposed method, experiments +have been conducted on four benchmark datasets of diverse diseases (clear cell +renal cell carcinoma (ccRCC), chronic kidney disease, Indian liver patient, and +cervical cancer risk factors). The performance of the proposed recommender is +compared with four state-of-the-art methods using recommender systems' +performance metrics like average precision@K, precision@K, recall@K, F1@K, +reciprocal rank@K. The method is able to recommend all relevant critical risk +factors for ccRCC. It also attains a higher accuracy (96.6% and 98.6% using +support vector machine and neural network, respectively) for ccRCC staging with +a reduced feature set as compared to existing methods. Moreover, the top two +features recommended using the proposed method with ccRCC, viz. size of tumor +and metastasis status, are medically validated from the existing TNM system. +Results are also found to be superior for the other three datasets. + Conclusion: The proposed recommender can identify and recommend risk factors +that have the most discriminating power for detecting diseases. + +
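The sketch below illustrates the general recipe of combining mutual information with an ensemble of rankers to recommend the top-K risk factors; averaging ranks across a mutual-information ranker and a random-forest ranker is a simple stand-in for the paper's ensemble scheme, and the breast-cancer dataset is used only because it ships with scikit-learn.

```python
# Mutual-information-assisted ensemble feature ranking for top-K risk factors.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

data = load_breast_cancer()
X, y, names = data.data, data.target, data.feature_names

mi_scores = mutual_info_classif(X, y, random_state=0)
rf_scores = RandomForestClassifier(n_estimators=200,
                                   random_state=0).fit(X, y).feature_importances_

# Convert each score vector into ranks (0 = most important), then average.
ranks = np.vstack([np.argsort(np.argsort(-mi_scores)),
                   np.argsort(np.argsort(-rf_scores))]).mean(axis=0)
top_k = np.argsort(ranks)[:5]
print("Recommended risk factors:", [names[i] for i in top_k])
```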
+
+
+
+
+ + ♻ ☆ Efficient and Flexible Method for Reducing Moderate-size Deep Neural + Networks with Condensation + + +
+ Neural networks have been extensively applied to a variety of tasks, +achieving astounding results. Applying neural networks in the scientific field +is an important research direction that is gaining increasing attention. In +scientific applications, the scale of neural networks is generally +moderate-size, mainly to ensure the speed of inference during application. +Additionally, comparing neural networks to traditional algorithms in scientific +applications is inevitable. These applications often require rapid +computations, making the reduction of neural network sizes increasingly +important. Existing work has found that the powerful capabilities of neural +networks are primarily due to their non-linearity. Theoretical work has +discovered that under strong non-linearity, neurons in the same layer tend to +behave similarly, a phenomenon known as condensation. Condensation offers an +opportunity to reduce the scale of neural networks to a smaller subnetwork with +similar performance. In this article, we propose a condensation reduction +algorithm to verify the feasibility of this idea in practical problems. Our +reduction method can currently be applied to both fully connected networks and +convolutional networks, achieving positive results. In complex combustion +acceleration tasks, we reduced the size of the neural network to 41.7% of its +original scale while maintaining prediction accuracy. In the CIFAR10 image +classification task, we reduced the network size to 11.5% of the original +scale, still maintaining a satisfactory validation accuracy. Our method can be +applied to most trained neural networks, reducing computational pressure and +improving inference speed. + +
+
+
+
+
+ + ♻ ☆ CoMadOut -- A Robust Outlier Detection Algorithm based on CoMAD + + +
+ Unsupervised learning methods are well established in the area of anomaly +detection and achieve state-of-the-art performance on outlier datasets. +Outliers play a significant role, since they bear the potential to distort the +predictions of a machine learning algorithm on a given dataset. Especially +among PCA-based methods, outliers have an additional destructive potential +regarding the result: they not only distort the orientation and translation +of the principal components, but also make it more complicated to detect +outliers. To address this problem, we propose the robust outlier detection +algorithm CoMadOut, which satisfies two required properties: (1) being robust +towards outliers and (2) detecting them. Our CoMadOut outlier detection +variants using comedian PCA define, depending on the variant, an inlier region +with a robust noise margin by measures of in-distribution (variant CMO) and +optimized scores by measures of out-of-distribution (variants CMO*), e.g. +kurtosis-weighting by CMO+k. These measures allow distribution-based outlier +scoring for each principal component, and thus an appropriate alignment of the +degree of outlierness between normal and abnormal instances. Experiments +comparing CoMadOut with traditional, deep and other comparable robust outlier +detection methods showed that the performance of the introduced CoMadOut +approach is competitive with well-established methods in terms of average +precision (AP), area under the precision recall curve (AUPRC) and area under +the receiver operating characteristic (AUROC) curve. In summary, our approach +can be seen as a robust alternative for outlier detection tasks. + 
+
+ comment: published in Springer Machine Learning Journal (MLJ) +
+
+
+
+
+ + ♻ ☆ ASCENT: Amplifying Power Side-Channel Resilience via Learning & + Monte-Carlo Tree Search + + +
+ Power side-channel (PSC) analysis is pivotal for securing cryptographic +hardware. Prior art focused on securing gate-level netlists obtained as-is from +chip design automation, neglecting all the complexities and potential +side-effects for security arising from the design automation process. That is, +automation traditionally prioritizes power, performance, and area (PPA), +sidelining security. We propose a "security-first" approach, refining the logic +synthesis stage to enhance the overall resilience of PSC countermeasures. We +introduce ASCENT, a learning-and-search-based framework that (i) drastically +reduces the time for post-design PSC evaluation and (ii) explores the +security-vs-PPA design space. Thus, ASCENT enables an efficient exploration of +a large number of candidate netlists, leading to an improvement in PSC +resilience compared to regular PPA-optimized netlists. ASCENT is up to 120x +faster than traditional PSC analysis and yields a 3.11x improvement in the PSC +resilience of state-of-the-art PSC countermeasures. + 
+
+ comment: Accepted at 2024 ACM/IEEE International Conference on Computer-Aided + Design +
+
+
+
+
+ + ♻ ☆ FunBO: Discovering Acquisition Functions for Bayesian Optimization with + FunSearch + + +
+ The sample efficiency of Bayesian optimization algorithms depends on +carefully crafted acquisition functions (AFs) guiding the sequential collection +of function evaluations. The best-performing AF can vary significantly across +optimization problems, often requiring ad-hoc and problem-specific choices. +This work tackles the challenge of designing novel AFs that perform well across +a variety of experimental settings. Based on FunSearch, a recent work using +Large Language Models (LLMs) for discovery in mathematical sciences, we propose +FunBO, an LLM-based method that can be used to learn new AFs written in +computer code by leveraging access to a limited number of evaluations for a set +of objective functions. We provide the analytic expression of all discovered +AFs and evaluate them on various global optimization benchmarks and +hyperparameter optimization tasks. We show how FunBO identifies AFs that +generalize well in and out of the training distribution of functions, thus +outperforming established general-purpose AFs and achieving competitive +performance against AFs that are customized to specific function types and are +learned via transfer-learning algorithms. + +
+
+
+
+
+ + ♻ ☆ Capacity Provisioning Motivated Online Non-Convex Optimization Problem + with Memory and Switching Cost + + +
+ An online non-convex optimization problem is considered where the goal is to +minimize the flow time (total delay) of a set of jobs by modulating the number +of active servers, but with a switching cost associated with changing the +number of active servers over time. Each job can be processed by at most one +fixed speed server at any time. Compared to the usual online convex +optimization (OCO) problem with switching cost, the objective function +considered is non-convex and more importantly, at each time, it depends on all +past decisions and not just the present one. Both worst-case and stochastic +inputs are considered; for both cases, competitive algorithms are derived. + +
+
+
+
+
+ + ♻ ☆ FAITH: Frequency-domain Attention In Two Horizons for Time Series + Forecasting + + +
+ Time Series Forecasting plays a crucial role in various fields such as +industrial equipment maintenance, meteorology, energy consumption, traffic flow +and financial investment. However, despite their considerable advantages over +traditional statistical approaches, current deep learning-based predictive +models often exhibit a significant deviation between their forecasting outcomes +and the ground truth. This discrepancy is largely due to an insufficient +emphasis on extracting the sequence's latent information, particularly its +global information within the frequency domain and the relationship between +different variables. To address this issue, we propose a novel model, +Frequency-domain Attention In Two Horizons (FAITH), which decomposes time series into +trend and seasonal components using a multi-scale sequence adaptive +decomposition and fusion architecture, and processes them separately. FAITH +utilizes a Frequency Channel feature Extraction Module and a Frequency Temporal +feature Extraction Module to capture inter-channel relationships and temporal +global information in the sequence, significantly improving its ability to +handle long-term dependencies and complex patterns. Furthermore, FAITH achieves +theoretically linear complexity by modifying the time-frequency domain +transformation method, effectively reducing computational costs. Extensive +experiments on 6 benchmarks for long-term forecasting and 3 benchmarks for +short-term forecasting demonstrate that FAITH outperforms existing models in +many fields, such as electricity, weather and traffic, proving its +effectiveness and superiority both in long-term and short-term time series +forecasting tasks. Our codes and data are available at +https://github.com/LRQ577/FAITH. + 
+
+ comment: We think there are some errors in the experimental results that may + lead to a wrong conclusion, so we consider it responsible to withdraw the + paper. +
+
+
+
+
+ + ♻ ☆ Backdoor for Debias: Mitigating Model Bias with Backdoor Attack-based + Artificial Bias + + +
+ With the swift advancement of deep learning, state-of-the-art algorithms have +been utilized in various social situations. Nonetheless, some algorithms have +been discovered to exhibit biases and provide unequal results. The current +debiasing methods face challenges such as poor utilization of data or intricate +training requirements. In this work, we found that the backdoor attack can +construct an artificial bias similar to the model bias derived in standard +training. Considering the strong adjustability of backdoor triggers, we are +motivated to mitigate the model bias by carefully designing reverse artificial +bias created from backdoor attack. Based on this, we propose a backdoor +debiasing framework based on knowledge distillation, which effectively reduces +the model bias from original data and minimizes security risks from the +backdoor attack. The proposed solution is validated on both image and +structured datasets, showing promising results. This work advances the +understanding of backdoor attacks and highlights its potential for beneficial +applications. The code for the study can be found at +\url{https://anonymous.4open.science/r/DwB-BC07/}. + +
+
+
+
+
+ + ♻ ☆ Multi-State TD Target for Model-Free Reinforcement Learning + + +
+ Temporal difference (TD) learning is a fundamental technique in reinforcement +learning that updates value estimates for states or state-action pairs using a +TD target. This target represents an improved estimate of the true value by +incorporating both immediate rewards and the estimated value of subsequent +states. Traditionally, TD learning relies on the value of a single subsequent +state. We propose an enhanced multi-state TD (MSTD) target that utilizes the +estimated values of multiple subsequent states. Building on this new MSTD +concept, we develop complete actor-critic algorithms that include management of +replay buffers in two modes, and integrate with deep deterministic policy +gradient (DDPG) and soft actor-critic (SAC). Experimental results +demonstrate that algorithms employing the MSTD target significantly improve +learning performance compared to traditional methods. The code is provided on +GitHub. + 
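The contrast with the classic one-step target can be sketched as follows; the averaging of k-step bootstrapped targets over the next K states is an illustrative assumption about how multiple subsequent states could be combined, and the MSTD paper's exact aggregation and its actor-critic integration are not reproduced.

```python
# Classic one-step TD target vs. a simple multi-state variant that averages
# bootstrapped targets over the next K states.
import numpy as np

def td_target(rewards, values_next, gamma=0.99):
    """Classic target: r_t + gamma * V(s_{t+1})."""
    return rewards[0] + gamma * values_next[0]

def multi_state_td_target(rewards, values_next, gamma=0.99, K=3):
    """Average the k-step bootstrapped targets for k = 1..K (assumption)."""
    targets = []
    for k in range(1, K + 1):
        discounted = sum(gamma ** i * rewards[i] for i in range(k))
        targets.append(discounted + gamma ** k * values_next[k - 1])
    return float(np.mean(targets))

rewards = [1.0, 0.5, 0.0, 2.0]           # r_t, r_{t+1}, ...
values_next = [3.0, 2.5, 4.0, 1.0]       # V(s_{t+1}), V(s_{t+2}), ...
print(td_target(rewards, values_next))
print(multi_state_td_target(rewards, values_next))
```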
+
+ comment: 8 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Fast Unsupervised Deep Outlier Model Selection with Hypernetworks + + +
+ Outlier detection (OD) finds many applications with a rich literature of +numerous techniques. Deep neural network based OD (DOD) has seen a recent surge +of attention thanks to the many advances in deep learning. In this paper, we +consider a critical-yet-understudied challenge with unsupervised DOD, that is, +effective hyperparameter (HP) tuning/model selection. While several prior works +report the sensitivity of OD models to HPs, this becomes ever more critical for +modern DOD models that exhibit a long list of HPs. We introduce HYPER for +tuning DOD models, tackling two fundamental challenges: (1) validation without +supervision (due to lack of labeled anomalies), and (2) efficient search of the +HP/model space (due to exponential growth in the number of HPs). A key idea is +to design and train a novel hypernetwork (HN) that maps HPs onto optimal +weights of the main DOD model. In turn, HYPER capitalizes on a single HN that +can dynamically generate weights for many DOD models (corresponding to varying +HPs), which offers significant speed-up. In addition, it employs meta-learning +on historical OD tasks with labels to train a proxy validation function, +likewise trained with our proposed HN efficiently. Extensive experiments on 35 +OD tasks show that HYPER achieves high performance against 8 baselines with +significant efficiency gains. + 
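The hyperparameter-to-weights mapping at the heart of this idea can be sketched with a tiny hypernetwork that emits the weights of a small linear autoencoder whose reconstruction error serves as the outlier score. The dimensions, the HP encoding, and the detector architecture are assumptions for illustration; HYPER's actual hypernetwork, meta-learned validation function, and search loop are not shown.

```python
# A hypernetwork maps a hyperparameter vector to detector weights.
import torch
import torch.nn as nn
import torch.nn.functional as F

d_in, d_hid, d_hp = 16, 4, 2      # feature dim, bottleneck, number of HPs

class HyperNet(nn.Module):
    def __init__(self):
        super().__init__()
        n_weights = d_in * d_hid * 2 + d_hid + d_in   # enc/dec weights + biases
        self.net = nn.Sequential(nn.Linear(d_hp, 64), nn.ReLU(),
                                 nn.Linear(64, n_weights))

    def forward(self, hp, x):
        w = self.net(hp)
        i = 0
        W_enc = w[i:i + d_in * d_hid].view(d_hid, d_in); i += d_in * d_hid
        b_enc = w[i:i + d_hid]; i += d_hid
        W_dec = w[i:i + d_in * d_hid].view(d_in, d_hid); i += d_in * d_hid
        b_dec = w[i:i + d_in]
        z = F.relu(F.linear(x, W_enc, b_enc))         # generated encoder
        return F.linear(z, W_dec, b_dec)              # generated decoder

hn = HyperNet()
x = torch.randn(32, d_in)
hp = torch.tensor([0.1, 3.0])                         # a candidate HP setting
recon = hn(hp, x)
outlier_scores = ((recon - x) ** 2).mean(dim=1)       # one score per sample
print(outlier_scores.shape)
```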
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ UniST: A Prompt-Empowered Universal Model for Urban Spatio-Temporal + Prediction KDD + + +
+ Urban spatio-temporal prediction is crucial for informed decision-making, +such as traffic management, resource optimization, and emergency response. +Despite remarkable breakthroughs in pretrained natural language models that +enable one model to handle diverse tasks, a universal solution for +spatio-temporal prediction remains challenging. Existing prediction approaches +are typically tailored for specific spatio-temporal scenarios, requiring +task-specific model designs and extensive domain-specific training data. In +this study, we introduce UniST, a universal model designed for general urban +spatio-temporal prediction across a wide range of scenarios. Inspired by large +language models, UniST achieves success through: (i) utilizing diverse +spatio-temporal data from different scenarios, (ii) effective pre-training to +capture complex spatio-temporal dynamics, and (iii) knowledge-guided prompts to +enhance generalization capabilities. These designs together unlock the +potential of building a universal model for various scenarios. Extensive +experiments on more than 20 spatio-temporal scenarios demonstrate UniST's +efficacy in advancing state-of-the-art performance, especially in few-shot and +zero-shot prediction. The datasets and code implementation are released on +https://github.com/tsinghua-fib-lab/UniST. + 
+
+ comment: 2024 ACM SIGKDD International Conference on Knowledge Discovery and + Data Mining, KDD 2024 +
+
+
+
+
+ + ♻ ☆ Cost Aware Best Arm Identification + + +
+ In this paper, we study a best arm identification problem with dual objectives. +In addition to the classic reward, each arm is associated with a cost +distribution, and the goal is to identify the arm with the largest reward using the +minimum expected cost. We call it \emph{Cost Aware Best Arm Identification} +(CABAI), which captures the separation of testing and implementation phases in +product development pipelines and models the objective shift between phases, +i.e., cost for testing and reward for implementation. We first derive a +theoretical lower bound for CABAI and propose an algorithm called +$\mathsf{CTAS}$ to match it asymptotically. To reduce the computation of +$\mathsf{CTAS}$, we further propose a simple algorithm called \emph{Chernoff +Overlap} (CO), based on a square-root rule, which we prove is optimal in +simplified two-armed models and generalizes well in numerical experiments. Our +results show that (i) ignoring the heterogeneous action cost results in +sub-optimality in practice, and (ii) simple algorithms can deliver near-optimal +performance over a wide range of problems. + 
+
+
+
+
+ + ♻ ☆ From molecules to scaffolds to functional groups: building + context-dependent molecular representation via multi-channel learning + + +
+ Reliable molecular property prediction is essential for various scientific +endeavors and industrial applications, such as drug discovery. However, the +data scarcity, combined with the highly non-linear causal relationships between +physicochemical and biological properties and conventional molecular +featurization schemes, complicates the development of robust molecular machine +learning models. Self-supervised learning (SSL) has emerged as a popular +solution, utilizing large-scale, unannotated molecular data to learn a +foundational representation of chemical space that might be advantageous for +downstream tasks. Yet, existing molecular SSL methods largely overlook chemical +knowledge, including molecular structure similarity, scaffold composition, and +the context-dependent aspects of molecular properties when operating over the +chemical space. They also struggle to learn the subtle variations in +structure-activity relationship. This paper introduces a novel pre-training +framework that learns robust and generalizable chemical knowledge. It leverages +the structural hierarchy within the molecule, embeds them through distinct +pre-training tasks across channels, and aggregates channel information in a +task-specific manner during fine-tuning. Our approach demonstrates competitive +performance across various molecular property benchmarks and offers strong +advantages in particularly challenging yet ubiquitous scenarios like activity +cliffs. + +
+
+
+
+
+ + ♻ ☆ SketchQL Demonstration: Zero-shot Video Moment Querying with Sketches + + +
+ In this paper, we will present SketchQL, a video database management system +(VDBMS) for retrieving video moments with a sketch-based query interface. This +novel interface allows users to specify object trajectory events with simple +mouse drag-and-drop operations. Users can use trajectories of single objects as +building blocks to compose complex events. Using a pre-trained model that +encodes trajectory similarity, SketchQL achieves zero-shot video moments +retrieval by performing similarity searches over the video to identify clips +that are the most similar to the visual query. In this demonstration, we +introduce the graphic user interface of SketchQL and detail its functionalities +and interaction mechanisms. We also demonstrate the end-to-end usage of +SketchQL from query composition to video moments retrieval using real-world +scenarios. + +
+
+
+
+
+ + ♻ ☆ A Survey on Deep Clustering: From the Prior Perspective + + +
+ Facilitated by the powerful feature extraction ability of neural networks, +deep clustering has achieved great success in analyzing high-dimensional and +complex real-world data. The performance of deep clustering methods is affected +by various factors such as network structures and learning objectives. However, +as pointed out in this survey, the essence of deep clustering lies in the +incorporation and utilization of prior knowledge, which is largely ignored by +existing works. From pioneering deep clustering methods based on data structure +assumptions to recent contrastive clustering methods based on data augmentation +invariances, the development of deep clustering intrinsically corresponds to +the evolution of prior knowledge. In this survey, we provide a comprehensive +review of deep clustering methods by categorizing them into six types of prior +knowledge. We find that in general the prior innovation follows two trends, +namely, i) from mining to constructing, and ii) from internal to external. +Besides, we provide a benchmark on five widely-used datasets and analyze the +performance of methods with diverse priors. By providing a novel prior +knowledge perspective, we hope this survey could provide some novel insights +and inspire future research in the deep clustering community. + +
+
+
+
+
+ + ♻ ☆ Towards Graph Foundation Models: A Survey and Beyond + + +
+ Foundation models have emerged as critical components in a variety of +artificial intelligence applications, and showcase significant success in +natural language processing and several other domains. Meanwhile, the field of +graph machine learning is witnessing a paradigm transition from shallow methods +to more sophisticated deep learning approaches. The capabilities of foundation +models to generalize and adapt motivate graph machine learning researchers to +discuss the potential of developing a new graph learning paradigm. This +paradigm envisions models that are pre-trained on extensive graph data and can +be adapted for various graph tasks. Despite this burgeoning interest, there is +a noticeable lack of clear definitions and systematic analyses pertaining to +this new domain. To this end, this article introduces the concept of Graph +Foundation Models (GFMs), and offers an exhaustive explanation of their key +characteristics and underlying technologies. We proceed to classify the +existing work related to GFMs into three distinct categories, based on their +dependence on graph neural networks and large language models. In addition to +providing a thorough review of the current state of GFMs, this article also +outlooks potential avenues for future research in this rapidly evolving domain. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ SIDQL: An Efficient Keyframe Extraction and Motion Reconstruction + Framework in Motion Capture + + +
+ Metaverse, which integrates the virtual and physical worlds, has emerged as +an innovative paradigm for changing people's lifestyles. Motion capture has +become a reliable approach to achieve seamless synchronization of the movements +between avatars and human beings, which plays an important role in diverse +Metaverse applications. However, due to the continuous growth of data, current +communication systems face a significant challenge of meeting the demand of +ultra-low latency during application. In addition, current methods also have +shortcomings when selecting keyframes, e.g., relying on recognizing motion +types and artificially selected keyframes. Therefore, the utilization of +keyframe extraction and motion reconstruction techniques could be considered a +feasible and promising solution. In this work, a new motion reconstruction +algorithm is designed in a spherical coordinate system involving location and +velocity information. Then, we formalize the keyframe extraction problem into +an optimization problem to reduce the reconstruction error. Using Deep +Q-Learning (DQL), the Spherical Interpolation based Deep Q-Learning (SIDQL) +framework is proposed to generate proper keyframes for reconstructing the +motion sequences. We use the CMU database to train and evaluate the framework. +Our scheme can significantly reduce the data volume and transmission latency +compared to various baselines while maintaining a reconstruction error of less +than 0.09 when extracting five keyframes. + +
+
+
+
+
+ + ♻ ☆ Proceedings of The second international workshop on eXplainable AI for + the Arts (XAIxArts) + + +
+ This second international workshop on explainable AI for the Arts (XAIxArts) +brought together a community of researchers in HCI, Interaction Design, AI, +explainable AI (XAI), and digital arts to explore the role of XAI for the Arts. +Workshop held at the 16th ACM Conference on Creativity and Cognition (C&C +2024), Chicago, USA. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 23 + +
+
+
+ + ♻ ☆ Cost-Efficient Large Language Model Serving for Multi-turn Conversations + with CachedAttention ATC + + +
+ Interacting with humans through multi-turn conversations is a fundamental +feature of large language models (LLMs). However, existing LLM serving engines +executing multi-turn conversations are inefficient due to the need to +repeatedly compute the key-value (KV) caches of historical tokens, incurring +high serving costs. To address the problem, this paper proposes +CachedAttention, a new attention mechanism that enables reuse of KV caches +across multi-turn conversations, significantly reducing the repetitive +computation overheads. CachedAttention maintains a hierarchical KV caching +system that leverages cost-effective memory/storage mediums to save KV caches +for all requests. To reduce KV cache access overheads from slow mediums, +CachedAttention employs layer-wise pre-loading and asynchronous saving schemes +to overlap the KV cache access with the GPU computation. To ensure that the KV +caches to be accessed are placed in the fastest hierarchy, CachedAttention +employs scheduler-aware fetching and eviction schemes to consciously place the +KV caches in different layers based on the hints from the inference job +scheduler. To avoid the invalidation of the saved KV caches incurred by context +window overflow, CachedAttention enables the saved KV caches to remain valid +via decoupling the positional encoding and effectively truncating the KV +caches. Extensive experimental results demonstrate that CachedAttention +significantly decreases the time to the first token (TTFT) by up to 87%, +improves the prompt prefilling throughput by up to 7.8$\times$ for multi-turn +conversations, and reduces the end-to-end inference cost by up to 70%. + +
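At a conceptual level, the reuse idea can be sketched as a per-conversation store that keeps each layer's keys/values and only computes KV for the new turn's tokens instead of re-prefilling the whole history. The dictionary storage, random stand-in projections, and dimensions below are assumptions for illustration; CachedAttention's hierarchical storage, layer-wise pre-loading, scheduler-aware placement, and eviction policies are not modeled.

```python
# Per-conversation KV cache reuse across turns (conceptual sketch).
import numpy as np

HEAD_DIM, N_LAYERS = 64, 4
kv_store = {}   # conversation_id -> list of (K, V) arrays, one pair per layer

def fake_kv_for(tokens):
    """Stand-in for running the model's key/value projections on new tokens."""
    n = len(tokens)
    return [(np.random.randn(n, HEAD_DIM), np.random.randn(n, HEAD_DIM))
            for _ in range(N_LAYERS)]

def append_turn(conv_id, new_tokens):
    new_kv = fake_kv_for(new_tokens)          # compute KV for new tokens only
    if conv_id not in kv_store:
        kv_store[conv_id] = new_kv
    else:
        kv_store[conv_id] = [
            (np.concatenate([K, nK]), np.concatenate([V, nV]))
            for (K, V), (nK, nV) in zip(kv_store[conv_id], new_kv)]
    return kv_store[conv_id]

append_turn("chat-1", ["Hello", ",", "world"])
cache = append_turn("chat-1", ["How", "are", "you", "?"])
print("cached tokens per layer:", cache[0][0].shape[0])   # 7, history not recomputed
```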
+
+ comment: Accepted to USENIX Annual Technical Conference (ATC) 2024 +
+
+
+
+
+ + ♻ ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and +70B) demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables +aggressive quantization to sub-3 bits with only minor performance degradations. +When finetuned on a language modeling calibration dataset, LQ-LoRA can also be +used for model compression; in this setting our 2.75-bit LLaMA-2-70B model +(which has 2.85 bits on average when including the low-rank components and +requires 27GB of GPU memory) performs respectably compared to the 16-bit +baseline. + +
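+ A minimal numpy sketch of this kind of iterative decomposition is shown
+below: alternately quantize the residual left by the low-rank factors and
+refit the factors to the residual left by the quantized part. The uniform
+symmetric quantizer, fixed rank, and fixed bit-width are simplifying
+assumptions; the paper configures these per matrix via integer linear
+programming.
+
+    import numpy as np
+
+    def quantize(w, bits=4):
+        """Uniform symmetric quantization of a matrix to the given bit-width."""
+        scale = np.abs(w).max() / (2 ** (bits - 1) - 1)
+        return np.round(w / scale) * scale
+
+    def lq_decompose(w, rank=8, bits=4, iters=10):
+        """Decompose w ~= q + l1 @ l2 with q quantized and l1 @ l2 low-rank."""
+        l1 = np.zeros((w.shape[0], rank))
+        l2 = np.zeros((rank, w.shape[1]))
+        for _ in range(iters):
+            q = quantize(w - l1 @ l2, bits)      # quantize what the low-rank part misses
+            u, s, vt = np.linalg.svd(w - q, full_matrices=False)
+            l1 = u[:, :rank] * s[:rank]          # rank-r refit of the remaining residual
+            l2 = vt[:rank]
+        return q, l1, l2
+
+    w = np.random.randn(64, 64)
+    q, l1, l2 = lq_decompose(w)
+    print("relative error:", np.linalg.norm(w - q - l1 @ l2) / np.linalg.norm(w))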
+
+
+
+
+ + ♻ ☆ X-ray Made Simple: Radiology Report Generation and Evaluation with + Layman's Terms + + +
+ Radiology Report Generation (RRG) has achieved significant progress with the
+advancements of multimodal generative models. However, evaluation in this
+domain suffers from a lack of fair and robust metrics. We reveal that high
+performance on RRG under existing lexical-based metrics (e.g., BLEU) may be
+more of a mirage: a model can obtain a high BLEU score merely by learning the
+template of reports. This has become an urgent problem for RRG due to the
+highly patternized nature of these reports. In this work, we
+counter-intuitively approach this problem by proposing the Layman's RRG
+framework, a layman's-terms-based dataset, evaluation, and training framework
+that systematically improves RRG with day-to-day language. We first contribute
+the translated Layman's terms dataset. Building upon the dataset, we then
+propose a semantics-based evaluation method, which is shown to mitigate the
+inflated BLEU numbers and to provide a fairer evaluation. Last, we show that
+training on the layman's terms dataset encourages models to focus on the
+semantics of the reports rather than overfitting to report templates. We also
+observe a promising scaling law between the number of training examples and
+the semantic gain provided by our dataset, in contrast to the inverse trend
+seen with the original report formats. Our code is available at
+\url{https://github.com/hegehongcha/LaymanRRG}.
+
+
+
+
+ + ♻ ☆ SimsChat: A Customisable Persona-Driven Role-Playing Agent + + +
+ Large Language Models (LLMs) possess the remarkable capability to understand
+human instructions and generate high-quality text, enabling them to act as
+agents that simulate human behaviours. This capability allows LLMs to emulate
+human beings in a more advanced manner, beyond merely replicating simple human
+behaviours. However, there has been little exploration of leveraging LLMs to
+craft characters from multiple aspects. In this work, we introduce the
+Customisable Conversation Agent Framework, which employs LLMs to simulate
+real-world characters that can be freely customised according to different
+user preferences. The framework is helpful for designing customisable
+characters and role-playing agents according to users' preferences. We first
+propose the SimsConv dataset, which comprises 68 different customised
+characters and 1,360 multi-turn role-playing dialogues, encompassing 13,971
+interaction dialogues in total. The characters are created from several
+real-world elements, such as career, aspiration, trait, and skill. Building on
+these foundations, we present SimsChat, a freely customisable role-playing
+agent. It incorporates different real-world scenes and topic-specific
+character interaction dialogues, simulating characters' life experiences in
+various scenarios and topic-specific interactions with specific emotions.
+Experimental results show that our proposed framework achieves desirable
+performance and provides helpful guidelines for building better simulacra of
+human beings in the future. Our data and code are available at
+https://github.com/Bernard-Yang/SimsChat.
+
+
+
+
+ + ♻ ☆ PANGeA: Procedural Artificial Narrative using Generative AI for + Turn-Based Video Games + + +
+ This research introduces Procedural Artificial Narrative using Generative AI +(PANGeA), a structured approach for leveraging large language models (LLMs), +guided by a game designer's high-level criteria, to generate narrative content +for turn-based role-playing video games (RPGs). Distinct from prior +applications of LLMs used for video game design, PANGeA innovates by not only +generating game level data (which includes, but is not limited to, setting, key +items, and non-playable characters (NPCs)), but by also fostering dynamic, +free-form interactions between the player and the environment that align with +the procedural game narrative. The NPCs generated by PANGeA are +personality-biased and express traits from the Big 5 Personality Model in their +generated responses. PANGeA addresses challenges behind ingesting free-form +text input, which can prompt LLM responses beyond the scope of the game +narrative. A novel validation system that uses the LLM's intelligence evaluates +text input and aligns generated responses with the unfolding narrative. Making +these interactions possible, PANGeA is supported by a server that hosts a +custom memory system that supplies context for augmenting generated responses +thus aligning them with the procedural narrative. For its broad application, +the server has a REST interface enabling any game engine to integrate directly +with PANGeA, as well as an LLM interface adaptable with local or private LLMs. +PANGeA's ability to foster dynamic narrative generation by aligning responses +with the procedural narrative is demonstrated through an empirical study and +ablation test of two versions of a demo game. These are, a custom, +browser-based GPT and a Unity demo. As the results show, PANGeA holds potential +to assist game designers in using LLMs to generate narrative-consistent content +even when provided varied and unpredictable, free-form text input. + +
+
+
+
+
+ + ♻ ☆ SEMQA: Semi-Extractive Multi-Source Question Answering NAACL 2024 + + +
+ Recently proposed long-form question answering (QA) systems, supported by +large language models (LLMs), have shown promising capabilities. Yet, +attributing and verifying their generated abstractive answers can be difficult, +and automatically evaluating their accuracy remains an ongoing challenge. + In this work, we introduce a new QA task for answering multi-answer questions +by summarizing multiple diverse sources in a semi-extractive fashion. +Specifically, Semi-extractive Multi-source QA (SEMQA) requires models to output +a comprehensive answer, while mixing factual quoted spans -- copied verbatim +from given input sources -- and non-factual free-text connectors that glue +these spans together into a single cohesive passage. This setting bridges the +gap between the outputs of well-grounded but constrained extractive QA systems +and more fluent but harder to attribute fully abstractive answers. +Particularly, it enables a new mode for language models that leverages their +advanced language generation capabilities, while also producing fine in-line +attributions by-design that are easy to verify, interpret, and evaluate. + To study this task, we create the first dataset of this kind, QuoteSum, with +human-written semi-extractive answers to natural and generated questions, and +define text-based evaluation metrics. Experimenting with several LLMs in +various settings, we find this task to be surprisingly challenging, +demonstrating the importance of QuoteSum for developing and studying such +consolidation capabilities. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ AIM: Let Any Multi-modal Large Language Models Embrace Efficient + In-Context Learning + + +
+ In-context learning (ICL) enables Large Language Models (LLMs) to exhibit
+emergent abilities on downstream tasks without updating billions of
+parameters. However, in the area of multi-modal Large Language Models (MLLMs),
+two problems hinder the application of multi-modal ICL: (1) Most primary MLLMs
+are only trained on single-image datasets, making them unable to read
+multi-modal demonstrations. (2) As the number of demonstrations increases,
+thousands of visual tokens place a heavy burden on hardware and degrade ICL
+performance. During preliminary explorations, we discovered that the inner LLM
+tends to focus more on the linguistic modality within multi-modal
+demonstrations to generate responses. Therefore, we propose a general and
+lightweight framework, \textbf{AIM}, to tackle these problems by
+\textbf{A}ggregating \textbf{I}mage information of \textbf{M}ultimodal
+demonstrations into the dense latent space of the corresponding linguistic
+part. Specifically, AIM first uses the frozen backbone MLLM to read each
+image-text demonstration and extracts the vector representations on top of the
+text. These vectors naturally fuse the information of the image-text pair, and
+AIM transforms them into fused virtual tokens acceptable to the inner LLM via
+a trainable projection layer. Ultimately, these fused tokens function as
+variants of multi-modal demonstrations and are fed into the MLLM to direct its
+response to the current query as usual. Because these fused tokens stem from
+the textual component of the image-text pair, a multi-modal demonstration is
+nearly reduced to a pure textual demonstration, so AIM applies seamlessly to
+any MLLM. With the backbone MLLM kept frozen, AIM is parameter-efficient, and
+we train it on public multi-modal web corpora that have nothing to do with the
+downstream test tasks.
+
+
+
+
+ + ♻ ☆ MedCalc-Bench: Evaluating Large Language Models for Medical Calculations + + +
+ As opposed to evaluating computation and logic-based reasoning, current +benchmarks for evaluating large language models (LLMs) in medicine are +primarily focused on question-answering involving domain knowledge and +descriptive reasoning. While such qualitative capabilities are vital to medical +diagnosis, in real-world scenarios, doctors frequently use clinical calculators +that follow quantitative equations and rule-based reasoning paradigms for +evidence-based decision support. To this end, we propose MedCalc-Bench, a +first-of-its-kind dataset focused on evaluating the medical calculation +capability of LLMs. MedCalc-Bench contains an evaluation set of over 1000 +manually reviewed instances from 55 different medical calculation tasks. Each +instance in MedCalc-Bench consists of a patient note, a question requesting to +compute a specific medical value, a ground truth answer, and a step-by-step +explanation showing how the answer is obtained. While our evaluation results +show the potential of LLMs in this area, none of them are effective enough for +clinical settings. Common issues include extracting the incorrect entities, not +using the correct equation or rules for a calculation task, or incorrectly +performing the arithmetic for the computation. We hope our study highlights the +quantitative knowledge and reasoning gaps in LLMs within medical settings, +encouraging future improvements of LLMs for various clinical calculation tasks. + +
+
+ comment: Github link: https://github.com/ncbi-nlp/MedCalc-Bench HuggingFace + link: https://huggingface.co/datasets/nsk7153/MedCalc-Bench +
+
+
+
+
+ + ♻ ☆ EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees + + +
+ Inference with modern Large Language Models (LLMs) is expensive and
+time-consuming, and speculative sampling has proven to be an effective
+solution. Most speculative sampling methods, such as EAGLE, use a static draft
+tree, implicitly assuming that the acceptance rate of draft tokens depends
+only on their position. Interestingly, we found that the acceptance rate of
+draft tokens is also context-dependent. In this paper, building upon EAGLE, we
+propose EAGLE-2, which introduces a context-aware dynamic draft tree into the
+drafting process. This improvement leverages the fact that the draft model of
+EAGLE is well-calibrated: the confidence scores from the draft model
+approximate acceptance rates with small errors. We conducted extensive
+evaluations on three series of LLMs and six tasks, with EAGLE-2 achieving
+speedup ratios of 3.05x-4.26x, which is 20%-40% faster than EAGLE-1. EAGLE-2
+also ensures that the distribution of the generated text remains unchanged,
+making it a lossless acceleration algorithm.
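+ A toy sketch of confidence-driven draft-tree growth is given below: the tree
+repeatedly expands the leaf with the highest cumulative draft confidence until
+a token budget is reached. The stub draft model, the top-k branching factor,
+and the product-of-confidences score are illustrative assumptions rather than
+EAGLE-2's actual drafting procedure.
+
+    import heapq, itertools, random
+
+    def draft_topk(prefix, k=2):
+        """Stub draft model: k candidate next tokens with confidence scores."""
+        random.seed(len(prefix))
+        return [(f"tok{len(prefix)}_{i}", random.uniform(0.3, 0.9)) for i in range(k)]
+
+    def build_draft_tree(prompt, budget=8):
+        """Grow a draft tree by always expanding the highest-confidence leaf."""
+        order = itertools.count()
+        heap = [(-1.0, next(order), [prompt])]   # (-cumulative confidence, tiebreak, path)
+        accepted = []
+        while heap and len(accepted) < budget:
+            neg_conf, _, path = heapq.heappop(heap)
+            accepted.append((path, -neg_conf))
+            for tok, p in draft_topk(path):
+                heapq.heappush(heap, (neg_conf * p, next(order), path + [tok]))
+        return accepted
+
+    for path, conf in build_draft_tree("<prompt>"):
+        print(round(conf, 3), path)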
+
+
+
+
+ + ♻ ☆ LongRAG: Enhancing Retrieval-Augmented Generation with Long-context LLMs + + +
+ In the traditional RAG framework, the basic retrieval units are normally
+short. Common retrievers like DPR normally work with 100-word Wikipedia
+paragraphs. Such a design forces the retriever to search over a large corpus
+to find the `needle' unit. In contrast, the readers only need to extract
+answers from the short retrieved units. Such an imbalanced `heavy' retriever
+and `light' reader design can lead to sub-optimal performance. To alleviate
+the imbalance, we propose a new framework, LongRAG, consisting of a `long
+retriever' and a `long reader'. LongRAG processes the entire Wikipedia into
+4K-token units, which are 30x longer than before. By increasing the unit size,
+we significantly reduce the total number of units from 22M to 700K. This
+significantly lowers the burden on the retriever, which leads to a remarkable
+retrieval score: answer recall@1=71% on NQ (previously 52%) and answer
+recall@2=72% (previously 47%) on HotpotQA (full-wiki). Then we feed the top-k
+retrieved units ($\approx$ 30K tokens) to an existing long-context LLM to
+perform zero-shot answer extraction. Without requiring any training, LongRAG
+achieves an EM of 62.7% on NQ, which is the best known result. LongRAG also
+achieves 64.3% on HotpotQA (full-wiki), which is on par with the SoTA model.
+Our study offers insights into the future roadmap for combining RAG with
+long-context LLMs.
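+ The retrieval side of this design can be sketched in a few lines: pack short
+passages into long units, score units against the question, and hand the top
+units to a long-context reader. The greedy packing, the bag-of-words scoring
+stand-in, and the word-count token budget are illustrative assumptions, not
+the paper's retriever.
+
+    from collections import Counter
+
+    def make_long_units(passages, max_tokens=4096):
+        """Greedily pack short passages into units of up to max_tokens words."""
+        units, current, length = [], [], 0
+        for p in passages:
+            n = len(p.split())
+            if current and length + n > max_tokens:
+                units.append(" ".join(current))
+                current, length = [], 0
+            current.append(p)
+            length += n
+        if current:
+            units.append(" ".join(current))
+        return units
+
+    def score(question, unit):
+        """Bag-of-words overlap as a stand-in for a learned retriever."""
+        q, u = Counter(question.lower().split()), Counter(unit.lower().split())
+        return sum((q & u).values())
+
+    def retrieve(question, units, k=2):
+        return sorted(units, key=lambda u: score(question, u), reverse=True)[:k]
+
+    passages = ["The Eiffel Tower is in Paris.",
+                "Mount Fuji is in Japan.",
+                "Paris is the capital of France."]
+    units = make_long_units(passages, max_tokens=8)
+    context = "\n".join(retrieve("Where is the Eiffel Tower located?", units))
+    print(context)   # this long context would then go to a long-context LLM reader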
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Fine-Tuning or Fine-Failing? Debunking Performance Myths in Large + Language Models + + +
+ Large Language Models (LLMs) have the unique capability to understand and
+generate human-like text from input queries. When fine-tuned, these models
+show enhanced performance on domain-specific queries. OpenAI highlights the
+process of fine-tuning, stating: "To fine-tune a model, you are required to
+provide at least 10 examples. We typically see clear improvements from
+fine-tuning on 50 to 100 training examples, but the right number varies
+greatly based on the exact use case." This study extends this concept to the
+integration of LLMs within Retrieval-Augmented Generation (RAG) pipelines,
+which aim to improve accuracy and relevance by leveraging external corpus data
+for information retrieval. However, RAG's promise of delivering optimal
+responses often falls short in complex query scenarios. This study aims to
+specifically examine the effects of fine-tuning LLMs on their ability to
+extract and integrate contextual data to enhance the performance of RAG
+systems across multiple domains. We evaluate the impact of fine-tuning on the
+LLMs' capacity for data extraction and contextual understanding by comparing
+the accuracy and completeness of fine-tuned models against baseline
+performances across datasets from multiple domains. Our findings indicate that
+fine-tuning resulted in a decline in performance compared to the baseline
+models, contrary to the improvements observed in standalone LLM applications
+as suggested by OpenAI. This study highlights the need for rigorous
+investigation and validation of fine-tuned models for domain-specific tasks.
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ LLM Critics Help Catch Bugs in Mathematics: Towards a Better + Mathematical Verifier with Natural Language Feedback + + +
+ Mathematical verifiers achieve success in mathematical reasoning tasks by
+validating the correctness of solutions. However, existing verifiers are
+trained with binary classification labels, which are not informative enough
+for the model to accurately assess the solutions. To mitigate this
+insufficiency of binary labels, we introduce step-wise natural language
+feedback as rationale labels (i.e., the correctness of the current step and
+the explanations). In this paper, we propose \textbf{Math-Minos}, a natural
+language feedback-enhanced verifier, built with automatically generated
+training data and a two-stage training paradigm for effective training and
+efficient inference. Our experiments reveal that a small set (30k) of natural
+language feedback examples can significantly boost the performance of the
+verifier, by 1.6\% in accuracy (86.6\% $\rightarrow$ 88.2\%) on GSM8K and
+0.8\% (37.8\% $\rightarrow$ 38.6\%) on MATH. We have released our code and
+data for further exploration.
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Can Many-Shot In-Context Learning Help Long-Context LLM Judges? See + More, Judge Better! + + +
+ Leveraging Large Language Models (LLMs) as judges for evaluating the
+performance of LLMs has recently garnered attention. However, this type of
+approach is affected by the potential biases in LLMs, raising concerns about
+the reliability of the evaluation results. To mitigate this issue, we propose
+and study two versions of many-shot in-context prompts for helping
+GPT-4o-as-a-Judge in single-answer grading, based on two existing settings of
+many-shot ICL: Reinforced ICL and Unsupervised ICL. Concretely, the former
+utilizes in-context examples with model-generated rationales, while the latter
+does not. Based on the designed prompts, we investigate the impact of scaling
+the number of in-context examples on the consistency and quality of the
+judgment results. Furthermore, we reveal the symbol bias hidden in the
+pairwise comparison of GPT-4o-as-a-Judge and propose a simple yet effective
+approach to mitigate it. Experimental results show that advanced long-context
+LLMs, such as GPT-4o, perform better in the many-shot regime than in the
+zero-shot regime. Meanwhile, the experimental results further verify the
+effectiveness of the symbol bias mitigation approach.
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Identifying User Goals from UI Trajectories + + +
+ Autonomous agents that interact with graphical user interfaces (GUIs) hold
+significant potential for enhancing user experiences. To further improve these
+experiences, agents need to be personalized and proactive. By effectively
+comprehending user intentions through their actions and interactions with
+GUIs, agents will be better positioned to achieve these goals. This paper
+introduces the task of goal identification from observed UI trajectories,
+aiming to infer the user's intended task based on their GUI interactions. We
+propose a novel evaluation metric to assess whether two task descriptions are
+paraphrases within a specific UI environment. By leveraging the inverse
+relation with the UI automation task, we utilized the Android-In-The-Wild and
+Mind2Web datasets for our experiments. Using our metric and these datasets, we
+conducted several experiments comparing the performance of humans and
+state-of-the-art models, specifically GPT-4 and Gemini-1.5 Pro. Our results
+show that Gemini performs better than GPT but still underperforms compared to
+humans, indicating significant room for improvement.
+
+
+
+
+ + ♻ ☆ On the Use of Large Language Models to Generate Capability Ontologies + + +
+ Capability ontologies are increasingly used to model functionalities of +systems or machines. The creation of such ontological models with all +properties and constraints of capabilities is very complex and can only be done +by ontology experts. However, Large Language Models (LLMs) have shown that they +can generate machine-interpretable models from natural language text input and +thus support engineers / ontology experts. Therefore, this paper investigates +how LLMs can be used to create capability ontologies. We present a study with a +series of experiments in which capabilities with varying complexities are +generated using different prompting techniques and with different LLMs. Errors +in the generated ontologies are recorded and compared. To analyze the quality +of the generated ontologies, a semi-automated approach based on RDF syntax +checking, OWL reasoning, and SHACL constraints is used. The results of this +study are very promising because even for complex capabilities, the generated +ontologies are almost free of errors. + +
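+ The semi-automated checking pipeline described above can be approximated
+with off-the-shelf libraries; a minimal sketch is given below, assuming
+rdflib for the RDF syntax check and pySHACL for constraint validation, with
+RDFS-level inference standing in for the OWL reasoning step. The Turtle
+snippet and the shapes file path are placeholders.
+
+    from rdflib import Graph
+    from pyshacl import validate
+
+    generated_ttl = """
+    @prefix ex: <http://example.org/> .
+    ex:Drilling a ex:Capability ;
+        ex:hasInput ex:Workpiece .
+    """
+
+    # 1) RDF syntax check: parsing fails loudly on malformed Turtle.
+    data_graph = Graph().parse(data=generated_ttl, format="turtle")
+
+    # 2) Inference plus 3) SHACL constraint checking in one call.
+    conforms, _, report_text = validate(
+        data_graph,
+        shacl_graph="capability_shapes.ttl",   # placeholder path to SHACL shapes
+        inference="rdfs",
+    )
+    print("conforms:", conforms)
+    print(report_text)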
+
+
+
+
+ + ♻ ☆ Small Language Models Learn Enhanced Reasoning Skills from Medical + Textbooks + + +
+ While recent advancements in commercial large language models (LM) have shown +promising results in medical tasks, their closed-source nature poses +significant privacy and security concerns, hindering their widespread use in +the medical field. Despite efforts to create open-source models, their limited +parameters often result in insufficient multi-step reasoning capabilities +required for solving complex medical problems. To address this, we introduce +Meerkat, a new family of medical AI systems ranging from 7 to 70 billion +parameters. The models were trained using our new synthetic dataset consisting +of high-quality chain-of-thought reasoning paths sourced from 18 medical +textbooks, along with diverse instruction-following datasets. Our systems +achieved remarkable accuracy across six medical benchmarks, surpassing the +previous best models such as MediTron and BioMistral, and GPT-3.5 by a large +margin. Notably, Meerkat-7B surpassed the passing threshold of the United +States Medical Licensing Examination (USMLE) for the first time for a +7B-parameter model, while Meerkat-70B outperformed GPT-4 by an average of 1.3%. +Additionally, Meerkat-70B correctly diagnosed 21 out of 38 complex clinical +cases, outperforming humans' 13.8 and closely matching GPT-4's 21.8. Our +systems offered more detailed free-form responses to clinical queries compared +to existing small models, approaching the performance level of large commercial +models. This significantly narrows the performance gap with large LMs, +showcasing its effectiveness in addressing complex medical challenges. + +
+
+ comment: Added new LLaMA-3-based models and experiments on NEJM case + challenges +
+
+
+
+
+ + ♻ ☆ StrucText-Eval: An Autogenerated Benchmark for Evaluating Large Language + Model's Ability in Structure-Rich Text Understanding + + +
+ Given the substantial volumes of structured data held by many companies,
+enabling Large Language Models (LLMs) to directly understand structured text
+in non-structured forms could significantly enhance their capabilities across
+various business scenarios. To this end, we propose an evaluation data
+generation method for assessing LLMs' ability to understand structure-rich
+text, which generates structured data of controllable complexity based on
+manually crafted question templates and generation rules. Building on this
+generation method, we introduce StrucText-Eval, a benchmark comprising 6,032
+questions across 8 different structured languages and 29 specific tasks.
+Furthermore, considering human proficiency in rule-based tasks, we also
+present StrucText-Eval-Hard, which includes 3,016 questions designed to
+further examine the gap between LLMs and human performance. Results indicate
+that the best-performing LLM currently achieves an accuracy of 65.0\% on
+StrucText-Eval-Hard, while human accuracy reaches up to 95.7\%. Moreover,
+while fine-tuning using StrucText-Eval can enhance existing LLMs'
+understanding of all structured languages, it does not necessarily improve
+performance across all task types. The benchmark and generation code are
+open-sourced at https://github.com/MikeGu721/StrucText-Eval
+
+
+
+
+ + ♻ ☆ TemPrompt: Multi-Task Prompt Learning for Temporal Relation Extraction + in RAG-based Crowdsourcing Systems + + +
+ Temporal relation extraction (TRE) aims to grasp the evolution of events or +actions, and thus shape the workflow of associated tasks, so it holds promise +in helping understand task requests initiated by requesters in crowdsourcing +systems. However, existing methods still struggle with limited and unevenly +distributed annotated data. Therefore, inspired by the abundant global +knowledge stored within pre-trained language models (PLMs), we propose a +multi-task prompt learning framework for TRE (TemPrompt), incorporating prompt +tuning and contrastive learning to tackle these issues. To elicit more +effective prompts for PLMs, we introduce a task-oriented prompt construction +approach that thoroughly takes the myriad factors of TRE into consideration for +automatic prompt generation. In addition, we present temporal event reasoning +as a supplement to bolster the model's focus on events and temporal cues. The +experimental results demonstrate that TemPrompt outperforms all compared +baselines across the majority of metrics under both standard and few-shot +settings. A case study is provided to validate its effectiveness in +crowdsourcing scenarios. + +
+
+ comment: I submitted the manuscript without obtaining consent from all + co-authors +
+
+
+
+
+ + ♻ ☆ Mamo: a Mathematical Modeling Benchmark with Solvers + + +
+ Mathematical modeling involves representing real-world phenomena, systems,
+or problems using mathematical expressions and equations to analyze,
+understand, and predict their behavior. Given that this process typically
+requires experienced experts, there is an interest in exploring whether Large
+Language Models (LLMs) can undertake mathematical modeling to potentially
+decrease human labor. To evaluate LLMs in mathematical modeling, we introduce
+a new benchmark, Mamo, that transcends traditional result-oriented
+assessments. Unlike conventional methods that primarily assess LLMs based on
+the accuracy of solutions to mathematical problems, our approach offers deeper
+insight into the modeling process itself. By focusing on the processes LLMs
+undertake rather than the correctness of their final solutions, Mamo pioneers
+a novel evaluation paradigm. This shift underscores the importance of
+understanding the inherent modeling capabilities of LLMs, paving the way for a
+more nuanced and comprehensive analysis of their problem-solving strategies.
+Our work marks a significant advancement in the field, suggesting a new
+direction for future research by emphasizing the evaluation of LLMs' modeling
+processes over the mere correctness of answers. This benchmark not only
+facilitates a better understanding of LLMs' mathematical modeling capabilities
+but also sets a new standard for evaluating their performance in complex
+problem-solving scenarios.
+
+ comment: Project: https://github.com/FreedomIntelligence/Mamo Updates: 1. + include more models 2. minor modification of the metric with new results 3. + fix some typos 4. add error analysis with examples +
+
+
+
+
+ + ♻ ☆ Unveiling Themes in Judicial Proceedings: A Cross-Country Study Using + Topic Modeling on Legal Documents from India and the UK + + +
+ Legal documents are indispensable in every country for legal practices and +serve as the primary source of information regarding previous cases and +employed statutes. In today's world, with an increasing number of judicial +cases, it is crucial to systematically categorize past cases into subgroups, +which can then be utilized for upcoming cases and practices. Our primary focus +in this endeavor was to annotate cases using topic modeling algorithms such as +Latent Dirichlet Allocation, Non-Negative Matrix Factorization, and Bertopic +for a collection of lengthy legal documents from India and the UK. This step is +crucial for distinguishing the generated labels between the two countries, +highlighting the differences in the types of cases that arise in each +jurisdiction. Furthermore, an analysis of the timeline of cases from India was +conducted to discern the evolution of dominant topics over the years. + +
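+ The annotation workflow described above follows a standard recipe that can
+be sketched with scikit-learn: vectorise the documents, fit LDA and NMF, and
+read off the top terms per topic. The toy documents, topic counts, and
+omission of BERTopic are illustrative assumptions, not the paper's setup.
+
+    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+    from sklearn.decomposition import LatentDirichletAllocation, NMF
+
+    docs = [
+        "The appellant challenged the land acquisition order under the statute.",
+        "Bail was granted considering the nature of the offence and prior conduct.",
+        "The tribunal examined the tax assessment and allowed the appeal in part.",
+    ]
+
+    def top_terms(model, feature_names, n=5):
+        return [[feature_names[i] for i in comp.argsort()[-n:][::-1]]
+                for comp in model.components_]
+
+    # LDA on raw term counts
+    count_vec = CountVectorizer(stop_words="english")
+    counts = count_vec.fit_transform(docs)
+    lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
+    print("LDA topics:", top_terms(lda, count_vec.get_feature_names_out()))
+
+    # NMF on TF-IDF weights
+    tfidf_vec = TfidfVectorizer(stop_words="english")
+    tfidf = tfidf_vec.fit_transform(docs)
+    nmf = NMF(n_components=2, random_state=0).fit(tfidf)
+    print("NMF topics:", top_terms(nmf, tfidf_vec.get_feature_names_out()))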
+
+
+
+
+ + ♻ ☆ Continual Learning of Large Language Models: A Comprehensive Survey + + +
+ The recent success of large language models (LLMs) trained on static, +pre-collected, general datasets has sparked numerous research directions and +applications. One such direction addresses the non-trivial challenge of +integrating pre-trained LLMs into dynamic data distributions, task structures, +and user preferences. Pre-trained LLMs, when tailored for specific needs, often +experience significant performance degradation in previous knowledge domains -- +a phenomenon known as "catastrophic forgetting". While extensively studied in +the continual learning (CL) community, it presents new manifestations in the +realm of LLMs. In this survey, we provide a comprehensive overview of the +current research progress on LLMs within the context of CL. This survey is +structured into four main sections: we first describe an overview of +continually learning LLMs, consisting of two directions of continuity: vertical +continuity (or vertical continual learning), i.e., continual adaptation from +general to specific capabilities, and horizontal continuity (or horizontal +continual learning), i.e., continual adaptation across time and domains +(Section 3). We then summarize three stages of learning LLMs in the context of +modern CL: Continual Pre-Training (CPT), Domain-Adaptive Pre-training (DAP), +and Continual Fine-Tuning (CFT) (Section 4). Then we provide an overview of +evaluation protocols for continual learning with LLMs, along with the current +available data sources (Section 5). Finally, we discuss intriguing questions +pertaining to continual learning for LLMs (Section 6). The full list of papers +examined in this survey is available at +https://github.com/Wang-ML-Lab/llm-continual-learning-survey. + +
+
+ comment: 47 pages, 2 figures, 4 tables. Work in progress +
+
+
+
+
+ + ♻ ☆ CFMatch: Aligning Automated Answer Equivalence Evaluation with Expert + Judgments For Open-Domain Question Answering + + +
+ Question answering (QA) can only make progress if we know whether an answer
+is correct, but for many of the most challenging and interesting QA examples,
+current evaluation metrics for determining answer equivalence (AE) often do
+not align with human judgments, particularly for the more verbose, free-form
+answers from large language models (LLMs). There are two challenges: a lack of
+data, and models that are too big: LLM-based scorers can correlate better with
+human judges, but this task has only been tested on limited QA datasets, and
+even when such scorers are available, updating them is limited because LLMs
+are large and often expensive. We rectify both of these issues by providing
+clear and consistent guidelines for evaluating AE in machine QA, adopted from
+professional human QA contests. We also introduce a combination of standard
+evaluation and a more efficient, robust, and lightweight discriminative AE
+classifier-based matching method (CFMatch, smaller than 1 MB), trained and
+validated to more accurately evaluate answer correctness in accordance with
+the adopted expert AE rules that are more aligned with human judgments.
+
+ comment: A duplicate and polished version is in arXiv:2402.11161 +
+
+
+
+
+ + ♻ ☆ Biomedical Visual Instruction Tuning with Clinician Preference Alignment + + +
+ Recent advancements in multimodal foundation models have showcased impressive +capabilities in understanding and reasoning with visual and textual +information. Adapting these foundation models trained for general usage to +specialized domains like biomedicine requires large-scale domain-specific +instruction datasets. While existing works have explored curating such datasets +automatically, the resultant datasets are not explicitly aligned with domain +expertise. In this work, we propose a data-centric framework, Biomedical Visual +Instruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that +incorporates clinician preferences into both stages of generating and selecting +instruction data for tuning biomedical multimodal foundation models. First, +during the generation stage, we prompt the GPT-4V generator with a diverse set +of clinician-selected demonstrations for preference-aligned data candidate +generation. Then, during the selection phase, we train a separate selection +model, which explicitly distills clinician and policy-guided model preferences +into a rating function to select high-quality data for medical instruction +tuning. Results show that the model tuned with the instruction-following data +from our method demonstrates a significant improvement in open visual chat +(18.5% relatively) and medical VQA (win rate up to 81.73%). Our +instruction-following data and models are available at BioMed-VITAL.github.io. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 23 + +
+
+
+ + ♻ ☆ Simplex Clustering via sBeta with Applications to Online Adjustment of + Black-Box Predictions + + +
+ We explore clustering the softmax predictions of deep neural networks and +introduce a novel probabilistic clustering method, referred to as k-sBetas. In +the general context of clustering discrete distributions, the existing methods +focused on exploring distortion measures tailored to simplex data, such as the +KL divergence, as alternatives to the standard Euclidean distance. We provide a +general maximum a posteriori (MAP) perspective of clustering distributions, +emphasizing that the statistical models underlying the existing +distortion-based methods may not be descriptive enough. Instead, we optimize a +mixed-variable objective measuring data conformity within each cluster to the +introduced sBeta density function, whose parameters are constrained and +estimated jointly with binary assignment variables. Our versatile formulation +approximates various parametric densities for modeling simplex data and enables +the control of the cluster-balance bias. This yields highly competitive +performances for the unsupervised adjustment of black-box model predictions in +various scenarios. Our code and comparisons with the existing +simplex-clustering approaches and our introduced softmax-prediction benchmarks +are publicly available: +https://github.com/fchiaroni/Clustering_Softmax_Predictions. + +
+
+
+
+
+ + ♻ ☆ A Linear Time and Space Local Point Cloud Geometry Encoder via + Vectorized Kernel Mixture (VecKM) ICML2024 + + +
+ We propose VecKM, a local point cloud geometry encoder that is descriptive +and efficient to compute. VecKM leverages a unique approach by vectorizing a +kernel mixture to represent the local point cloud. Such representation's +descriptiveness is supported by two theorems that validate its ability to +reconstruct and preserve the similarity of the local shape. Unlike existing +encoders downsampling the local point cloud, VecKM constructs the local +geometry encoding using all neighboring points, producing a more descriptive +encoding. Moreover, VecKM is efficient to compute and scalable to large point +cloud inputs: VecKM reduces the memory cost from $(n^2+nKd)$ to $(nd+np)$; and +reduces the major runtime cost from computing $nK$ MLPs to $n$ MLPs, where $n$ +is the size of the point cloud, $K$ is the neighborhood size, $d$ is the +encoding dimension, and $p$ is a marginal factor. The efficiency is due to +VecKM's unique factorizable property that eliminates the need of explicitly +grouping points into neighbors. In the normal estimation task, VecKM +demonstrates not only 100x faster inference speed but also highest accuracy and +strongest robustness. In classification and segmentation tasks, integrating +VecKM as a preprocessing module achieves consistently better performance than +the PointNet, PointNet++, and point transformer baselines, and runs +consistently faster by up to 10 times. + +
+
+ comment: ICML2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ InstantSplat: Unbounded Sparse-view Pose-free Gaussian Splatting in 40 + Seconds + + +
+ While novel view synthesis (NVS) from a sparse set of images has advanced
+significantly in 3D computer vision, it relies on precise initial estimation
+of camera parameters using Structure-from-Motion (SfM). For instance, the
+recently developed Gaussian Splatting depends heavily on the accuracy of
+SfM-derived points and poses. However, SfM processes are time-consuming and
+often prove unreliable in sparse-view scenarios, where matched features are
+scarce, leading to accumulated errors and limited generalization capability
+across datasets. In this study, we introduce a novel and efficient framework
+to enhance robust NVS from sparse-view images. Our framework, InstantSplat,
+integrates multi-view stereo (MVS) predictions with point-based
+representations to construct 3D Gaussians of large-scale scenes from
+sparse-view data within seconds, addressing the aforementioned performance and
+efficiency issues caused by SfM. Specifically, InstantSplat generates densely
+populated surface points across all training views and determines the initial
+camera parameters using pixel alignment. Nonetheless, the MVS points are not
+globally accurate, and the pixel-wise prediction from all views results in an
+excessive number of Gaussians, yielding an overparameterized scene
+representation that compromises both training speed and accuracy. To address
+this issue, we employ a grid-based, confidence-aware Farthest Point Sampling
+to strategically position point primitives at representative locations in
+parallel. Next, we enhance pose accuracy and tune scene parameters through a
+gradient-based joint optimization framework with self-supervision. By
+employing this simplified framework, InstantSplat achieves a substantial
+reduction in training time, from hours to mere seconds, and demonstrates
+robust performance across various numbers of views in diverse datasets.
+
+ comment: Project Page: https://instantsplat.github.io/ +
+
+
+
+
+ + ♻ ☆ Inconsistency-Aware Cross-Attention for Audio-Visual Fusion in + Dimensional Emotion Recognition + + +
+ Leveraging complementary relationships across modalities has recently drawn a +lot of attention in multimodal emotion recognition. Most of the existing +approaches explored cross-attention to capture the complementary relationships +across the modalities. However, the modalities may also exhibit weak +complementary relationships, which may deteriorate the cross-attended features, +resulting in poor multimodal feature representations. To address this problem, +we propose Inconsistency-Aware Cross-Attention (IACA), which can adaptively +select the most relevant features on-the-fly based on the strong or weak +complementary relationships across audio and visual modalities. Specifically, +we design a two-stage gating mechanism that can adaptively select the +appropriate relevant features to deal with weak complementary relationships. +Extensive experiments are conducted on the challenging Aff-Wild2 dataset to +show the robustness of the proposed model. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2403.19554 +
+
+
+
+
+ + ♻ ☆ Vox-UDA: Voxel-wise Unsupervised Domain Adaptation for Cryo-Electron + Subtomogram Segmentation with Denoised Pseudo Labeling + + +
+ Cryo-Electron Tomography (cryo-ET) is a 3D imaging technology facilitating
+the study of macromolecular structures at near-atomic resolution. Recent
+volumetric segmentation approaches on cryo-ET images have drawn widespread
+interest in the biological sector. However, existing methods heavily rely on
+manually labeled data, which requires highly professional skills, thereby
+hindering the adoption of fully-supervised approaches for cryo-ET images. Some
+unsupervised domain adaptation (UDA) approaches have been designed to enhance
+the segmentation network performance using unlabeled data. However, applying
+these methods directly to cryo-ET image segmentation tasks remains challenging
+due to two main issues: 1) the source data, usually obtained through
+simulation, contain a certain level of noise, while the target data, collected
+directly from real-world raw data, have unpredictable noise levels; 2) the
+source data used for training typically consist of known macromolecules, while
+the target domain data are often unknown, causing the model's segmenter to be
+biased towards these known macromolecules and leading to a domain shift
+problem. To address these challenges, in this work, we introduce the first
+voxel-wise unsupervised domain adaptation approach, termed Vox-UDA,
+specifically for cryo-ET subtomogram segmentation. Vox-UDA incorporates a
+noise generation module to simulate target-like noise in the source dataset
+for cross-noise-level adaptation. Additionally, we propose a denoised
+pseudo-labeling strategy based on an improved Bilateral Filter to alleviate
+the domain shift problem. Experimental results on both simulated and real
+cryo-ET subtomogram datasets demonstrate the superiority of our proposed
+approach compared to state-of-the-art UDA methods.
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Common and Rare Fundus Diseases Identification Using Vision-Language + Foundation Model with Knowledge of Over 400 Diseases + + +
+ Previous foundation models for retinal images were pre-trained with limited
+disease categories and knowledge bases. Here we introduce RetiZero, a
+vision-language foundation model that leverages knowledge from over 400 fundus
+diseases. For RetiZero's pre-training, we compiled 341,896 fundus images
+paired with text descriptions, sourced from public datasets, ophthalmic
+literature, and online resources, encompassing a diverse range of diseases
+across multiple ethnicities and countries. RetiZero exhibits superior
+performance in several downstream tasks, including zero-shot disease
+recognition, image-to-image retrieval, and internal- and cross-domain disease
+identification. In zero-shot scenarios, RetiZero achieves Top-5 accuracy
+scores of 0.8430 for 15 fundus diseases and 0.7561 for 52 fundus diseases. For
+image retrieval, it achieves Top-5 scores of 0.9500 and 0.8860 for the same
+disease sets, respectively. Clinical evaluations show that RetiZero's Top-3
+zero-shot performance surpasses the average of 19 ophthalmologists from
+Singapore, China, and the United States. Furthermore, RetiZero significantly
+enhances clinicians' accuracy in diagnosing fundus disease. These findings
+underscore the value of integrating the RetiZero foundation model into
+clinical settings, where a variety of fundus diseases are encountered.
+
+
+
+
+ + ♻ ☆ Outlier-Robust Geometric Perception: A Novel Thresholding-Based + Estimator with Intra-Class Variance Maximization + + +
+ Geometric perception problems are fundamental tasks in robotics and computer +vision. In real-world applications, they often encounter the inevitable issue +of outliers, preventing traditional algorithms from making correct estimates. +In this paper, we present a novel general-purpose robust estimator TIVM +(Thresholding with Intra-class Variance Maximization) that can collaborate with +standard non-minimal solvers to efficiently reject outliers for geometric +perception problems. First, we introduce the technique of intra-class variance +maximization to design a dynamic 2-group thresholding method on the measurement +residuals, aiming to distinctively separate inliers from outliers. Then, we +develop an iterative framework that robustly optimizes the model by approaching +the pure-inlier group using a multi-layered dynamic thresholding strategy as +subroutine, in which a self-adaptive mechanism for layer-number tuning is +further employed to minimize the user-defined parameters. We validate the +proposed estimator on 3 classic geometric perception problems: rotation +averaging, point cloud registration and category-level perception, and +experiments show that it is robust against 70--90\% of outliers and can +converge typically in only 3--15 iterations, much faster than state-of-the-art +robust solvers such as RANSAC, GNC and ADAPT. Furthermore, another highlight is +that: our estimator can retain approximately the same level of robustness even +when the inlier-noise statistics of the problem are fully unknown. + +
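+ The 2-group thresholding step can be illustrated with a simple exhaustive
+scan over candidate thresholds on the measurement residuals; the Otsu-style
+between-group separation score used below is a stand-in for the paper's
+intra-class variance criterion, and the synthetic residuals are made up for
+illustration.
+
+    import numpy as np
+
+    def two_group_threshold(residuals):
+        """Scan candidate thresholds and keep the one that best separates two groups."""
+        r = np.sort(np.asarray(residuals, dtype=float))
+        best_t, best_score = r[0], -np.inf
+        for t in (r[:-1] + r[1:]) / 2:                # midpoints between sorted residuals
+            low, high = r[r <= t], r[r > t]
+            w1, w2 = len(low) / len(r), len(high) / len(r)
+            score = w1 * w2 * (low.mean() - high.mean()) ** 2   # between-group separation
+            if score > best_score:
+                best_t, best_score = t, score
+        return best_t
+
+    rng = np.random.default_rng(0)
+    residuals = np.concatenate([rng.normal(0.02, 0.01, 80),   # inliers
+                                rng.normal(1.00, 0.20, 20)])  # outliers
+    t = two_group_threshold(residuals)
+    print("threshold:", round(t, 3), "estimated inliers:", int((residuals <= t).sum()))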
+
+
+
+
+ + ♻ ☆ Fortify the Guardian, Not the Treasure: Resilient Adversarial Detectors + + +
+ This paper presents RADAR-Robust Adversarial Detection via Adversarial +Retraining-an approach designed to enhance the robustness of adversarial +detectors against adaptive attacks, while maintaining classifier performance. +An adaptive attack is one where the attacker is aware of the defenses and +adapts their strategy accordingly. Our proposed method leverages adversarial +training to reinforce the ability to detect attacks, without compromising clean +accuracy. During the training phase, we integrate into the dataset adversarial +examples, which were optimized to fool both the classifier and the adversarial +detector, enabling the adversarial detector to learn and adapt to potential +attack scenarios. Experimental evaluations on the CIFAR-10 and SVHN datasets +demonstrate that our proposed algorithm significantly improves a detector's +ability to accurately identify adaptive adversarial attacks -- without +sacrificing clean accuracy. + +
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: draftcls option +
+
+
+
+
+ + ♻ ☆ ViDiT-Q: Efficient and Accurate Quantization of Diffusion Transformers + for Image and Video Generation + + +
+ Diffusion transformers (DiTs) have exhibited remarkable performance in
+visual generation tasks, such as generating realistic images or videos based
+on textual instructions. However, larger model sizes and multi-frame
+processing for video generation lead to increased computational and memory
+costs, posing challenges for practical deployment on edge devices.
+Post-Training Quantization (PTQ) is an effective method for reducing memory
+costs and computational complexity. When quantizing diffusion transformers, we
+find that applying existing diffusion quantization methods designed for U-Net
+faces challenges in preserving quality. After analyzing the major challenges
+for quantizing diffusion transformers, we design an improved quantization
+scheme, ViDiT-Q (Video and Image Diffusion Transformer Quantization), to
+address these issues. Furthermore, we identify highly sensitive layers and
+timesteps that hinder quantization at lower bit-widths. To tackle this, we
+improve ViDiT-Q with a novel metric-decoupled mixed-precision quantization
+method (ViDiT-Q-MP). We validate the effectiveness of ViDiT-Q across a variety
+of text-to-image and video models. While baseline quantization methods fail at
+W8A8 and produce unreadable content at W4A8, ViDiT-Q achieves lossless W8A8
+quantization. ViDiT-Q-MP achieves W4A8 with negligible visual quality
+degradation, resulting in a 2.5x memory optimization and a 1.5x latency
+speedup.
+
+ comment: Project Page: https://a-suozhang.xyz/viditq.github.io/ +
+
+
+
+
+ + ♻ ☆ Learning to Adapt Foundation Model DINOv2 for Capsule Endoscopy + Diagnosis + + +
+ Foundation models have become prominent in computer vision, achieving
+notable success in various tasks. However, their effectiveness largely depends
+on pre-training with extensive datasets. Applying foundation models directly
+to small datasets of capsule endoscopy images from scratch is challenging.
+Pre-training on broad, general vision datasets is crucial for successfully
+fine-tuning our model for specific tasks. In this work, we introduce a
+simplified approach that adapts foundation models with a low-rank adaptation
+(LoRA) technique for easier customization. Our method, inspired by the DINOv2
+foundation model, applies low-rank adaptation learning to tailor foundation
+models for capsule endoscopy diagnosis effectively. Unlike traditional
+fine-tuning methods, our strategy includes LoRA layers designed to absorb
+specific surgical domain knowledge. During the training process, we keep the
+main model (the backbone encoder) fixed and focus on optimizing the LoRA
+layers and the disease classification component. We tested our method on two
+publicly available datasets for capsule endoscopy disease classification. The
+results were impressive, with our model achieving 97.75% accuracy on the
+Kvasir-Capsule dataset and 98.81% on the Kvasirv2 dataset. Our solution
+demonstrates that foundation models can be adeptly adapted for capsule
+endoscopy diagnosis, highlighting that mere reliance on straightforward
+fine-tuning or pre-trained models from general computer vision tasks is
+inadequate for such specific applications.
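+ A minimal PyTorch sketch of this adaptation recipe is given below: wrap a
+linear layer with a trainable low-rank update, freeze the original weights,
+and optimise only the adapters together with a classification head. The toy
+single-layer "backbone", rank, and scaling are illustrative assumptions, not
+the paper's DINOv2 configuration.
+
+    import torch
+    import torch.nn as nn
+
+    class LoRALinear(nn.Module):
+        """A frozen linear layer plus a trainable low-rank update: W x + s * B(A(x))."""
+        def __init__(self, base: nn.Linear, rank=8, alpha=16):
+            super().__init__()
+            self.base = base
+            self.base.weight.requires_grad_(False)       # keep pretrained weights fixed
+            if self.base.bias is not None:
+                self.base.bias.requires_grad_(False)
+            self.lora_a = nn.Linear(base.in_features, rank, bias=False)
+            self.lora_b = nn.Linear(rank, base.out_features, bias=False)
+            nn.init.zeros_(self.lora_b.weight)           # start as a zero update
+            self.scale = alpha / rank
+
+        def forward(self, x):
+            return self.base(x) + self.scale * self.lora_b(self.lora_a(x))
+
+    backbone = LoRALinear(nn.Linear(384, 384))           # toy stand-in for one encoder block
+    classifier = nn.Linear(384, 2)                       # e.g. normal vs. abnormal frame
+    params = [p for m in (backbone, classifier) for p in m.parameters() if p.requires_grad]
+    opt = torch.optim.AdamW(params, lr=1e-4)
+
+    x, y = torch.randn(4, 384), torch.randint(0, 2, (4,))
+    loss = nn.functional.cross_entropy(classifier(backbone(x)), y)
+    loss.backward()
+    opt.step()
+    print("trainable parameters:", sum(p.numel() for p in params))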
+
+ comment: To appear in ICBIR 2024 +
+
+
+
+
+ + ♻ ☆ PUDD: Towards Robust Multi-modal Prototype-based Deepfake Detection CVPR2024 + + +
+ Deepfake techniques generate highly realistic data, making it challenging for +humans to discern between actual and artificially generated images. Recent +advancements in deep learning-based deepfake detection methods, particularly +with diffusion models, have shown remarkable progress. However, there is a +growing demand for real-world applications to detect unseen individuals, +deepfake techniques, and scenarios. To address this limitation, we propose a +Prototype-based Unified Framework for Deepfake Detection (PUDD). PUDD offers a +detection system based on similarity, comparing input data against known +prototypes for video classification and identifying potential deepfakes or +previously unseen classes by analyzing drops in similarity. Our extensive +experiments reveal three key findings: (1) PUDD achieves an accuracy of 95.1% +on Celeb-DF, outperforming state-of-the-art deepfake detection methods; (2) +PUDD leverages image classification as the upstream task during training, +demonstrating promising performance in both image classification and deepfake +detection tasks during inference; (3) PUDD requires only 2.7 seconds for +retraining on new data and emits 10$^{5}$ times less carbon compared to the +state-of-the-art model, making it significantly more environmentally friendly. + +
+
+ comment: CVPR2024 +
+
+
+
+
+ + ♻ ☆ NaVid: Video-based VLM Plans the Next Step for Vision-and-Language + Navigation + + +
+ Vision-and-language navigation (VLN) stands as a key research problem of +Embodied AI, aiming at enabling agents to navigate in unseen environments +following linguistic instructions. In this field, generalization is a +long-standing challenge, either to out-of-distribution scenes or from Sim to +Real. In this paper, we propose NaVid, a video-based large vision language +model (VLM), to mitigate such a generalization gap. NaVid makes the first +endeavor to showcase the capability of VLMs to achieve state-of-the-art level +navigation performance without any maps, odometers, or depth inputs. Following +human instruction, NaVid only requires an on-the-fly video stream from a +monocular RGB camera equipped on the robot to output the next-step action. Our +formulation mimics how humans navigate and naturally gets rid of the problems +introduced by odometer noises, and the Sim2Real gaps from map or depth inputs. +Moreover, our video-based approach can effectively encode the historical +observations of robots as spatio-temporal contexts for decision making and +instruction following. We train NaVid with 510k navigation samples collected +from continuous environments, including action-planning and +instruction-reasoning samples, along with 763k large-scale web data. Extensive +experiments show that NaVid achieves state-of-the-art performance in simulation +environments and the real world, demonstrating superior cross-dataset and +Sim2Real transfer. We thus believe our proposed VLM approach plans the next +step for not only the navigation agents but also this research field. + +
+
+ comment: Accepted by Robotics: Science and Systems (RSS 2024) +
+
+
+
+
+ + ♻ ☆ IntegratedPIFu: Integrated Pixel Aligned Implicit Function for + Single-view Human Reconstruction ECCV 2022 + + +
+ We propose IntegratedPIFu, a new pixel-aligned implicit model that builds on
+the foundation set by PIFuHD. IntegratedPIFu shows how depth and human parsing
+information can be predicted and capitalised upon in a pixel-aligned implicit
+model. In addition, IntegratedPIFu introduces depth-oriented sampling, a novel
+training scheme that improves any pixel-aligned implicit model's ability to
+reconstruct important human features without noisy artefacts. Lastly,
+IntegratedPIFu presents a new architecture that, despite using fewer model
+parameters than PIFuHD, is able to improve the structural correctness of
+reconstructed meshes. Our results show that IntegratedPIFu significantly
+outperforms existing state-of-the-art methods on single-view human
+reconstruction. Our code has been made available online.
+
+ comment: Accepted to ECCV 2022 +
+
+
+
+
+ + ♻ ☆ AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a + Single High-Resolution Image SP + + +
+ The process of estimating and counting tree density using only a single
+aerial or satellite image is a difficult task in the fields of photogrammetry
+and remote sensing. However, it plays a crucial role in the management of
+forests. The huge variety of trees in varied topography severely hinders tree
+counting models from performing well. The purpose of this paper is to propose
+a framework that is learnt from the source domain with sufficient labeled
+trees and is adapted to the target domain with only a limited number of
+labeled trees. Our method, termed AdaTreeFormer, contains one shared encoder
+with a hierarchical feature extraction scheme to extract robust features from
+the source and target domains. It also consists of three subnets: two for
+extracting self-domain attention maps from the source and target domains
+respectively, and one for extracting cross-domain attention maps. For the
+latter, an attention-to-adapt mechanism is introduced to distill relevant
+information from different domains while generating tree density maps; a
+hierarchical cross-domain feature alignment scheme is proposed that
+progressively aligns the features from the source and target domains. We also
+adopt adversarial learning into the framework to further reduce the gap
+between the source and target domains. AdaTreeFormer is evaluated on six
+designed domain adaptation tasks using three tree counting datasets, i.e.,
+Jiangsu, Yosemite, and London. Experimental results show that AdaTreeFormer
+significantly surpasses the state of the art, e.g., in the cross-domain
+setting from the Yosemite to the Jiangsu dataset, it achieves a reduction of
+15.9 points in the absolute counting errors and an increase of 10.8\% in the
+accuracy of the detected trees' locations. The codes and datasets are
+available at https://github.com/HAAClassic/AdaTreeFormer.
+
+ comment: Accepted in ISPRS Journal of Photogrammetry and Remote Sensing +
+
+
+
+
+ + ♻ ☆ Diffusion Schrödinger Bridge Models for High-Quality MR-to-CT + Synthesis for Head and Neck Proton Treatment Planning + + +
+ In recent advancements in proton therapy, MR-based treatment planning is +gaining momentum to minimize additional radiation exposure compared to +traditional CT-based methods. This transition highlights the critical need for +accurate MR-to-CT image synthesis, which is essential for precise proton dose +calculations. Our research introduces the Diffusion Schr\"odinger Bridge Models +(DSBM), an innovative approach for high-quality MR-to-CT synthesis. DSBM learns +the nonlinear diffusion processes between MR and CT data distributions. This +method improves upon traditional diffusion models by initiating synthesis from +the prior distribution rather than the Gaussian distribution, enhancing both +generation quality and efficiency. We validated the effectiveness of DSBM on a +head and neck cancer dataset, demonstrating its superiority over traditional +image synthesis methods through both image-level and dosimetric-level +evaluations. The effectiveness of DSBM in MR-based proton treatment planning +highlights its potential as a valuable tool in various clinical scenarios. + +
+
+ comment: International Conference on the use of Computers in Radiation therapy + (ICCR) +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for automated image data annotation: empirical + studies using text prompts from Grounding DINO + + +
+ Grounding DINO and the Segment Anything Model (SAM) have achieved impressive +performance in zero-shot object detection and image segmentation, respectively. +Together, they have a great potential to revolutionize applications in +zero-shot semantic segmentation or data annotation. Yet, in specialized domains +like medical image segmentation, objects of interest (e.g., organs, tissues, +and tumors) may not fall under existing class names. To address this problem, the +referring expression comprehension (REC) ability of Grounding DINO is leveraged +to detect arbitrary targets by their language descriptions. However, recent +studies have highlighted a severe limitation of the REC framework in this +application setting owing to its tendency to make false positive predictions +when the target is absent from the given image. While this bottleneck is +central to the prospect of open-set semantic segmentation, it is still largely +unknown how much improvement can be achieved by studying the prediction errors. +To this end, we perform empirical studies on six publicly available datasets +across different domains and reveal that these errors consistently follow a +predictable pattern and can, thus, be mitigated by a simple strategy. +Specifically, we show that false positive detections with appreciable +confidence scores generally occupy large image areas and can usually be +filtered by their relative sizes. More importantly, we expect these +observations to inspire future research in improving REC-based detection and +automated segmentation. Meanwhile, we evaluate the performance of SAM on +multiple datasets from various specialized domains and report significant +improvements in segmentation performance and annotation time savings over +manual approaches. + +
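As a rough illustration of the size-based filtering described above, the sketch below drops detections whose boxes cover more than a fixed fraction of the image before they are passed to SAM; the 0.5 threshold and the function name are illustrative assumptions rather than values taken from the paper.

```python
# Hypothetical sketch of the relative-size filter: boxes covering a large share
# of the image are treated as likely false positives. The threshold is illustrative.
def filter_by_relative_size(boxes, image_width, image_height, max_fraction=0.5):
    """boxes: iterable of (x1, y1, x2, y2) pixel coordinates."""
    image_area = image_width * image_height
    kept = []
    for (x1, y1, x2, y2) in boxes:
        box_area = max(0, x2 - x1) * max(0, y2 - y1)
        if box_area / image_area <= max_fraction:
            kept.append((x1, y1, x2, y2))
    return kept
```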
+
+
+
+
+ + ♻ ☆ IDLS: Inverse Depth Line based Visual-Inertial SLAM + + +
+ For robust visual-inertial SLAM in perceptually-challenging indoor +environments, recent studies exploit line features to extract descriptive +information about scene structure to deal with the degeneracy of point +features. However, existing point-line-based SLAM methods mainly use the Pl\"ucker +matrix or the orthogonal representation to represent a line, which requires at least four variables to determine a line. Given the numerous line +features to determine in each frame, the overly flexible line representation +increases the computation burden and compromises the accuracy of the results. In +this paper, we propose an inverse depth representation for a line, which models +each extracted line feature using only two variables, i.e., the inverse depths +of the two endpoints. It exploits the fact that the projected line's pixel +coordinates on the image plane are rather accurate, which partially constrains +the line. Using this compact line representation, Inverse Depth Line SLAM (IDLS) +is proposed to track the line features in SLAM in an accurate and efficient +way. A robust line triangulation method and a novel line re-projection error +model are introduced, and a two-step optimization method is proposed to first +determine the lines and then estimate the camera poses in each frame. IDLS +is extensively evaluated on multiple perceptually-challenging datasets. The +results show that it is more accurate and robust, and incurs lower computational +overhead than current state-of-the-art point-line-based SLAM methods. + +
+
+
+
+
+ + ♻ ☆ CDFormer: When Degradation Prediction Embraces Diffusion Model for Blind + Image Super-Resolution + + +
+ Existing Blind image Super-Resolution (BSR) methods focus on estimating +either kernel or degradation information, but have long overlooked the +essential content details. In this paper, we propose a novel BSR approach, +Content-aware Degradation-driven Transformer (CDFormer), to capture both +degradation and content representations. However, low-resolution images cannot +provide enough content details, and thus we introduce a diffusion-based module +$CDFormer_{diff}$ to first learn Content Degradation Prior (CDP) in both low- +and high-resolution images, and then approximate the real distribution given +only low-resolution information. Moreover, we apply an adaptive SR network +$CDFormer_{SR}$ that effectively utilizes CDP to refine features. Compared to +previous diffusion-based SR methods, we treat the diffusion model as an +estimator that can overcome the limitations of expensive sampling time and +excessive diversity. Experiments show that CDFormer can outperform existing +methods, establishing a new state-of-the-art performance on various benchmarks +under blind settings. Codes and models will be available at +\href{https://github.com/I2-Multimedia-Lab/CDFormer}{https://github.com/I2-Multimedia-Lab/CDFormer}. + +
+
+
+
+
+ + ♻ ☆ Flash-VStream: Memory-Based Real-Time Understanding for Long Video + Streams + + +
+ Benefiting from the advancements in large language models and cross-modal +alignment, existing multi-modal video understanding methods have achieved +prominent performance in the offline scenario. However, online video streams, as +one of the most common media forms in the real world, have seldom received +attention. Compared to offline videos, the 'dynamic' nature of online video +streams poses challenges for the direct application of existing models and +introduces new problems, such as the storage of extremely long-term +information and the interaction between continuous visual content and 'asynchronous' +user questions. Therefore, in this paper we present Flash-VStream, a +video-language model that simulates the memory mechanism of humans. Our model is +able to process extremely long video streams in real time and respond to user +queries simultaneously. Compared to existing models, Flash-VStream achieves +significant reductions in inference latency and VRAM consumption, which is +essential for understanding online streaming video. In +addition, given that existing video understanding benchmarks predominantly +concentrate on the offline scenario, we propose VStream-QA, a novel question +answering benchmark specifically designed for online video streaming +understanding. Comparisons with popular existing methods on the proposed +benchmark demonstrate the superiority of our method in such a challenging +setting. To verify the generalizability of our approach, we further evaluate it +on existing video understanding benchmarks, where it achieves state-of-the-art +performance in offline scenarios as well. All code, models, and datasets are +available at https://invinciblewyq.github.io/vstream-page/ + +
+
+
+
+
+ + ♻ ☆ BMapEst: Estimation of Brain Tissue Probability Maps using a + Differentiable MRI Simulator + + +
+ Reconstructing digital brain phantoms in the form of voxel-based, +multi-channeled tissue probability maps for individual subjects is essential +for capturing brain anatomical variability, understanding neurological +diseases, as well as for testing image processing methods. We demonstrate the +first framework that estimates brain tissue probability maps (Grey Matter - GM, +White Matter - WM, and Cerebrospinal fluid - CSF) with the help of a +Physics-based differentiable MRI simulator that models the magnetization signal +at each voxel in the volume. Given an observed $T_1$/$T_2$-weighted MRI scan, +the corresponding clinical MRI sequence, and the MRI differentiable simulator, +we estimate the simulator's input probability maps by back-propagating the L2 +loss between the simulator's output and the $T_1$/$T_2$-weighted scan. This +approach has the significant advantage of not relying on any training data and +instead uses the strong inductive bias of the MRI simulator. We tested the +model on 20 scans from the BrainWeb database and demonstrated a highly accurate +reconstruction of GM, WM, and CSF. Our source code is available online: +https://github.com/BioMedAI-UCSC/BMapEst. + +
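A minimal sketch of the optimization loop implied by the abstract, written in PyTorch under stated assumptions: `simulate` is a hypothetical stand-in for the differentiable MRI signal model, and the softmax parameterization that keeps the GM/WM/CSF maps on the probability simplex is an illustrative choice, not necessarily the authors'.

```python
import torch

def estimate_tissue_maps(simulate, observed_scan, shape, steps=200, lr=1e-2):
    # Learnable logits for GM, WM, CSF; softmax yields valid probability maps.
    logits = torch.zeros((3, *shape), requires_grad=True)
    optimizer = torch.optim.Adam([logits], lr=lr)
    for _ in range(steps):
        optimizer.zero_grad()
        maps = torch.softmax(logits, dim=0)
        loss = torch.mean((simulate(maps) - observed_scan) ** 2)  # L2 data fit
        loss.backward()   # gradients flow back through the differentiable simulator
        optimizer.step()
    return torch.softmax(logits, dim=0).detach()
```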
+
+
+
+
+ + ♻ ☆ Biomedical Visual Instruction Tuning with Clinician Preference Alignment + + +
+ Recent advancements in multimodal foundation models have showcased impressive +capabilities in understanding and reasoning with visual and textual +information. Adapting these foundation models trained for general usage to +specialized domains like biomedicine requires large-scale domain-specific +instruction datasets. While existing works have explored curating such datasets +automatically, the resultant datasets are not explicitly aligned with domain +expertise. In this work, we propose a data-centric framework, Biomedical Visual +Instruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that +incorporates clinician preferences into both stages of generating and selecting +instruction data for tuning biomedical multimodal foundation models. First, +during the generation stage, we prompt the GPT-4V generator with a diverse set +of clinician-selected demonstrations for preference-aligned data candidate +generation. Then, during the selection phase, we train a separate selection +model, which explicitly distills clinician and policy-guided model preferences +into a rating function to select high-quality data for medical instruction +tuning. Results show that the model tuned with the instruction-following data +from our method demonstrates a significant improvement in open visual chat +(18.5% relatively) and medical VQA (win rate up to 81.73%). Our +instruction-following data and models are available at BioMed-VITAL.github.io. + +
+
+
+
+
+ + ♻ ☆ AnoFPDM: Anomaly Segmentation with Forward Process of Diffusion Models + for Brain MRI + + +
+ Weakly-supervised diffusion models (DMs) in anomaly segmentation, leveraging +image-level labels, have attracted significant attention for their superior +performance compared to unsupervised methods. This approach eliminates the need for +pixel-level labels in training, offering a more cost-effective alternative to +supervised methods. However, existing methods are not fully weakly-supervised +because they heavily rely on costly pixel-level labels for hyperparameter +tuning in inference. To tackle this challenge, we introduce Anomaly +Segmentation with Forward Process of Diffusion Models (AnoFPDM), a fully +weakly-supervised framework that operates without the need for pixel-level +labels. Leveraging the unguided forward process as a reference for the guided +forward process, we select hyperparameters such as the noise scale, the +threshold for segmentation and the guidance strength. We aggregate anomaly maps +from the guided forward process, enhancing the signal strength of anomalous +regions. Remarkably, our proposed method outperforms recent state-of-the-art +weakly-supervised approaches, even without utilizing pixel-level labels. + +
+
+ comment: v2: updated introduction, experiments and supplementary material +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Prediction of Sentinel-2 multi-band imagery with attention BiLSTM for + continuous earth surface monitoring + + +
+ Continuous monitoring of crops and forecasting crop conditions through time +series analysis is crucial for effective agricultural management. This study +proposes a framework based on an attention Bidirectional Long Short-Term Memory +(BiLSTM) network for predicting multiband images. Our model can forecast target +images on user-defined dates, including future dates and periods characterized +by persistent cloud cover. By focusing on short sequences within a +sequence-to-one forecasting framework, the model leverages advanced attention +mechanisms to enhance prediction accuracy. Our experimental results demonstrate +the model's superior performance in predicting NDVI, multiple vegetation +indices, and all Sentinel-2 bands, highlighting its potential for improving +remote sensing data continuity and reliability. + +
+
+
+
+
+ + ☆ Enhancing Travel Decision-Making: A Contrastive Learning Approach for + Personalized Review Rankings in Accommodations + + +
+ User-generated reviews significantly influence consumer decisions, +particularly in the travel domain when selecting accommodations. This paper's +contribution comprises two main elements. Firstly, we present a novel dataset +of authentic guest reviews sourced from a prominent online travel platform, +totaling over two million reviews from 50,000 distinct accommodations. +Secondly, we propose an innovative approach for personalized review ranking. +Our method employs contrastive learning to intricately capture the relationship +between a review and the contextual information of its respective reviewer. +Through a comprehensive experimental study, we demonstrate that our approach +surpasses several baselines across all reported metrics. Augmented by a +comparative analysis, we showcase the efficacy of our method in elevating +personalized review ranking. The implications of our research extend beyond the +travel domain, with potential applications in other sectors where personalized +review ranking is paramount, such as e-commerce platforms. + +
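The abstract does not spell out the loss, so the sketch below uses a generic in-batch InfoNCE objective that pulls each review embedding toward its reviewer-context embedding; the temperature value and the use of in-batch negatives are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def review_context_contrastive_loss(review_emb, context_emb, temperature=0.07):
    # Normalize, then score every review against every reviewer context in the batch.
    review_emb = F.normalize(review_emb, dim=-1)
    context_emb = F.normalize(context_emb, dim=-1)
    logits = review_emb @ context_emb.t() / temperature
    # The i-th review should match the i-th reviewer context (in-batch negatives).
    labels = torch.arange(review_emb.size(0), device=review_emb.device)
    return F.cross_entropy(logits, labels)
```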
+
+
+
+
+ + ☆ Dense Retrieval with Continuous Explicit Feedback for Systematic Review + Screening Prioritisation SIGIR 2024 + + +
+ The goal of screening prioritisation in systematic reviews is to identify +relevant documents with high recall and rank them in early positions for +review. This saves reviewing effort if paired with a stopping criterion, and +speeds up review completion if performed alongside downstream tasks. Recent +studies have shown that neural models have good potential on this task, but +their time-consuming fine-tuning and inference discourage their widespread use +for screening prioritisation. In this paper, we propose an alternative approach +that still relies on neural models, but leverages dense representations and +relevance feedback to enhance screening prioritisation, without the need for +costly model fine-tuning and inference. This method exploits continuous +relevance feedback from reviewers during document screening to efficiently +update the dense query representation, which is then applied to rank the +remaining documents to be screened. We evaluate this approach across the CLEF +TAR datasets for this task. Results suggest that the investigated dense +query-driven approach is more efficient than directly using neural models and +shows promising effectiveness compared to previous methods developed on the +considered datasets. Our code is available at +https://github.com/ielab/dense-screening-feedback. + +
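The abstract does not give the exact update rule, but a Rocchio-style sketch conveys the idea of refining a dense query vector from reviewers' continuous relevance judgements and re-ranking the unscreened documents; the function names and the alpha/beta/gamma weights below are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

def update_query(query, relevant, nonrelevant, alpha=1.0, beta=0.75, gamma=0.15):
    """Shift the dense query vector toward judged-relevant documents (Rocchio-style)."""
    q = alpha * query
    if len(relevant) > 0:
        q += beta * np.mean(relevant, axis=0)
    if len(nonrelevant) > 0:
        q -= gamma * np.mean(nonrelevant, axis=0)
    return q

def rank_remaining(query, doc_embeddings):
    """Rank unscreened documents by cosine similarity to the updated query."""
    q = query / np.linalg.norm(query)
    d = doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    return np.argsort(-(d @ q))
```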
+
+ comment: Accepted at SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Efficient and Effective Unlearning of Large Language Models for + Recommendation + + +
+ The significant advancements in large language models (LLMs) give rise to a +promising research direction, i.e., leveraging LLMs as recommenders (LLMRec). +The efficacy of LLMRec arises from the open-world knowledge and reasoning +capabilities inherent in LLMs. LLMRec acquires the recommendation capabilities +through instruction tuning based on user interaction data. However, in order to +protect user privacy and optimize utility, it is also crucial for LLMRec to +intentionally forget specific user data, which is generally referred to as +recommendation unlearning. In the era of LLMs, recommendation unlearning poses +new challenges for LLMRec in terms of \textit{inefficiency} and +\textit{ineffectiveness}. Existing unlearning methods require updating billions +of parameters in LLMRec, which is costly and time-consuming. Besides, they +always impact the model utility during the unlearning process. To this end, we +propose \textbf{E2URec}, the first \underline{E}fficient and +\underline{E}ffective \underline{U}nlearning method for LLM\underline{Rec}. Our +proposed E2URec enhances the unlearning efficiency by updating only a few +additional LoRA parameters, and improves the unlearning effectiveness by +employing a teacher-student framework, where we maintain multiple teacher +networks to guide the unlearning process. Extensive experiments show that +E2URec outperforms state-of-the-art baselines on two real-world datasets. +Specifically, E2URec can efficiently forget specific data without affecting +recommendation performance. The source code is at +\url{https://github.com/justarter/E2URec}. + +
+
+ comment: Accepted by Frontier of Computer Science +
+
+
+
+
+ + ♻ ☆ LLM-Powered Explanations: Unraveling Recommendations Through Subgraph + Reasoning + + +
+ Recommender systems are pivotal in enhancing user experiences across various +web applications by analyzing the complicated relationships between users and +items. Knowledge graphs (KGs) have been widely used to enhance the performance +of recommender systems. However, KGs are known to be noisy and incomplete, +which makes it hard to provide reliable explanations for recommendation results. An +explainable recommender system is crucial for product development and +subsequent decision-making. To address these challenges, we introduce a novel +recommender that synergizes Large Language Models (LLMs) and KGs to enhance the +recommendation and provide interpretable results. Specifically, we first +harness the power of LLMs to augment KG reconstruction. LLMs comprehend and +decompose user reviews into new triples that are added into the KG. In this way, we +can enrich KGs with explainable paths that express user preferences. To enhance +the recommendation on augmented KGs, we introduce a novel subgraph reasoning +module that effectively measures the importance of nodes and discovers +reasoning paths for recommendations. Finally, these reasoning paths are fed into the +LLMs to generate interpretable explanations of the recommendation results. Our +approach significantly enhances both the effectiveness and interpretability of +recommender systems, especially in cross-selling scenarios where traditional +methods falter. The effectiveness of our approach has been rigorously tested on +four open real-world datasets, with our method demonstrating superior +performance over contemporary state-of-the-art techniques by an average +improvement of 12%. The application of our model in a multinational engineering +and technology company's cross-selling recommendation system further underscores +its practical utility and potential to redefine recommendation practices +through improved accuracy and user trust. + +
+
+
+
+
+
+
+
+ + Machine Learning 37 + +
+
+
+ + ♻ ☆ Cost-Efficient Large Language Model Serving for Multi-turn Conversations + with CachedAttention ATC + + +
+ Interacting with humans through multi-turn conversations is a fundamental +feature of large language models (LLMs). However, existing LLM serving engines +executing multi-turn conversations are inefficient due to the need to +repeatedly compute the key-value (KV) caches of historical tokens, incurring +high serving costs. To address the problem, this paper proposes +CachedAttention, a new attention mechanism that enables reuse of KV caches +across multi-turn conversations, significantly reducing the repetitive +computation overheads. CachedAttention maintains a hierarchical KV caching +system that leverages cost-effective memory/storage mediums to save KV caches +for all requests. To reduce KV cache access overheads from slow mediums, +CachedAttention employs layer-wise pre-loading and asynchronous saving schemes +to overlap the KV cache access with the GPU computation. To ensure that the KV +caches to be accessed are placed in the fastest hierarchy, CachedAttention +employs scheduler-aware fetching and eviction schemes to consciously place the +KV caches in different layers based on the hints from the inference job +scheduler. To avoid the invalidation of the saved KV caches incurred by context +window overflow, CachedAttention enables the saved KV caches to remain valid +via decoupling the positional encoding and effectively truncating the KV +caches. Extensive experimental results demonstrate that CachedAttention +significantly decreases the time to the first token (TTFT) by up to 87%, +improves the prompt prefilling throughput by up to 7.8$\times$ for multi-turn +conversations, and reduces the end-to-end inference cost by up to 70%. + +
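A toy two-tier cache conveys the reuse idea: KV caches of past turns are kept in a fast tier and spilled to a slower tier instead of being recomputed. The class below is a hypothetical sketch; the actual system additionally performs layer-wise pre-loading, scheduler-aware fetching and eviction, and positional-encoding decoupling.

```python
from collections import OrderedDict

class TwoTierKVCache:
    """Illustrative sketch only: LRU fast tier (e.g., host memory) plus a slow tier."""
    def __init__(self, fast_capacity=8):
        self.fast = OrderedDict()
        self.slow = {}
        self.fast_capacity = fast_capacity

    def put(self, conversation_id, kv_cache):
        self.fast[conversation_id] = kv_cache
        self.fast.move_to_end(conversation_id)
        while len(self.fast) > self.fast_capacity:
            victim, cache = self.fast.popitem(last=False)  # evict least recently used
            self.slow[victim] = cache

    def get(self, conversation_id):
        if conversation_id in self.fast:
            self.fast.move_to_end(conversation_id)
            return self.fast[conversation_id]
        if conversation_id in self.slow:
            kv = self.slow.pop(conversation_id)  # promote back to the fast tier
            self.put(conversation_id, kv)
            return kv
        return None  # cache miss: the serving engine must recompute the prefill
```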
+
+ comment: Accepted to USENIX Annual Technical Conference (ATC) 2024 +
+
+
+
+
+ + ♻ ☆ Simplex Clustering via sBeta with Applications to Online Adjustment of + Black-Box Predictions + + +
+ We explore clustering the softmax predictions of deep neural networks and +introduce a novel probabilistic clustering method, referred to as k-sBetas. In +the general context of clustering discrete distributions, the existing methods +focused on exploring distortion measures tailored to simplex data, such as the +KL divergence, as alternatives to the standard Euclidean distance. We provide a +general maximum a posteriori (MAP) perspective of clustering distributions, +emphasizing that the statistical models underlying the existing +distortion-based methods may not be descriptive enough. Instead, we optimize a +mixed-variable objective measuring data conformity within each cluster to the +introduced sBeta density function, whose parameters are constrained and +estimated jointly with binary assignment variables. Our versatile formulation +approximates various parametric densities for modeling simplex data and enables +the control of the cluster-balance bias. This yields highly competitive +performances for the unsupervised adjustment of black-box model predictions in +various scenarios. Our code and comparisons with the existing +simplex-clustering approaches and our introduced softmax-prediction benchmarks +are publicly available: +https://github.com/fchiaroni/Clustering_Softmax_Predictions. + +
+
+
+
+
+ + ♻ ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and +70B) demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables +aggressive quantization to sub-3 bits with only minor performance degradations. +When finetuned on a language modeling calibration dataset, LQ-LoRA can also be +used for model compression; in this setting our 2.75-bit LLaMA-2-70B model +(which has 2.85 bits on average when including the low-rank components and +requires 27GB of GPU memory) performs respectably compared to the 16-bit +baseline. + +
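The decomposition step can be pictured with a simple alternating scheme: fit a truncated-SVD low-rank factor, then quantize the residual, and repeat. The uniform quantizer below is an illustrative stand-in for the paper's integer-programming-configured (and optionally data-aware) quantization, so treat this as a sketch rather than the method itself.

```python
import numpy as np

def quantize_uniform(x, bits=3):
    # Illustrative uniform symmetric quantizer (not the paper's scheme).
    scale = np.abs(x).max() / (2 ** (bits - 1) - 1)
    if scale == 0:
        return np.zeros_like(x)
    return np.round(x / scale) * scale

def lowrank_plus_quantized(W, rank=16, bits=3, iters=10):
    Q = np.zeros_like(W)
    for _ in range(iters):
        # The low-rank factor explains what the quantized part misses ...
        U, S, Vt = np.linalg.svd(W - Q, full_matrices=False)
        L = (U[:, :rank] * S[:rank]) @ Vt[:rank]
        # ... and the quantized part absorbs the remaining residual.
        Q = quantize_uniform(W - L, bits=bits)
    return L, Q  # during finetuning, L would be updated while Q stays frozen
```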
+
+
+
+
+ + ♻ ☆ Causality Pursuit from Heterogeneous Environments via Neural Adversarial + Invariance Learning + + +
+ Pursuing causality from data is a fundamental problem in scientific +discovery, treatment intervention, and transfer learning. This paper introduces +a novel algorithmic method for addressing nonparametric invariance and +causality learning in regression models across multiple environments, where the +joint distribution of response variables and covariates varies, but the +conditional expectations of outcome given an unknown set of quasi-causal +variables are invariant. The challenge of finding such an unknown set of +quasi-causal or invariant variables is compounded by the presence of endogenous +variables that have heterogeneous effects across different environments; +including even one of them in the regression would make the estimation +inconsistent. The proposed Focused Adversarial Invariant Regularization (FAIR) +framework utilizes an innovative minimax optimization approach that breaks down +the barriers, driving regression models toward prediction-invariant solutions +through adversarial testing. Leveraging the representation power of neural +networks, FAIR neural networks (FAIR-NN) are introduced for causality pursuit. +It is shown that FAIR-NN can find the invariant variables and quasi-causal +variables under a minimal identification condition and that the resulting +procedure is adaptive to low-dimensional composition structures in a +non-asymptotic analysis. Under a structural causal model, variables identified +by FAIR-NN represent pragmatic causality and provably align with exact causal +mechanisms under conditions of sufficient heterogeneity. Computationally, +FAIR-NN employs a novel Gumbel approximation with decreased temperature and a +stochastic gradient descent-ascent algorithm. The procedures are convincingly +demonstrated using simulated and real-data examples. + +
+
+ comment: 48 pages, 7 figures with appendix +
+
+
+
+
+ + ♻ ☆ Accelerating Reinforcement Learning with Value-Conditional State Entropy + Exploration NeurIPS 2024 + + +
+ A promising technique for exploration is to maximize the entropy of visited +state distribution, i.e., state entropy, by encouraging uniform coverage of +visited state space. While it has been effective for an unsupervised setup, it +tends to struggle in a supervised setup with a task reward, where an agent +prefers to visit high-value states to exploit the task reward. Such a +preference can cause an imbalance between the distributions of high-value +states and low-value states, which biases exploration towards low-value state +regions as a result of the state entropy increasing when the distribution +becomes more uniform. This issue is exacerbated when high-value states are +narrowly distributed within the state space, making it difficult for the agent +to complete the tasks. In this paper, we present a novel exploration technique +that maximizes the value-conditional state entropy, which separately estimates +the state entropies that are conditioned on the value estimates of each state, +then maximizes their average. By only considering the visited states with +similar value estimates for computing the intrinsic bonus, our method prevents +the distribution of low-value states from affecting exploration around +high-value states, and vice versa. We demonstrate that the proposed alternative +to the state entropy baseline significantly accelerates various reinforcement +learning algorithms across a variety of tasks within MiniGrid, DeepMind Control +Suite, and Meta-World benchmarks. Source code is available at +https://sites.google.com/view/rl-vcse. + +
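One way to picture the value-conditional bonus is to bucket states by their value estimates and compute a particle-based k-NN entropy bonus only within each bucket, so that low-value and high-value regions do not distort each other's coverage estimates. The quantile binning and the log-distance bonus below are illustrative assumptions, not the paper's exact estimator.

```python
import numpy as np

def value_conditional_entropy_bonus(states, values, k=5, n_bins=4):
    """states: (N, d) array of visited states; values: (N,) array of value estimates."""
    edges = np.quantile(values, np.linspace(0, 1, n_bins + 1)[1:-1])
    bins = np.digitize(values, edges)
    bonus = np.zeros(len(states))
    for b in np.unique(bins):
        idx = np.where(bins == b)[0]
        if len(idx) <= k:
            continue
        group = states[idx]
        dists = np.linalg.norm(group[:, None, :] - group[None, :, :], axis=-1)
        kth = np.sort(dists, axis=1)[:, k]      # distance to the k-th neighbour
        bonus[idx] = np.log(kth + 1.0)          # particle-based entropy proxy
    return bonus
```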
+
+ comment: NeurIPS 2024. Project webpage: https://sites.google.com/view/rl-vcse +
+
+
+
+
+ + ♻ ☆ SEMQA: Semi-Extractive Multi-Source Question Answering NAACL 2024 + + +
+ Recently proposed long-form question answering (QA) systems, supported by +large language models (LLMs), have shown promising capabilities. Yet, +attributing and verifying their generated abstractive answers can be difficult, +and automatically evaluating their accuracy remains an ongoing challenge. + In this work, we introduce a new QA task for answering multi-answer questions +by summarizing multiple diverse sources in a semi-extractive fashion. +Specifically, Semi-extractive Multi-source QA (SEMQA) requires models to output +a comprehensive answer, while mixing factual quoted spans -- copied verbatim +from given input sources -- and non-factual free-text connectors that glue +these spans together into a single cohesive passage. This setting bridges the +gap between the outputs of well-grounded but constrained extractive QA systems +and more fluent but harder to attribute fully abstractive answers. +Particularly, it enables a new mode for language models that leverages their +advanced language generation capabilities, while also producing fine in-line +attributions by-design that are easy to verify, interpret, and evaluate. + To study this task, we create the first dataset of this kind, QuoteSum, with +human-written semi-extractive answers to natural and generated questions, and +define text-based evaluation metrics. Experimenting with several LLMs in +various settings, we find this task to be surprisingly challenging, +demonstrating the importance of QuoteSum for developing and studying such +consolidation capabilities. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Novel Node Category Detection Under Subpopulation Shift ECML-PKDD 2024 + + +
+ In real-world graph data, distribution shifts can manifest in various ways, +such as the emergence of new categories and changes in the relative proportions +of existing categories. It is often important to detect nodes of novel +categories under such distribution shifts for safety or insight discovery +purposes. We introduce a new approach, Recall-Constrained Optimization with +Selective Link Prediction (RECO-SLIP), to detect nodes belonging to novel +categories in attributed graphs under subpopulation shifts. By integrating a +recall-constrained learning framework with a sample-efficient link prediction +mechanism, RECO-SLIP addresses the dual challenges of resilience against +subpopulation shifts and the effective exploitation of graph structure. Our +extensive empirical evaluation across multiple graph datasets demonstrates the +superior performance of RECO-SLIP over existing methods. The experimental code +is available at https://github.com/hsinghuan/novel-node-category-detection. + +
+
+ comment: Accepted to ECML-PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Conformal Depression Prediction + + +
+ While existing depression prediction methods based on deep learning show +promise, their practical application is hindered by the lack of +trustworthiness, as these deep models are often deployed as \textit{black box} +models, leaving us uncertain about the confidence of the model predictions. For +high-risk clinical applications like depression prediction, uncertainty +quantification is essential in decision-making. In this paper, we introduce +conformal depression prediction (CDP), a depression prediction method with +uncertainty quantification based on conformal prediction (CP), giving valid +confidence intervals with theoretical coverage guarantees for the model +predictions. CDP is a plug-and-play module that requires neither model +retraining nor an assumption about the depression data distribution. As CDP +provides only an average coverage guarantee across all inputs rather than a +per-input coverage guarantee, we further propose CDP-ACC, an improved +conformal prediction with approximate conditional coverage. CDP-ACC first +estimates the prediction distribution through neighborhood relaxation, and then +introduces a conformal score function by constructing nested sequences, so as +to provide a tighter prediction interval for each specific input. We +empirically demonstrate the application of CDP in uncertainty-aware depression +prediction, as well as the effectiveness and superiority of CDP-ACC on the AVEC +2013 and AVEC 2014 datasets. + +
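For intuition, plain split conformal prediction is only a few lines: calibrate a residual quantile on held-out data and add or subtract it around each point prediction to obtain marginal 1 - alpha coverage. This sketch shows only the vanilla CP baseline the paper builds on, not the CDP-ACC refinement that tightens intervals per input.

```python
import numpy as np

def conformal_quantile(calibration_residuals, alpha=0.1):
    # Finite-sample-corrected quantile of absolute residuals on calibration data.
    n = len(calibration_residuals)
    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    return np.quantile(np.abs(calibration_residuals), level)

def prediction_interval(point_prediction, qhat):
    # Symmetric interval with marginal coverage at least 1 - alpha.
    return point_prediction - qhat, point_prediction + qhat
```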
+
+
+
+
+ + ♻ ☆ The Benefits of Reusing Batches for Gradient Descent in Two-Layer + Networks: Breaking the Curse of Information and Leap Exponents ICML + + +
+ We investigate the training dynamics of two-layer neural networks when +learning multi-index target functions. We focus on multi-pass gradient descent +(GD) that reuses the batches multiple times and show that it significantly +changes the conclusion about which functions are learnable compared to +single-pass gradient descent. In particular, multi-pass GD with finite stepsize +is found to overcome the limitations of gradient flow and single-pass GD given +by the information exponent (Ben Arous et al., 2021) and leap exponent (Abbe et +al., 2023) of the target function. We show that upon re-using batches, the +network achieves in just two time steps an overlap with the target subspace +even for functions not satisfying the staircase property (Abbe et al., 2021). +We characterize the (broad) class of functions efficiently learned in finite +time. The proof of our results is based on the analysis of the Dynamical +Mean-Field Theory (DMFT). We further provide a closed-form description of the +dynamical process of the low-dimensional projections of the weights, and +numerical experiments illustrating the theory. + +
+
+ comment: Accepted at the International Conference on Machine Learning (ICML), + 2024 +
+
+
+
+
+ + ♻ ☆ Braced Fourier Continuation and Regression for Anomaly Detection + + +
+ In this work, the concept of Braced Fourier Continuation and Regression +(BFCR) is introduced. BFCR is a novel and computationally efficient means of +finding nonlinear regressions or trend lines in arbitrary one-dimensional data +sets. The Braced Fourier Continuation (BFC) and BFCR algorithms are first +outlined, followed by a discussion of the properties of BFCR as well as +demonstrations of how BFCR trend lines may be used effectively for anomaly +detection both within and at the edges of arbitrary one-dimensional data sets. +Finally, potential issues which may arise while using BFCR for anomaly +detection as well as possible mitigation techniques are outlined and discussed. +All source code and example data sets are either referenced or available via +GitHub, and all associated code is written entirely in Python. + +
+
+ comment: 16 pages, 9 figures, associated Github link: + https://github.com/j4sabuda/Braced-Fourier-Continuation-and-Regression + -6/30/2024 update corrected and reworded erroneous figure references, minor + typos +
+
+
+
+
+ + ♻ ☆ Langevin dynamics based algorithm e-TH$\varepsilon$O POULA for + stochastic optimization problems with discontinuous stochastic gradient + + +
+ We introduce a new Langevin dynamics based algorithm, called +e-TH$\varepsilon$O POULA, to solve optimization problems with discontinuous +stochastic gradients which naturally appear in real-world applications such as +quantile estimation, vector quantization, CVaR minimization, and regularized +optimization problems involving ReLU neural networks. We demonstrate both +theoretically and numerically the applicability of the e-TH$\varepsilon$O POULA +algorithm. More precisely, under the conditions that the stochastic gradient is +locally Lipschitz in average and satisfies a certain convexity at infinity +condition, we establish non-asymptotic error bounds for e-TH$\varepsilon$O +POULA in Wasserstein distances and provide a non-asymptotic estimate for the +expected excess risk, which can be controlled to be arbitrarily small. Three +key applications in finance and insurance are provided, namely, multi-period +portfolio optimization, transfer learning in multi-period portfolio +optimization, and insurance claim prediction, which involve neural networks +with (Leaky)-ReLU activation functions. Numerical experiments conducted using +real-world datasets illustrate the superior empirical performance of +e-TH$\varepsilon$O POULA compared to SGLD, TUSLA, ADAM, and AMSGrad in terms of +model accuracy. + +
+
+
+
+
+ + ♻ ☆ On the convergence of nonlinear averaging dynamics with three-body + interactions on hypergraphs + + +
+ Complex networked systems in fields such as physics, biology, and social +sciences often involve interactions that extend beyond simple pairwise ones. +Hypergraphs serve as powerful modeling tools for describing and analyzing the +intricate behaviors of systems with multi-body interactions. Herein, we +investigate a discrete-time nonlinear averaging dynamics with three-body +interactions: an underlying hypergraph, comprising triples as hyperedges, +delineates the structure of these interactions, while the vertices update their +states through a weighted, state-dependent average of neighboring pairs' +states. This dynamics captures reinforcing group effects, such as peer +pressure, and exhibits higher-order dynamical effects resulting from a complex +interplay between initial states, hypergraph topology, and nonlinearity of the +update. Differently from linear averaging dynamics on graphs with two-body +interactions, this model does not converge to the average of the initial states +but rather induces a shift. By assuming random initial states and by making +some regularity and density assumptions on the hypergraph, we prove that the +dynamics converges to a multiplicatively-shifted average of the initial states, +with high probability. We further characterize the shift as a function of two +parameters describing the initial state and interaction strength, as well as +the convergence time as a function of the hypergraph structure. + +
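To make the update concrete, the toy simulation below lets each vertex move to a weighted average of the midpoints of its neighbouring pairs, with a state-dependent weight that favours pairs whose states agree; the exponential weight is purely an illustrative choice, since the paper analyses a general class of such nonlinear weightings.

```python
import numpy as np

def three_body_averaging_step(x, hyperedges, beta=1.0):
    """x: (n,) array of vertex states; hyperedges: list of index triples (i, j, k)."""
    new_x = x.copy()
    for i in range(len(x)):
        # Pairs (j, k) that share a hyperedge with vertex i.
        pairs = [(j, k) for (a, b, c) in hyperedges
                 for (v, j, k) in ((a, b, c), (b, a, c), (c, a, b)) if v == i]
        if not pairs:
            continue
        weights = np.array([np.exp(-beta * abs(x[j] - x[k])) for j, k in pairs])
        midpoints = np.array([(x[j] + x[k]) / 2.0 for j, k in pairs])
        new_x[i] = np.sum(weights * midpoints) / np.sum(weights)
    return new_x

# Example: one hyperedge {0, 1, 2} with random initial states.
x = three_body_averaging_step(np.random.rand(3), [(0, 1, 2)])
```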
+
+ comment: To appear in SIAM Journal on Applied Dynamical Systems +
+
+
+
+
+ + ♻ ☆ EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees + + +
+ Inference with modern Large Language Models (LLMs) is expensive and +time-consuming, and speculative sampling has proven to be an effective +solution. Most speculative sampling methods such as EAGLE use a static draft +tree, implicitly assuming that the acceptance rate of draft tokens depends only +on their position. Interestingly, we found that the acceptance rate of draft +tokens is also context-dependent. In this paper, building upon EAGLE, we +propose EAGLE-2, which introduces a new context-aware dynamic draft tree +technique into the drafting stage. This improvement leverages the fact that the +draft model of EAGLE is well-calibrated: the confidence scores from the draft +model approximate acceptance rates with small errors. We conducted extensive +evaluations on three series of LLMs and six tasks, with EAGLE-2 achieving +speedup ratios of 3.05x-4.26x, which is 20%-40% faster than EAGLE-1. EAGLE-2 also +ensures that the distribution of the generated text remains unchanged, making +it a lossless acceleration algorithm. + +
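The dynamic draft tree can be pictured as best-first expansion driven by the draft model's own confidence: repeatedly grow the candidate whose cumulative draft probability is highest instead of using a fixed tree shape. `draft_topk` below is a hypothetical stand-in for a draft-model call, and verification by the target model is omitted, so this is an intuition-level sketch rather than the authors' algorithm.

```python
import heapq

def grow_draft_tree(prefix_tokens, draft_topk, budget=16, k=3):
    # Heap entries: (-cumulative draft probability, tie-breaker id, token sequence).
    heap = [(-1.0, 0, list(prefix_tokens))]
    candidates, uid = [], 1
    while heap and len(candidates) < budget:
        neg_p, _, seq = heapq.heappop(heap)     # most probable unexpanded node first
        candidates.append(seq)
        for token, prob in draft_topk(seq, k):  # top-k proposals from the draft model
            heapq.heappush(heap, (neg_p * prob, uid, seq + [token]))
            uid += 1
    return candidates
```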
+
+
+
+
+ + ♻ ☆ RoboGPT: an intelligent agent of making embodied long-term decisions for + daily instruction tasks + + +
+ Robotic agents must master common sense and long-term sequential decisions to +solve daily tasks through natural language instruction. The developments in +Large Language Models (LLMs) in natural language processing have inspired +efforts to use LLMs in complex robot planning. Despite LLMs' great +generalization and comprehension of instruction tasks, LLMs-generated task +plans sometimes lack feasibility and correctness. To address the problem, we +propose a RoboGPT agent\footnote{our code and dataset will be released soon} +for making embodied long-term decisions for daily tasks, with two modules: 1) +LLMs-based planning with re-plan to break the task into multiple sub-goals; 2) +RoboSkill individually designed for sub-goals to learn better navigation and +manipulation skills. The LLMs-based planning is enhanced with a new robotic +dataset and re-plan, called RoboGPT. The new robotic dataset of 67k daily +instruction tasks is gathered for fine-tuning the Llama model and obtaining +RoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily +instruction tasks. Additionally, a low-computational Re-Plan module is designed +to allow plans to flexibly adapt to the environment, thereby addressing the +nomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA +methods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA +LLM-based planners like ChatGPT in task-planning rationality for hundreds of +unseen daily tasks, and even other domain tasks, while keeping the large +model's original broad application and generality. + +
+
+
+
+
+ + ♻ ☆ Two Trades is not Baffled: Condensing Graph via Crafting Rational + Gradient Matching + + +
+ Training on large-scale graphs has achieved remarkable results in graph +representation learning, but its cost and storage have raised growing concerns. +As one of the most promising directions, graph condensation methods address +these issues by employing gradient matching, aiming to condense the full graph +into a more concise yet information-rich synthetic set. Though encouraging, +these strategies primarily emphasize matching directions of the gradients, +which leads to deviations in the training trajectories. Such deviations are +further magnified by the differences between the condensation and evaluation +phases, culminating in accumulated errors, which detrimentally affect the +performance of the condensed graphs. In light of this, we propose a novel graph +condensation method named \textbf{C}raf\textbf{T}ing \textbf{R}ationa\textbf{L} +trajectory (\textbf{CTRL}), which offers an optimized starting point closer to +the original dataset's feature distribution and a more refined strategy for +gradient matching. Theoretically, CTRL can effectively neutralize the impact of +accumulated errors on the performance of condensed graphs. We provide extensive +experiments on various graph datasets and downstream tasks to support the +effectiveness of CTRL. Code is released at +https://github.com/NUS-HPC-AI-Lab/CTRL. + +
+
+ comment: An effective method for graph condensation +
+
+
+
+
+ + ♻ ☆ Curated LLM: Synergy of LLMs and Data Curation for tabular augmentation + in low-data regimes ICML + + +
+ Machine Learning (ML) in low-data settings remains an underappreciated yet +crucial problem. Hence, data augmentation methods to increase the sample size +of datasets needed for ML are key to unlocking the transformative potential of +ML in data-deprived regions and domains. Unfortunately, the limited training +set constrains traditional tabular synthetic data generators in their ability +to generate a large and diverse augmented dataset needed for ML tasks. To +address this challenge, we introduce CLLM, which leverages the prior knowledge +of Large Language Models (LLMs) for data augmentation in the low-data regime. +However, not all the data generated by LLMs will improve downstream utility, as +for any generative model. Consequently, we introduce a principled curation +mechanism, leveraging learning dynamics, coupled with confidence and +uncertainty metrics, to obtain a high-quality dataset. Empirically, on multiple +real-world datasets, we demonstrate the superior performance of CLLM in the +low-data regime compared to conventional generators. Additionally, we provide +insights into the LLM generation and curation mechanism, shedding light on the +features that enable them to output high-quality augmented datasets. + +
+
+ comment: Presented at the 41st International Conference on Machine Learning + (ICML) 2024. *Seedat & Huynh contributed equally +
+
+
+
+
+ + ♻ ☆ Scaffold Splits Overestimate Virtual Screening Performance + + +
+ Virtual Screening (VS) of vast compound libraries guided by Artificial +Intelligence (AI) models is a highly productive approach to early drug +discovery. Data splitting is crucial for better benchmarking of such AI models. +Traditional random data splits produce similar molecules between training and +test sets, conflicting with the reality of VS libraries which mostly contain +structurally distinct compounds. The scaffold split, which groups molecules by shared +core structure, is widely considered to reflect this real-world scenario. +However, here we show that the scaffold split also overestimates VS +performance. The reason is that molecules with different chemical scaffolds are +often similar, which introduces unrealistically high similarities between +training molecules and test molecules following a scaffold split. Our study +examined three representative AI models on 60 NCI-60 datasets, each with +approximately 30,000 to 50,000 molecules tested on a different cancer cell +line. Each dataset was split with three methods: scaffold, Butina clustering +and the more accurate Uniform Manifold Approximation and Projection (UMAP) +clustering. Across the 2100 models trained and evaluated for each algorithm and +split, performance is consistently much worse with UMAP splits, regardless of the +model. These robust results demonstrate the need for more +realistic data splits to tune, compare, and select models for VS. For the same +reason, avoiding the scaffold split is also recommended for other molecular +property prediction problems. The code to reproduce these results is available +at https://github.com/ScaffoldSplitsOverestimateVS + +
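For reference, a Bemis-Murcko scaffold split can be reproduced in a few lines with RDKit by assigning whole scaffold groups to either side of the split; this is the baseline protocol the study critiques, and the greedy largest-group-first assignment below is one common convention rather than the authors' exact code.

```python
from collections import defaultdict
from rdkit.Chem.Scaffolds import MurckoScaffold

def scaffold_split(smiles_list, test_fraction=0.2):
    groups = defaultdict(list)
    for idx, smi in enumerate(smiles_list):
        groups[MurckoScaffold.MurckoScaffoldSmiles(smiles=smi)].append(idx)
    # Keep molecules with the same scaffold on the same side of the split.
    train, test = [], []
    target_train = int(len(smiles_list) * (1 - test_fraction))
    for scaffold in sorted(groups, key=lambda s: -len(groups[s])):
        (train if len(train) < target_train else test).extend(groups[scaffold])
    return train, test
```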
+
+
+
+
+ + ♻ ☆ Interpretable Multi-task Learning with Shared Variable Embeddings + + +
+ This paper proposes a general interpretable predictive system with shared +information. The system is able to perform predictions in a multi-task setting +where distinct tasks are not bound to have the same input/output structure. +Embeddings of input and output variables in a common space are obtained, where +the input embeddings are produced through attending to a set of shared +embeddings, reused across tasks. All the embeddings are treated as model +parameters and learned. Specific restrictions on the space of shared embeddings +and the sparsity of the attention mechanism are considered. Experiments show +that the introduction of shared embeddings does not deteriorate the results +obtained from a vanilla variable embeddings method. We run a number of further +ablations. Inducing sparsity in the attention mechanism leads to both an +increase in accuracy and a significant decrease in the number of training steps +required. Shared embeddings provide a measure of interpretability in terms of +both a qualitative assessment and the ability to map specific shared embeddings +to pre-defined concepts that are not tailored to the considered model. There +seems to be a trade-off between accuracy and interpretability. The basic shared +embeddings method favors interpretability, whereas the sparse attention method +promotes accuracy. The results lead to the conclusion that variable embedding +methods may be extended with shared information to provide increased +interpretability and accuracy. + +
+
+
+
+
+ + ♻ ☆ Partially Observable Stochastic Games with Neural Perception Mechanisms + + +
+ Stochastic games are a well established model for multi-agent sequential +decision making under uncertainty. In practical applications, though, agents +often have only partial observability of their environment. Furthermore, agents +increasingly perceive their environment using data-driven approaches such as +neural networks trained on continuous data. We propose the model of +neuro-symbolic partially-observable stochastic games (NS-POSGs), a variant of +continuous-space concurrent stochastic games that explicitly incorporates +neural perception mechanisms. We focus on a one-sided setting with a +partially-informed agent using discrete, data-driven observations and another, +fully-informed agent. We present a new method, called one-sided NS-HSVI, for +approximate solution of one-sided NS-POSGs, which exploits the piecewise +constant structure of the model. Using neural network pre-image analysis to +construct finite polyhedral representations and particle-based representations +for beliefs, we implement our approach and illustrate its practical +applicability to the analysis of pedestrian-vehicle and pursuit-evasion +scenarios. + +
+
+ comment: 42 pages, 6 figures. Extended version of paper to be published in FM + 2024 +
+
+
+
+
+ + ♻ ☆ Dynamic Relative Representations for Goal-Oriented Semantic + Communications + + +
+ In future 6G wireless networks, semantic and effectiveness aspects of +communications will play a fundamental role, incorporating meaning and +relevance into transmissions. However, obstacles arise when devices employ +diverse languages, logic, or internal representations, leading to semantic +mismatches that might jeopardize understanding. In latent space communication, +this challenge manifests as misalignment within high-dimensional +representations where deep neural networks encode data. This paper presents a +novel framework for goal-oriented semantic communication, leveraging relative +representations to mitigate semantic mismatches via latent space alignment. We +propose a dynamic optimization strategy that adapts relative representations, +communication parameters, and computation resources for energy-efficient, +low-latency, goal-oriented semantic communications. Numerical results +demonstrate our methodology's effectiveness in mitigating mismatches among +devices, while optimizing energy consumption, delay, and effectiveness. + +
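The core alignment idea can be sketched in a few lines: re-express each latent vector by its cosine similarities to a shared set of anchor embeddings, which makes latents from independently trained encoders comparable. The dynamic adaptation of anchors, communication parameters, and computation resources described above is not shown; this is only the static building block.

```python
import numpy as np

def relative_representation(latents, anchors):
    """latents: (N, d) array; anchors: (A, d) array of shared anchor embeddings."""
    z = latents / (np.linalg.norm(latents, axis=-1, keepdims=True) + 1e-12)
    a = anchors / (np.linalg.norm(anchors, axis=-1, keepdims=True) + 1e-12)
    return z @ a.T   # (N, A): each latent described by its similarity to the anchors
```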
+
+
+
+
+ + ♻ ☆ Plum: Prompt Learning using Metaheuristic ACL 2024 + + +
+ Since the emergence of large language models, prompt learning has become a +popular method for optimizing and customizing these models. Special prompts, +such as Chain-of-Thought, have even revealed previously unknown reasoning +capabilities within these models. However, the progress of discovering +effective prompts has been slow, driving a desire for general prompt +optimization methods. Unfortunately, few existing prompt learning methods +satisfy the criteria of being truly "general", i.e., automatic, discrete, +black-box, gradient-free, and interpretable all at once. In this paper, we +introduce metaheuristics, a branch of discrete non-convex optimization methods +with over 100 options, as a promising approach to prompt learning. Within our +paradigm, we test six typical methods: hill climbing, simulated annealing, +genetic algorithms with/without crossover, tabu search, and harmony search, +demonstrating their effectiveness in white-box and black-box prompt learning. +Furthermore, we show that these methods can be used to discover more +human-understandable prompts that were previously unknown in both reasoning and +image generation tasks, opening the door to a cornucopia of possibilities in +prompt optimization. We release all the codes in +\url{https://github.com/research4pan/Plum}. + +
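As a flavour of the metaheuristics surveyed here, a gradient-free hill-climbing loop over discrete prompt edits looks like the sketch below; `score_fn` (e.g., dev-set accuracy) and `edit_ops` (local prompt mutations) are user-supplied assumptions, and the released library covers several richer search strategies beyond this one.

```python
import random

def hill_climb_prompt(initial_prompt, score_fn, edit_ops, steps=100):
    best, best_score = initial_prompt, score_fn(initial_prompt)
    for _ in range(steps):
        candidate = random.choice(edit_ops)(best)   # apply one random local edit
        candidate_score = score_fn(candidate)
        if candidate_score > best_score:            # greedy uphill move
            best, best_score = candidate, candidate_score
    return best, best_score
```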
+
+ comment: Published at Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a + Single High-Resolution Image SP + + +
+ The process of estimating and counting tree density using only a single +aerial or satellite image is a difficult task in the fields of photogrammetry +and remote sensing. However, it plays a crucial role in the management of +forests. The huge variety of trees in varied topography severely hinders tree +counting models from performing well. The purpose of this paper is to propose a +framework that is learnt from the source domain with sufficient labeled trees +and is adapted to the target domain with only a limited number of labeled +trees. Our method, termed AdaTreeFormer, contains one shared encoder with a +hierarchical feature extraction scheme to extract robust features from the +source and target domains. It also consists of three subnets: two for +extracting self-domain attention maps from source and target domains +respectively and one for extracting cross-domain attention maps. For the +latter, an attention-to-adapt mechanism is introduced to distill relevant +information from different domains while generating tree density maps; a +hierarchical cross-domain feature alignment scheme is proposed that +progressively aligns the features from the source and target domains. We also +incorporate adversarial learning into the framework to further reduce the gap between +source and target domains. Our AdaTreeFormer is evaluated on six designed +domain adaptation tasks using three tree counting datasets, i.e., Jiangsu, +Yosemite, and London. Experimental results show that AdaTreeFormer +significantly surpasses the state of the art, e.g., in the cross-domain setting from the +Yosemite to the Jiangsu dataset, it achieves a reduction of 15.9 points in the +absolute counting error and an increase of 10.8\% in the accuracy of the +detected trees' locations. The codes and datasets are available at +https://github.com/HAAClassic/AdaTreeFormer. + +
+
+ comment: Accepted in ISPRS Journal of Photogrammetry and Remote Sensing +
+
+
+
+
+ + ♻ ☆ Reconciling Spatial and Temporal Abstractions for Goal Representation + + +
+ Goal representation affects the performance of Hierarchical Reinforcement +Learning (HRL) algorithms by decomposing the complex learning problem into +easier subtasks. Recent studies show that representations that preserve +temporally abstract environment dynamics are successful in solving difficult +problems and provide theoretical guarantees for optimality. These methods, +however, cannot scale to tasks where the environment dynamics increase in complexity, +i.e., where the temporally abstract transition relations depend on a larger number of +variables. On the other hand, other efforts have tried to use spatial +abstraction to mitigate the previous issues. Their limitations include poor +scalability to high-dimensional environments and dependency on prior knowledge. + In this paper, we propose a novel three-layer HRL algorithm that introduces, +at different levels of the hierarchy, both a spatial and a temporal goal +abstraction. We provide a theoretical study of the regret bounds of the learned +policies. We evaluate the approach on complex continuous control tasks, +demonstrating the effectiveness of spatial and temporal abstractions learned by +this approach. The open-source code is available at https://github.com/cosynus-lix/STAR. + +
+
+
+
+
+ + ♻ ☆ Machine Learning for Synthetic Data Generation: A Review + + +
+ Machine learning heavily relies on data, but real-world applications often +encounter various data-related issues. These include data of poor quality, +insufficient data points leading to under-fitting of machine learning models, +and difficulties in data access due to concerns surrounding privacy, safety, +and regulations. In light of these challenges, the concept of synthetic data +generation emerges as a promising alternative that allows for data sharing and +utilization in ways that real-world data cannot facilitate. This paper presents +a comprehensive systematic review of existing studies that employ machine +learning models for the purpose of generating synthetic data. The review +encompasses various perspectives, starting with the applications of synthetic +data generation, spanning computer vision, speech, natural language processing, +healthcare, and business domains. Additionally, it explores different machine +learning methods, with particular emphasis on neural network architectures and +deep generative models. The paper also addresses the crucial aspects of privacy +and fairness concerns related to synthetic data generation. Furthermore, this +study identifies the challenges and opportunities prevalent in this emerging +field, shedding light on the potential avenues for future research. By delving +into the intricacies of synthetic data generation, this paper aims to +contribute to the advancement of knowledge and inspire further exploration in +synthetic data generation. + +
+
+
+
+
+ + ♻ ☆ Heterophily-Aware Graph Attention Network + + +
+ Graph Neural Networks (GNNs) have shown remarkable success in graph +representation learning. Unfortunately, current weight assignment schemes in +standard GNNs, such as the calculation based on node degrees or pair-wise +representations, can hardly be effective in processing the networks with +heterophily, in which the connected nodes usually possess different labels or +features. Existing heterophilic GNNs tend to ignore the modeling of heterophily +of each edge, which is also a vital part in tackling the heterophily problem. +In this paper, we firstly propose a heterophily-aware attention scheme and +reveal the benefits of modeling the edge heterophily, i.e., if a GNN assigns +different weights to edges according to different heterophilic types, it can +learn effective local attention patterns, which enable nodes to acquire +appropriate information from distinct neighbors. Then, we propose a novel +Heterophily-Aware Graph Attention Network (HA-GAT) by fully exploring and +utilizing the local distribution as the underlying heterophily, to handle the +networks with different homophily ratios. To demonstrate the effectiveness of +the proposed HA-GAT, we analyze the proposed heterophily-aware attention scheme +and local distribution exploration, by seeking for an interpretation from their +mechanism. Extensive results demonstrate that our HA-GAT achieves +state-of-the-art performances on eight datasets with different homophily ratios +in both the supervised and semi-supervised node classification tasks. + +
+
+ comment: Accepted by Pattern Recognition +
+
+
+
+
+ + ♻ ☆ AB-Training: A Communication-Efficient Approach for Distributed Low-Rank + Learning + + +
+ Communication bottlenecks severely hinder the scalability of distributed +neural network training, particularly in high-performance computing (HPC) +environments. We introduce AB-training, a novel data-parallel method that +leverages low-rank representations and independent training groups to +significantly reduce communication overhead. Our experiments demonstrate an +average reduction in network traffic of approximately 70.31\% across various +scaling scenarios, increasing the training potential of +communication-constrained systems and accelerating convergence at scale. +AB-training also exhibits a pronounced regularization effect at smaller scales, +leading to improved generalization while maintaining or even reducing training +time. We achieve a remarkable 44.14 : 1 compression ratio on VGG16 trained on +CIFAR-10 with minimal accuracy loss, and outperform traditional data parallel +training by 1.55\% on ResNet-50 trained on ImageNet-2012. While AB-training is +promising, our findings also reveal that large batch effects persist even in +low-rank regimes, underscoring the need for further research into optimized +update mechanisms for massively distributed training. + +
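A minimal sketch of the low-rank, grouped data-parallel idea, not the paper's exact AB-training procedure: each group trains its own small factors A and B of a layer's weight W = A·B on its shard of the data, and only these factors are averaged at synchronization points, which is far cheaper to communicate than dense gradients. Layer sizes, rank, learning rate, and sync frequency below are illustrative assumptions.

```python
# Sketch under stated assumptions: grouped training of low-rank factors with
# occasional averaging of only the factors (cheap to communicate).
import torch

d_out, d_in, rank, n_groups = 256, 512, 8, 4

def make_factors():
    A = (0.01 * torch.randn(d_out, rank)).requires_grad_()
    B = (0.01 * torch.randn(rank, d_in)).requires_grad_()
    return A, B

groups = [make_factors() for _ in range(n_groups)]   # one copy per training group

def local_step(A, B, x, y, lr=1e-2):
    # One SGD step on this group's shard; W = A @ B is only materialized locally.
    pred = x @ (A @ B).T
    loss = torch.nn.functional.mse_loss(pred, y)
    loss.backward()
    with torch.no_grad():
        A -= lr * A.grad
        B -= lr * B.grad
        A.grad = None
        B.grad = None

for step in range(20):
    for A, B in groups:                              # each group sees different data
        x, y = torch.randn(32, d_in), torch.randn(32, d_out)
        local_step(A, B, x, y)
    if step % 5 == 4:                                # sync: average only A and B
        with torch.no_grad():
            A_mean = torch.stack([A for A, _ in groups]).mean(0)
            B_mean = torch.stack([B for _, B in groups]).mean(0)
            for A, B in groups:
                A.copy_(A_mean)
                B.copy_(B_mean)

print(f"values per sync: {rank * (d_out + d_in)} vs {d_out * d_in} for a dense layer")
```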
+
+
+
+
+ + ♻ ☆ Segment Anything Model for automated image data annotation: empirical + studies using text prompts from Grounding DINO + + +
+ Grounding DINO and the Segment Anything Model (SAM) have achieved impressive
+performance in zero-shot object detection and image segmentation,
+respectively. Together, they have great potential to revolutionize
+applications in zero-shot semantic segmentation and data annotation. Yet, in
+specialized domains like medical image segmentation, objects of interest
+(e.g., organs, tissues, and tumors) may not fall within existing class names.
+To address this problem, the referring expression comprehension (REC) ability
+of Grounding DINO is leveraged to detect arbitrary targets by their language
+descriptions. However, recent studies have highlighted a severe limitation of
+the REC framework in this application setting owing to its tendency to make
+false positive predictions when the target is absent from the given image.
+While this bottleneck is central to the prospect of open-set semantic
+segmentation, it is still largely unknown how much improvement can be achieved
+by studying the prediction errors. To this end, we perform empirical studies
+on six publicly available datasets across different domains and reveal that
+these errors consistently follow a predictable pattern and can, thus, be
+mitigated by a simple strategy. Specifically, we show that false positive
+detections with appreciable confidence scores generally occupy large image
+areas and can usually be filtered by their relative sizes. More importantly,
+we expect these observations to inspire future research in improving REC-based
+detection and automated segmentation. Meanwhile, we evaluate the performance
+of SAM on multiple datasets from various specialized domains and report
+significant improvements in segmentation performance and annotation time
+savings over manual approaches.
+
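A sketch of the size-based filtering strategy described above; the threshold is illustrative and not a value reported in the paper: detections whose bounding box covers an implausibly large fraction of the image are dropped before masks are requested from SAM.

```python
# Sketch of relative-size filtering for likely false positives (threshold is an
# illustrative assumption, not the paper's value).
def filter_by_relative_size(boxes, image_w, image_h, max_rel_area=0.8):
    """boxes: list of (x1, y1, x2, y2) in pixels; returns the boxes kept."""
    kept = []
    for x1, y1, x2, y2 in boxes:
        rel_area = ((x2 - x1) * (y2 - y1)) / float(image_w * image_h)
        if rel_area <= max_rel_area:          # plausible object size -> keep
            kept.append((x1, y1, x2, y2))
    return kept

# Example: the near-image-sized box is treated as a likely false positive.
print(filter_by_relative_size([(10, 10, 630, 470), (200, 150, 320, 280)], 640, 480))
```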
+
+
+
+
+ + ♻ ☆ Verifying the Generalization of Deep Learning to Out-of-Distribution + Domains + + +
+ Deep neural networks (DNNs) play a crucial role in the field of machine +learning, demonstrating state-of-the-art performance across various application +domains. However, despite their success, DNN-based models may occasionally +exhibit challenges with generalization, i.e., may fail to handle inputs that +were not encountered during training. This limitation is a significant +challenge when it comes to deploying deep learning for safety-critical tasks, +as well as in real-world settings characterized by substantial variability. We +introduce a novel approach for harnessing DNN verification technology to +identify DNN-driven decision rules that exhibit robust generalization to +previously unencountered input domains. Our method assesses generalization +within an input domain by measuring the level of agreement between +independently trained deep neural networks for inputs in this domain. We also +efficiently realize our approach by using off-the-shelf DNN verification +engines, and extensively evaluate it on both supervised and unsupervised DNN +benchmarks, including a deep reinforcement learning (DRL) system for Internet +congestion control -- demonstrating the applicability of our approach for +real-world settings. Moreover, our research introduces a fresh objective for +formal verification, offering the prospect of mitigating the challenges linked +to deploying DNN-driven systems in real-world scenarios. + +
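An illustrative stand-in for the agreement criterion: the paper bounds disagreement over a whole input domain with off-the-shelf DNN verification engines, whereas the sketch below merely estimates pairwise agreement between independently trained classifiers by sampling points from the domain. The toy threshold "models" are assumptions made for the example.

```python
# Empirical stand-in for the agreement-based generalization score (the paper uses
# formal verification over the domain rather than sampling).
import itertools
import numpy as np

def agreement_score(models, sample_inputs):
    """models: callables mapping a batch of inputs to predicted labels."""
    preds = [m(sample_inputs) for m in models]
    pair_scores = [np.mean(a == b) for a, b in itertools.combinations(preds, 2)]
    return float(np.mean(pair_scores))       # high agreement -> more trust in the domain

# Toy example: three "models" implementing nearly identical 1-D threshold rules.
rng = np.random.default_rng(0)
xs = rng.uniform(-1, 1, size=1000)
models = [lambda x, t=t: (x > t).astype(int) for t in (0.0, 0.02, -0.03)]
print("estimated agreement:", agreement_score(models, xs))
```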
+
+ comment: To appear in the Journal of Automated Reasoning (JAR), 2024. This is + an extended version of a CAV 2023 paper, titled: "Verifying Generalization in + Deep Learning" +
+
+
+
+
+ + ♻ ☆ Time Series Diffusion Method: A Denoising Diffusion Probabilistic Model + for Vibration Signal Generation + + +
+ Diffusion models have demonstrated powerful data generation capabilities in
+various research fields such as image generation. However, in the field of
+vibration signal generation, the criteria for evaluating the quality of the
+generated signal differ fundamentally from those of image generation. At
+present, there is no research on the ability of diffusion models to generate
+vibration signals. In this paper, a Time Series Diffusion Method (TSDM) is
+proposed for vibration signal generation, leveraging the foundational
+principles of diffusion models. TSDM uses an improved U-net architecture with
+attention blocks, ResBlocks, and time embeddings to effectively segment and
+extract features from one-dimensional time-series data. It operates through
+forward diffusion and reverse denoising processes for time-series generation.
+Experimental validation is conducted using single-frequency and
+multi-frequency datasets as well as bearing fault datasets. The results show
+that TSDM can accurately generate the single-frequency and multi-frequency
+features in the time series and retain the basic frequency features in the
+diffusion generation results for the bearing fault series. It is also found
+that the original DDPM cannot generate high-quality vibration signals, whereas
+the improved U-net in TSDM, which combines attention blocks and ResBlocks,
+effectively improves the quality of vibration signal generation. Finally, TSDM
+is applied to small-sample fault diagnosis on three public bearing fault
+datasets, and the results show that the fault diagnosis accuracy on the three
+datasets is improved by up to 32.380%, 18.355%, and 9.298%, respectively.
+
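For readers unfamiliar with the forward process being inverted, the sketch below applies the standard DDPM closed-form noising step, x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, to a synthetic two-tone vibration-like signal; the noise schedule and signal are generic assumptions rather than the paper's configuration, and TSDM's reverse network would be trained to predict eps from (x_t, t).

```python
# Generic DDPM forward-noising sketch on a 1-D signal (schedule and signal are
# illustrative assumptions, not the paper's setup).
import numpy as np

T = 1000
betas = np.linspace(1e-4, 0.02, T)            # linear noise schedule
alpha_bar = np.cumprod(1.0 - betas)

def forward_noise(x0, t, rng):
    eps = rng.standard_normal(x0.shape)
    xt = np.sqrt(alpha_bar[t]) * x0 + np.sqrt(1.0 - alpha_bar[t]) * eps
    return xt, eps                            # eps is the denoiser's training target

rng = np.random.default_rng(0)
time = np.linspace(0, 1, 2048)
x0 = np.sin(2 * np.pi * 50 * time) + 0.5 * np.sin(2 * np.pi * 120 * time)  # two-tone signal
xt, eps = forward_noise(x0, t=500, rng=rng)
print("cumulative signal scaling alpha_bar at t=500:", alpha_bar[500])
```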
+
+
+
+
+ + ♻ ☆ Topology-aware Embedding Memory for Continual Learning on Expanding + Networks KDD 2024 + + +
+ Memory replay based techniques have shown great success for continual +learning with incrementally accumulated Euclidean data. Directly applying them +to continually expanding networks, however, leads to the potential memory +explosion problem due to the need to buffer representative nodes and their +associated topological neighborhood structures. To this end, we systematically +analyze the key challenges in the memory explosion problem, and present a +general framework, \textit{i.e.}, Parameter Decoupled Graph Neural Networks +(PDGNNs) with Topology-aware Embedding Memory (TEM), to tackle this issue. The +proposed framework not only reduces the memory space complexity from +$\mathcal{O}(nd^L)$ to $\mathcal{O}(n)$~\footnote{$n$: memory budget, $d$: +average node degree, $L$: the radius of the GNN receptive field}, but also +fully utilizes the topological information for memory replay. Specifically, +PDGNNs decouple trainable parameters from the computation ego-subnetwork via +\textit{Topology-aware Embeddings} (TEs), which compress ego-subnetworks into +compact vectors (\textit{i.e.}, TEs) to reduce the memory consumption. Based on +this framework, we discover a unique \textit{pseudo-training effect} in +continual learning on expanding networks and this effect motivates us to +develop a novel \textit{coverage maximization sampling} strategy that can +enhance the performance with a tight memory budget. Thorough empirical studies +demonstrate that, by tackling the memory explosion problem and incorporating +topological information into memory replay, PDGNNs with TEM significantly +outperform state-of-the-art techniques, especially in the challenging +class-incremental setting. + +
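A hedged sketch of the general idea, not the exact PDGNN/TEM construction: each buffered node's L-hop ego-subnetwork is summarized as one vector by parameter-free propagation (an SGC-style normalized-adjacency power is used here as a stand-in), so the replay memory stores a single embedding per node instead of the node plus its topological neighborhood.

```python
# Stand-in for a topology-aware embedding: compress a node's L-hop ego-subnetwork
# into one vector via parameter-free propagation, then buffer only that vector.
import numpy as np

def topology_aware_embeddings(adj, feats, L=2):
    n = adj.shape[0]
    a_hat = adj + np.eye(n)                          # add self-loops
    deg_inv_sqrt = 1.0 / np.sqrt(a_hat.sum(1))
    a_norm = a_hat * deg_inv_sqrt[:, None] * deg_inv_sqrt[None, :]
    h = feats
    for _ in range(L):                               # L rounds of propagation
        h = a_norm @ h
    return h                                         # one compact vector per node

adj = np.array([[0, 1, 1, 0],
                [1, 0, 0, 1],
                [1, 0, 0, 0],
                [0, 1, 0, 0]], dtype=float)
feats = np.random.default_rng(0).normal(size=(4, 8))
memory = {0: topology_aware_embeddings(adj, feats, L=2)[0]}   # buffer node 0's embedding
print(memory[0].shape)                                        # (8,) instead of a subgraph
```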
+
+ comment: This paper has been accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ Uncertainty-Aware Reward-Free Exploration with General Function + Approximation ICML 2024 + + +
+ Mastering multiple tasks through exploration and learning in an environment
+poses a significant challenge in reinforcement learning (RL). Unsupervised RL
+has been introduced to address this challenge by training policies with
+intrinsic rewards rather than extrinsic rewards. However, current intrinsic
+reward designs and unsupervised RL algorithms often overlook the heterogeneous
+nature of collected samples, thereby diminishing their sample efficiency. To
+overcome this limitation, in this paper, we propose a reward-free RL algorithm
+called GFA-RFE. The key idea behind our algorithm is an uncertainty-aware
+intrinsic reward for exploring the environment and an uncertainty-weighted
+learning process to handle heterogeneous uncertainty in different samples.
+Theoretically, we show that in order to find an $\epsilon$-optimal policy,
+GFA-RFE needs to collect $\tilde{O}(H^2 \log N_{\mathcal F}(\epsilon)
+\mathrm{dim}(\mathcal F) / \epsilon^2)$ episodes, where $\mathcal F$ is the
+value function class with covering number $N_{\mathcal F}(\epsilon)$ and
+generalized eluder dimension $\mathrm{dim}(\mathcal F)$. This result improves
+upon the bounds of all existing reward-free RL algorithms. We further
+implement and evaluate GFA-RFE across various domains and tasks in the
+DeepMind Control Suite. Experimental results show that GFA-RFE outperforms or
+matches the performance of state-of-the-art unsupervised RL algorithms.
+
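A generic illustration of uncertainty-weighted learning, not GFA-RFE's actual update rule: per-sample regression losses are down-weighted by estimated uncertainty, so heterogeneous samples do not contribute equally. The weighting scheme and values below are assumptions made for the example.

```python
# Generic sketch (not GFA-RFE's update): down-weight high-uncertainty samples when
# fitting a value function.
import torch

def uncertainty_weighted_loss(pred, target, uncertainty, eps=1e-3):
    # Smaller uncertainty -> larger weight; weights are normalized to sum to 1.
    weights = 1.0 / (uncertainty + eps)
    weights = weights / weights.sum()
    return (weights * (pred - target) ** 2).sum()

pred = torch.tensor([1.0, 0.2, -0.5])
target = torch.tensor([0.9, 0.0, 0.0])
uncertainty = torch.tensor([0.05, 0.05, 2.0])   # last transition is poorly estimated
print(uncertainty_weighted_loss(pred, target, uncertainty))
```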
+
+ comment: 32 pages, 5 figures, 4 tables, accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ Forecasting the Forced van der Pol Equation with Frequent Phase Shifts + Using Reservoir Computing + + +
+ We tested the performance of reservoir computing (RC) in predicting the
+dynamics of a certain non-autonomous dynamical system. Specifically, we
+considered a van der Pol oscillator subjected to a periodic external force
+with frequent phase shifts. The reservoir computer, which was trained and
+optimized with simulation data generated for a particular phase shift, was
+designed to predict the oscillation dynamics under periodic external forces
+with different phase shifts. The results suggest that if the training data
+have some complexity, it is possible to quantitatively predict the oscillation
+dynamics under different phase shifts. The setting of this study was motivated
+by the problem of predicting the state of the circadian rhythm of shift
+workers and designing a better shift work schedule for each individual. Our
+results suggest that RC could be exploited for such applications.
+
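A minimal sketch under stated assumptions (oscillator parameters, reservoir size, and the ridge readout are illustrative, not the paper's setup): integrate a forced van der Pol oscillator with a phase-shifted drive, then train an echo-state-style reservoir readout to predict the next state from the current reservoir state.

```python
# Illustrative pipeline: forced van der Pol trajectory -> echo-state reservoir ->
# ridge-regression readout for one-step-ahead prediction.
import numpy as np

def forced_vdp(T=4000, dt=0.01, mu=1.0, amp=0.5, omega=1.1, phase=0.0):
    x, v = 1.0, 0.0
    traj = np.zeros((T, 2))
    for t in range(T):
        drive = amp * np.sin(omega * t * dt + phase)
        x, v = x + dt * v, v + dt * (mu * (1 - x * x) * v - x + drive)
        traj[t] = (x, v)
    return traj

rng = np.random.default_rng(0)
data = forced_vdp(phase=0.3)

# Reservoir with a fixed random recurrent matrix scaled to spectral radius < 1.
n_res = 200
w_in = 0.5 * rng.normal(size=(n_res, 2))
w = rng.normal(size=(n_res, n_res))
w *= 0.9 / np.max(np.abs(np.linalg.eigvals(w)))

states = np.zeros((len(data), n_res))
r = np.zeros(n_res)
for t in range(len(data) - 1):
    r = np.tanh(w_in @ data[t] + w @ r)
    states[t + 1] = r                      # state after seeing data[t]

# Ridge-regression readout: reservoir state -> current (x, v), i.e. one step ahead.
washout = 200
X, Y, lam = states[washout:], data[washout:], 1e-6
w_out = np.linalg.solve(X.T @ X + lam * np.eye(n_res), X.T @ Y)
pred = X @ w_out
print("one-step prediction RMSE:", np.sqrt(np.mean((pred - Y) ** 2)))
```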
+
+
+
+
+ + ♻ ☆ Continual Learning of Large Language Models: A Comprehensive Survey + + +
+ The recent success of large language models (LLMs) trained on static, +pre-collected, general datasets has sparked numerous research directions and +applications. One such direction addresses the non-trivial challenge of +integrating pre-trained LLMs into dynamic data distributions, task structures, +and user preferences. Pre-trained LLMs, when tailored for specific needs, often +experience significant performance degradation in previous knowledge domains -- +a phenomenon known as "catastrophic forgetting". While extensively studied in +the continual learning (CL) community, it presents new manifestations in the +realm of LLMs. In this survey, we provide a comprehensive overview of the +current research progress on LLMs within the context of CL. This survey is +structured into four main sections: we first describe an overview of +continually learning LLMs, consisting of two directions of continuity: vertical +continuity (or vertical continual learning), i.e., continual adaptation from +general to specific capabilities, and horizontal continuity (or horizontal +continual learning), i.e., continual adaptation across time and domains +(Section 3). We then summarize three stages of learning LLMs in the context of +modern CL: Continual Pre-Training (CPT), Domain-Adaptive Pre-training (DAP), +and Continual Fine-Tuning (CFT) (Section 4). Then we provide an overview of +evaluation protocols for continual learning with LLMs, along with the current +available data sources (Section 5). Finally, we discuss intriguing questions +pertaining to continual learning for LLMs (Section 6). The full list of papers +examined in this survey is available at +https://github.com/Wang-ML-Lab/llm-continual-learning-survey. + +
+
+ comment: 47 pages, 2 figures, 4 tables. Work in progress +
+
+
+
+
+ + ♻ ☆ Compact Proofs of Model Performance via Mechanistic Interpretability ICML + + +
+ In this work, we propose using mechanistic interpretability -- techniques for +reverse engineering model weights into human-interpretable algorithms -- to +derive and compactly prove formal guarantees on model performance. We prototype +this approach by formally proving lower bounds on the accuracy of 151 small +transformers trained on a Max-of-$K$ task. We create 102 different +computer-assisted proof strategies and assess their length and tightness of +bound on each of our models. Using quantitative metrics, we find that shorter +proofs seem to require and provide more mechanistic understanding. Moreover, we +find that more faithful mechanistic understanding leads to tighter performance +bounds. We confirm these connections by qualitatively examining a subset of our +proofs. Finally, we identify compounding structureless noise as a key challenge +for using mechanistic interpretability to generate compact proofs on model +performance. + +
+
+ comment: accepted to 2024 ICML MI Workshop (Spotlight) +
+
+
+
+
+ + ♻ ☆ Counterfactual Fairness through Transforming Data Orthogonal to Bias + + +
+ Machine learning models have shown exceptional prowess in solving complex +issues across various domains. However, these models can sometimes exhibit +biased decision-making, resulting in unequal treatment of different groups. +Despite substantial research on counterfactual fairness, methods to reduce the +impact of multivariate and continuous sensitive variables on decision-making +outcomes are still underdeveloped. We propose a novel data pre-processing +algorithm, Orthogonal to Bias (OB), which is designed to eliminate the +influence of a group of continuous sensitive variables, thus promoting +counterfactual fairness in machine learning applications. Our approach, based +on the assumption of a jointly normal distribution within a structural causal +model (SCM), demonstrates that counterfactual fairness can be achieved by +ensuring the data is orthogonal to the observed sensitive variables. The OB +algorithm is model-agnostic, making it applicable to a wide range of machine +learning models and tasks. Additionally, it includes a sparse variant to +improve numerical stability through regularization. Empirical evaluations on +both simulated and real-world datasets, encompassing settings with both +discrete and continuous sensitive variables, show that our methodology +effectively promotes fairer outcomes without compromising accuracy. + +
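The core linear-algebra step in miniature; the actual OB algorithm adds structural-causal-model assumptions and a sparse, regularized variant: the feature matrix is made orthogonal to a set of continuous sensitive variables by removing each feature's least-squares projection onto them.

```python
# Sketch of residualizing features against continuous sensitive variables so the
# transformed data is orthogonal to them (a simplified version of the OB idea).
import numpy as np

def orthogonalize(X, S):
    """X: (n, d) features, S: (n, k) sensitive variables."""
    S = S - S.mean(0)
    X = X - X.mean(0)
    coef, *_ = np.linalg.lstsq(S, X, rcond=None)   # regress X on S
    return X - S @ coef                            # residuals are orthogonal to S

rng = np.random.default_rng(0)
S = rng.normal(size=(500, 2))
X = S @ rng.normal(size=(2, 5)) + 0.1 * rng.normal(size=(500, 5))  # X leaks S
X_fair = orthogonalize(X, S)
print("max |corr(X_fair, S)|:", np.abs(np.corrcoef(X_fair.T, S.T)[:5, 5:]).max())
```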
+
+
+
+
+ + ♻ ☆ Biomedical Visual Instruction Tuning with Clinician Preference Alignment + + +
+ Recent advancements in multimodal foundation models have showcased impressive +capabilities in understanding and reasoning with visual and textual +information. Adapting these foundation models trained for general usage to +specialized domains like biomedicine requires large-scale domain-specific +instruction datasets. While existing works have explored curating such datasets +automatically, the resultant datasets are not explicitly aligned with domain +expertise. In this work, we propose a data-centric framework, Biomedical Visual +Instruction Tuning with Clinician Preference Alignment (BioMed-VITAL), that +incorporates clinician preferences into both stages of generating and selecting +instruction data for tuning biomedical multimodal foundation models. First, +during the generation stage, we prompt the GPT-4V generator with a diverse set +of clinician-selected demonstrations for preference-aligned data candidate +generation. Then, during the selection phase, we train a separate selection +model, which explicitly distills clinician and policy-guided model preferences +into a rating function to select high-quality data for medical instruction +tuning. Results show that the model tuned with the instruction-following data +from our method demonstrates a significant improvement in open visual chat +(18.5% relatively) and medical VQA (win rate up to 81.73%). Our +instruction-following data and models are available at BioMed-VITAL.github.io. + +
+
+
+
+
+ + ♻ ☆ A Mechanistic Analysis of a Transformer Trained on a Symbolic Multi-Step + Reasoning Task + + +
+ Transformers demonstrate impressive performance on a range of reasoning
+benchmarks. To evaluate the degree to which these abilities are a result of
+actual reasoning, existing work has focused on developing sophisticated
+benchmarks for behavioral studies. However, these studies do not provide
+insights into the internal mechanisms driving the observed capabilities. To
+improve our understanding of the internal mechanisms of transformers, we
+present a comprehensive mechanistic analysis of a transformer trained on a
+synthetic reasoning task. We identify a set of interpretable mechanisms the
+model uses to solve the task, and validate our findings using correlational
+and causal evidence. Our results suggest that the model implements a
+depth-bounded recurrent mechanism that operates in parallel and stores
+intermediate results in selected token positions. We anticipate that the
+motifs we identified in our synthetic setting can provide valuable insights
+into the broader operating principles of transformers and thus provide a basis
+for understanding more complex models.
+
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Revisiting Vision-Language Features Adaptation and Inconsistency for + Social Media Popularity Prediction + + +
+ Social media popularity (SMP) prediction is a complex task involving +multi-modal data integration. While pre-trained vision-language models (VLMs) +like CLIP have been widely adopted for this task, their effectiveness in +capturing the unique characteristics of social media content remains +unexplored. This paper critically examines the applicability of CLIP-based +features in SMP prediction, focusing on the overlooked phenomenon of semantic +inconsistency between images and text in social media posts. Through extensive +analysis, we demonstrate that this inconsistency increases with post +popularity, challenging the conventional use of VLM features. We provide a +comprehensive investigation of semantic inconsistency across different +popularity intervals and analyze the impact of VLM feature adaptation on SMP +tasks. Our experiments reveal that incorporating inconsistency measures and +adapted text features significantly improves model performance, achieving an +SRC of 0.729 and an MAE of 1.227. These findings not only enhance SMP +prediction accuracy but also provide crucial insights for developing more +targeted approaches in social media analysis. + +
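One plausible way to operationalize the inconsistency signal, which may differ from the paper's exact feature construction: compute one minus the cosine similarity between image and caption embeddings from a pre-trained vision-language model and append it to the feature vector fed to the popularity regressor. The random embeddings below are placeholders standing in for whichever VLM encoder is used.

```python
# Sketch of an image-text inconsistency feature for popularity regression
# (feature construction is an assumption, not the authors' exact pipeline).
import numpy as np

def inconsistency(img_emb: np.ndarray, txt_emb: np.ndarray) -> float:
    img = img_emb / np.linalg.norm(img_emb)
    txt = txt_emb / np.linalg.norm(txt_emb)
    return 1.0 - float(img @ txt)              # 0 = aligned, up to 2 = opposed

def build_features(img_emb, txt_emb, meta_features):
    # Concatenate modality embeddings, metadata, and the scalar inconsistency measure.
    return np.concatenate([img_emb, txt_emb, meta_features,
                           [inconsistency(img_emb, txt_emb)]])

rng = np.random.default_rng(0)
img_emb, txt_emb = rng.normal(size=512), rng.normal(size=512)
features = build_features(img_emb, txt_emb, meta_features=np.array([3.0, 120.0]))
print(features.shape)     # (1027,) -> input to a gradient-boosting or MLP regressor
```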
+
+ comment: Submission of the 7th Social Media Prediction Challenge +
+
+
+
+
+ + ♻ ☆ AIM: Let Any Multi-modal Large Language Models Embrace Efficient + In-Context Learning + + +
+ In-context learning (ICL) enables Large Language Models (LLMs) to exhibit
+emergent abilities on downstream tasks without updating billions of
+parameters. However, in the area of multi-modal Large Language Models (MLLMs),
+two problems hinder the application of multi-modal ICL: (1) most mainstream
+MLLMs are trained only on single-image datasets, making them unable to read
+multi-modal demonstrations, and (2) as the number of demonstrations grows,
+thousands of visual tokens strain hardware and degrade ICL performance. During
+preliminary explorations, we discovered that the inner LLM tends to focus more
+on the linguistic modality within multi-modal demonstrations when generating
+responses. Therefore, we propose a general and lightweight framework
+\textbf{AIM} to tackle these problems by \textbf{A}ggregating \textbf{I}mage
+information of \textbf{M}ultimodal demonstrations into the dense latent space
+of the corresponding linguistic part. Specifically, AIM first uses the frozen
+backbone MLLM to read each image-text demonstration and extracts the vector
+representations on top of the text. These vectors naturally fuse the
+information of the image-text pair, and AIM transforms them, via a trainable
+projection layer, into fused virtual tokens acceptable to the inner LLM.
+Ultimately, these fused tokens function as variants of multi-modal
+demonstrations and are fed into the MLLM to direct its response to the current
+query as usual. Because these fused tokens stem from the textual component of
+the image-text pair, a multi-modal demonstration is nearly reduced to a pure
+textual demonstration, and thus applies seamlessly to any MLLM. With the
+backbone MLLM kept frozen, AIM is parameter-efficient, and we train it on
+public multi-modal web corpora that are unrelated to the downstream test
+tasks.
+
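A minimal stand-in for the projection step described above; the pooling choice, tensor shapes, and single linear layer are assumptions rather than AIM's exact architecture: hidden states over a demonstration's text positions (already carrying the paired image's information through the frozen MLLM) are pooled and mapped by a trainable projection into a few virtual tokens in the LLM's embedding space.

```python
# Stand-in sketch: pool text-position hidden states of one demonstration and project
# them into a handful of virtual tokens (shapes and pooling are assumptions).
import torch
import torch.nn as nn

class DemoToVirtualTokens(nn.Module):
    def __init__(self, hidden_dim=4096, embed_dim=4096, n_virtual=4):
        super().__init__()
        self.n_virtual = n_virtual
        self.embed_dim = embed_dim
        self.proj = nn.Linear(hidden_dim, n_virtual * embed_dim)   # trainable projection

    def forward(self, text_hidden):            # (seq_len, hidden_dim) from the frozen MLLM
        pooled = text_hidden.mean(dim=0)       # aggregate the text-position states
        return self.proj(pooled).view(self.n_virtual, self.embed_dim)

demo_hidden = torch.randn(37, 4096)            # hidden states over one demo's text span
virtual_tokens = DemoToVirtualTokens()(demo_hidden)
print(virtual_tokens.shape)                    # torch.Size([4, 4096]) -> prepended to the query
```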
+
+
+
+
+ + ♻ ☆ WebXR, A-Frame and Networked-Aframe as a Basis for an Open Metaverse: A + Conceptual Architecture + + +
+ This work proposes a WebXR-based cross-platform conceptual architecture, +leveraging the A-Frame and Networked-Aframe frameworks, in order to facilitate +the development of an open, accessible, and interoperable metaverse. By +introducing the concept of spatial web app, this research contributes to the +discourse on the metaverse, offering an architecture that democratizes access +to virtual environments and extended reality through the web, and aligns with +Tim Berners-Lee's original vision of the World Wide Web as an open platform in +the digital realm. + +
+
+ comment: draftcls option +
+
+
+
+
+
+
+ + + +
+
+ +
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all <details> entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch between light and dark themes */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`